"""Exa web search skill.

Usage::

    from exa_search import search
    search("query")

Needs ``pip install exa-py`` and an API key, read from the ``EXA_API_KEY``
environment variable or from the local keychain entry ``keys.exa_api_key``.
"""
import os
from dataclasses import dataclass, field
from typing import Any, List, Optional

# Value sent in the x-exa-integration header so Exa can attribute usage.
_INTEGRATION_TAG = "generic-agent"
# Upper bound (characters) of the preview stored in ExaResult.snippet.
_SNIPPET_MAX_CHARS = 500
# Upper bound (characters) of the snippet shown by ExaResult.__repr__.
_REPR_MAX_CHARS = 120
# Lazily created SDK client shared by every call; reset to None to force a rebuild.
_client = None


def _load_api_key() -> str:
    """Return the Exa API key from the environment or the local keychain.

    Lookup order: the ``EXA_API_KEY`` environment variable, then
    ``keychain.keys.exa_api_key.use()`` (memory/keychain.py), so users do not
    have to export an environment variable.

    Raises:
        RuntimeError: if neither source yields a non-empty key. When the
            keychain lookup itself failed, that failure is attached as
            ``__cause__`` so the root problem stays visible in tracebacks.
    """
    key = os.environ.get("EXA_API_KEY")
    if key:
        return key

    cause = None
    try:
        import keychain
        key = keychain.keys.exa_api_key.use()
    except Exception as err:
        # Best effort: the keychain module or the entry may simply not exist.
        # Remember the failure instead of discarding it.
        cause = err
        key = None
    if key:
        return key

    raise RuntimeError(
        "EXA_API_KEY not set. Either `export EXA_API_KEY=...` or run "
        "`from keychain import keys; keys.set('exa_api_key', 'sk-...')` once."
    ) from cause


def _get_client():
    """Return the shared Exa client, creating it on first use.

    The client is built once and cached in the module-level ``_client``.
    After construction the ``x-exa-integration`` header is set so Exa can
    attribute usage.

    Raises:
        ImportError: if the ``exa_py`` package cannot be imported.
        RuntimeError: if no API key is available (see ``_load_api_key``).
    """
    global _client
    if _client is not None:
        return _client
    try:
        from exa_py import Exa
    except ImportError as e:
        raise ImportError("exa-py not installed. Run `pip install exa-py`.") from e

    client = Exa(_load_api_key())
    # Usage attribution is optional: if the SDK ever drops or changes the
    # `headers` attribute we deliberately carry on without it.
    try:
        client.headers["x-exa-integration"] = _INTEGRATION_TAG
    except Exception:
        pass
    _client = client
    return client


@dataclass
class ExaResult:
    """Typed wrapper for one Exa result. `snippet` cascades through highlights/summary/text."""

    title: str
    url: str
    snippet: str
    published_date: Optional[str] = None
    author: Optional[str] = None
    score: Optional[float] = None
    highlights: List[str] = field(default_factory=list)
    summary: Optional[str] = None
    text: Optional[str] = None

    def __repr__(self):
        # Keep the repr compact: only url plus a shortened snippet.
        shown = self.snippet
        if len(shown) > _REPR_MAX_CHARS:
            shown = shown[:_REPR_MAX_CHARS - 3] + "..."
        return f"ExaResult(url={self.url!r}, snippet={shown!r})"


def _extract_snippet(r) -> str:
    """Pick the best short preview: highlights → summary → text.

    Any of the three attributes may be missing or empty on ``r``. Non-empty
    highlights are joined with single spaces. The first non-empty candidate is
    returned, truncated to ``_SNIPPET_MAX_CHARS``; ``""`` if there is none.
    """
    joined = " ".join(h for h in (getattr(r, "highlights", None) or []) if h)
    candidates = (joined, getattr(r, "summary", None), getattr(r, "text", None))
    for candidate in candidates:
        if candidate:
            return candidate[:_SNIPPET_MAX_CHARS]
    return ""


def _convert(r) -> ExaResult:
    """Convert one SDK result object into an ``ExaResult``.

    Attributes are read with ``getattr`` so a missing field becomes ``""``
    (title, url), ``[]`` (highlights) or ``None`` (everything else).
    """
    return ExaResult(
        title=getattr(r, "title", None) or "",
        url=getattr(r, "url", None) or "",
        snippet=_extract_snippet(r),
        published_date=getattr(r, "published_date", None),
        author=getattr(r, "author", None),
        score=getattr(r, "score", None),
        highlights=list(getattr(r, "highlights", None) or []),
        summary=getattr(r, "summary", None),
        text=getattr(r, "text", None),
    )


def _content_kwargs(*, text: Any, highlights: Any, summary: Any) -> dict:
    """Return the content-flag kwargs for the SDK, omitting flags that are None."""
    flags = {"text": text, "highlights": highlights, "summary": summary}
    return {name: value for name, value in flags.items() if value is not None}


def _to_results(resp) -> List[ExaResult]:
    """Convert an SDK response into ``ExaResult`` objects; missing/empty ``results`` gives ``[]``."""
    return [_convert(r) for r in (getattr(resp, "results", None) or [])]


def search(
    query: str,
    *,
    num_results: int = 10,
    search_type: str = "auto",
    category: Optional[str] = None,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    include_text: Optional[List[str]] = None,
    exclude_text: Optional[List[str]] = None,
    start_published_date: Optional[str] = None,
    end_published_date: Optional[str] = None,
    text: Any = None,
    highlights: Any = True,
    summary: Any = None,
) -> List[ExaResult]:
    """
    Semantic web search via Exa. Returns list[ExaResult].

    search_type: 'auto' (default) | 'neural' | 'fast' | 'instant' | 'deep' | 'deep-lite' | 'deep-reasoning'
    category:    'company' | 'research paper' | 'news' | 'personal site' | 'financial report' | 'people'

    Content flags (pass True, a dict like {'maxCharacters': 500}, or None to omit):
      - highlights: short relevant excerpts (default: True, gives compact snippets)
      - text:       full page text (use sparingly — large payloads)
      - summary:    LLM-distilled summary, dict only, e.g. {'query': 'key findings'}

    Dates are ISO 8601 strings (e.g. '2025-01-01T00:00:00Z').

    `search_type` is forwarded to the SDK as `type`. Filters that are None or
    empty are not forwarded at all; content flags are dropped only when None.
    """
    optional_filters = {
        "category": category,
        "include_domains": include_domains,
        "exclude_domains": exclude_domains,
        "include_text": include_text,
        "exclude_text": exclude_text,
        "start_published_date": start_published_date,
        "end_published_date": end_published_date,
    }
    kwargs: dict = {"num_results": num_results, "type": search_type}
    kwargs.update({name: value for name, value in optional_filters.items() if value})
    kwargs.update(_content_kwargs(text=text, highlights=highlights, summary=summary))

    resp = _get_client().search_and_contents(query, **kwargs)
    return _to_results(resp)


def find_similar(
    url: str,
    *,
    num_results: int = 10,
    highlights: Any = True,
    text: Any = None,
    summary: Any = None,
) -> List[ExaResult]:
    """Find pages semantically similar to `url`. Same content flags as search()."""
    kwargs: dict = {"num_results": num_results}
    kwargs.update(_content_kwargs(text=text, highlights=highlights, summary=summary))
    resp = _get_client().find_similar_and_contents(url, **kwargs)
    return _to_results(resp)


def get_contents(
    urls: List[str],
    *,
    text: Any = True,
    highlights: Any = None,
    summary: Any = None,
) -> List[ExaResult]:
    """Fetch page contents for known URLs (bypasses search).

    Full text is requested by default; content flags behave as in search().
    """
    kwargs = _content_kwargs(text=text, highlights=highlights, summary=summary)
    resp = _get_client().get_contents(urls, **kwargs)
    return _to_results(resp)


def _main(argv: List[str]) -> int:
    """CLI: python exa_search.py "<query>" [num_results]. Returns the process exit code."""
    import sys

    query = argv[1] if len(argv) > 1 else "latest LLM research"
    try:
        limit = int(argv[2]) if len(argv) > 2 else 5
    except ValueError:
        print(f"num_results must be an integer, got {argv[2]!r}", file=sys.stderr)
        return 2
    for r in search(query, num_results=limit):
        print(f"- {r.title}\n {r.url}\n {r.snippet[:200]}\n")
    return 0


if __name__ == "__main__":
    import sys

    raise SystemExit(_main(sys.argv))
+ +**触发**:需要搜索高质量网页内容、研究论文、公司信息、新闻,或需要"语义相似页面" +**禁用**:只需要浏览器自动化(走 `web_scan` / `web_execute_js`);只需搜本地 skill 库(走 `skill_search`) + +## 一次性准备 + +```bash +pip install exa-py +``` + +配置 API Key(二选一): +```python +# 方式A:环境变量 +# export EXA_API_KEY=sk-xxx + +# 方式B:keychain(推荐,跨会话持久化) +import sys; sys.path.append('../memory') +from keychain import keys +keys.set('exa_api_key', 'sk-xxx') # 一次性,后续自动读取 +``` + +从 https://dashboard.exa.ai/api-keys 获取 key。 + +## 最简调用 + +```python +import sys; sys.path.append('../memory') +from exa_search import search + +results = search("state of the art retrieval augmented generation 2025") +for r in results: + print(f"- {r.title} ({r.url})") + print(f" {r.snippet[:200]}") +``` + +返回 `list[ExaResult]`,字段:`title / url / snippet / published_date / author / score / highlights / summary / text`。 +`snippet` 自动从 highlights → summary → text 级联提取,不用手动 fallback。 + +## API 签名 + +```python +search(query, *, + num_results=10, + search_type='auto', # 'auto'|'neural'|'fast'|'instant'|'deep'|'deep-lite'|'deep-reasoning' + category=None, # 'company'|'research paper'|'news'|'personal site'|'financial report'|'people' + include_domains=None, # list[str] + exclude_domains=None, + include_text=None, # list[str] 必含词 + exclude_text=None, + start_published_date=None, end_published_date=None, # ISO 8601 + text=None, # True/dict/None 完整正文 + highlights=True, # True/dict/None 高亮片段(默认开) + summary=None) # dict only, e.g. {'query':'key findings'} + +find_similar(url, *, num_results=10, highlights=True, text=None, summary=None) +get_contents(urls, *, text=True, highlights=None, summary=None) +``` + +## 典型场景 + +### 1. 研究论文检索 +```python +search("contrastive learning for dense retrieval", + category="research paper", num_results=20, + start_published_date="2024-01-01T00:00:00Z") +``` + +### 2. 公司尽调(限定域名) +```python +search("Anthropic funding rounds", + category="company", + include_domains=["crunchbase.com", "techcrunch.com"]) +``` + +### 3. 
"""Unit tests for memory/exa_search.py.

Covers:
  - snippet fallback (highlights → summary → text → empty)
  - client singleton + integration header + key loading
  - search() kwargs wiring (types, filters, content flags)
  - disabled state when EXA_API_KEY is missing and keychain has no entry
"""
import builtins
import os
import sys
import types
import unittest
from unittest.mock import MagicMock, patch

# Match the sys.path pattern used in other tests + the one skills themselves use.
_REPO_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, _REPO_DIR)
sys.path.insert(0, os.path.join(_REPO_DIR, "memory"))


def _fresh_module():
    """Re-import exa_search with a cleared _client singleton so each test starts clean."""
    sys.modules.pop("exa_search", None)
    import exa_search  # noqa: E402
    return exa_search


class _FakeResult:
    """Mimics one entry of exa_py's SearchResponse.results."""

    _FIELDS = (
        "title", "url", "published_date", "author",
        "score", "highlights", "summary", "text",
    )

    def __init__(self, **kw):
        # Every field exists on the instance; unspecified ones are None.
        for name in self._FIELDS:
            setattr(self, name, kw.get(name))


class _FakeResponse:
    """Minimal stand-in for an SDK response: just carries `results`."""

    def __init__(self, results):
        self.results = results


def _install_fake_exa_py(captured: dict):
    """Install a fake exa_py module so `from exa_py import Exa` inside exa_search works.

    Every SDK call is recorded into `captured` ("method", the positional
    argument under its own name, and "kwargs"). The fake returns whatever list
    is stored under captured["return_results"] (default: empty).
    """
    fake_mod = types.ModuleType("exa_py")

    class FakeExa:
        def __init__(self, api_key):
            captured["api_key"] = api_key
            captured["instance"] = self
            self.headers = {}

        def _record(self, method, arg_name, arg, kwargs):
            captured["method"] = method
            captured[arg_name] = arg
            captured["kwargs"] = kwargs
            return _FakeResponse(captured.get("return_results", []))

        def search_and_contents(self, query, **kwargs):
            return self._record("search_and_contents", "query", query, kwargs)

        def find_similar_and_contents(self, url, **kwargs):
            return self._record("find_similar_and_contents", "url", url, kwargs)

        def get_contents(self, urls, **kwargs):
            return self._record("get_contents", "urls", urls, kwargs)

    fake_mod.Exa = FakeExa
    sys.modules["exa_py"] = fake_mod
    return fake_mod


class _FakeSdkTestCase(unittest.TestCase):
    """Shared fixture: fake exa_py SDK plus a freshly imported exa_search module.

    os.environ and sys.modules are snapshotted with patch.dict and restored by
    cleanups, so a developer's real EXA_API_KEY or a genuinely installed exa_py
    module is not lost for the rest of the test process.
    """

    def setUp(self):
        self.captured: dict = {}
        for target in (os.environ, sys.modules):
            snapshot = patch.dict(target)
            snapshot.start()
            self.addCleanup(snapshot.stop)
        _install_fake_exa_py(self.captured)
        self.mod = _fresh_module()
        self.mod._client = None


class TestSnippetFallback(unittest.TestCase):
    """_extract_snippet must cascade: highlights → summary → text → ''."""

    def setUp(self):
        self.mod = _fresh_module()

    def test_highlights_preferred(self):
        r = _FakeResult(highlights=["alpha", "beta"], summary="summ", text="full")
        self.assertEqual(self.mod._extract_snippet(r), "alpha beta")

    def test_summary_when_no_highlights(self):
        r = _FakeResult(highlights=None, summary="summ", text="full")
        self.assertEqual(self.mod._extract_snippet(r), "summ")

    def test_summary_when_highlights_empty_list(self):
        r = _FakeResult(highlights=[], summary="summ", text="full")
        self.assertEqual(self.mod._extract_snippet(r), "summ")

    def test_text_when_no_highlights_or_summary(self):
        r = _FakeResult(text="full page text")
        self.assertEqual(self.mod._extract_snippet(r), "full page text")

    def test_empty_when_all_missing(self):
        r = _FakeResult()
        self.assertEqual(self.mod._extract_snippet(r), "")

    def test_highlights_all_empty_strings_falls_through(self):
        r = _FakeResult(highlights=["", ""], summary="summ")
        # Joined empty strings → falsy → fall through to summary
        self.assertEqual(self.mod._extract_snippet(r), "summ")

    def test_long_snippet_truncated(self):
        r = _FakeResult(text="x" * 2000)
        self.assertEqual(len(self.mod._extract_snippet(r)), 500)


class TestConvert(unittest.TestCase):
    """_convert produces typed ExaResult with all fields."""

    def setUp(self):
        self.mod = _fresh_module()

    def test_full_conversion(self):
        r = _FakeResult(
            title="T", url="https://ex.com", published_date="2025-01-01",
            author="A", score=0.9, highlights=["h1"], summary="s", text="t",
        )
        out = self.mod._convert(r)
        self.assertEqual(out.title, "T")
        self.assertEqual(out.url, "https://ex.com")
        self.assertEqual(out.snippet, "h1")
        self.assertEqual(out.published_date, "2025-01-01")
        self.assertEqual(out.author, "A")
        self.assertEqual(out.score, 0.9)
        self.assertEqual(out.highlights, ["h1"])
        self.assertEqual(out.summary, "s")
        self.assertEqual(out.text, "t")

    def test_defaults_when_missing(self):
        out = self.mod._convert(_FakeResult())
        self.assertEqual(out.title, "")
        self.assertEqual(out.url, "")
        self.assertEqual(out.snippet, "")
        self.assertIsNone(out.published_date)
        self.assertEqual(out.highlights, [])


class TestClientConstruction(_FakeSdkTestCase):
    """Client singleton sets integration header and reads EXA_API_KEY."""

    def test_uses_env_var(self):
        os.environ["EXA_API_KEY"] = "sk-env"
        c = self.mod._get_client()
        self.assertEqual(self.captured["api_key"], "sk-env")
        self.assertEqual(c.headers["x-exa-integration"], "generic-agent")

    def test_singleton_reused(self):
        os.environ["EXA_API_KEY"] = "sk-env"
        c1 = self.mod._get_client()
        c2 = self.mod._get_client()
        self.assertIs(c1, c2)

    def test_missing_key_raises(self):
        os.environ.pop("EXA_API_KEY", None)
        # Patch keychain lookup inside _load_api_key so we test the "no key anywhere" path
        # regardless of whether memory/keychain.py has a real saved key locally.
        with patch.object(self.mod, "_load_api_key", side_effect=RuntimeError("EXA_API_KEY not set.")):
            with self.assertRaises(RuntimeError) as ctx:
                self.mod._get_client()
        self.assertIn("EXA_API_KEY", str(ctx.exception))

    def test_missing_sdk_raises_helpful_error(self):
        # Simulate exa-py not installed.
        os.environ["EXA_API_KEY"] = "sk-env"
        sys.modules.pop("exa_py", None)
        # With exa_py removed from sys.modules and no install on path, import fails.
        # We guarantee that by patching builtins.__import__ for the "exa_py" name only.
        real_import = builtins.__import__

        def fake_import(name, *a, **kw):
            if name == "exa_py":
                raise ImportError("No module named 'exa_py'")
            return real_import(name, *a, **kw)

        with patch.object(builtins, "__import__", side_effect=fake_import):
            with self.assertRaises(ImportError) as ctx:
                self.mod._get_client()
        self.assertIn("exa-py", str(ctx.exception))


class TestSearchKwargsWiring(_FakeSdkTestCase):
    """search() must wire all kwargs to the SDK correctly."""

    def setUp(self):
        super().setUp()
        os.environ["EXA_API_KEY"] = "sk-env"

    def test_defaults(self):
        self.captured["return_results"] = []
        self.mod.search("hello")
        kw = self.captured["kwargs"]
        self.assertEqual(self.captured["query"], "hello")
        self.assertEqual(kw["type"], "auto")
        self.assertEqual(kw["num_results"], 10)
        self.assertEqual(kw["highlights"], True)
        # None-valued args must NOT be passed through
        self.assertNotIn("category", kw)
        self.assertNotIn("text", kw)
        self.assertNotIn("summary", kw)
        self.assertNotIn("include_domains", kw)

    def test_all_filters_forwarded(self):
        self.captured["return_results"] = []
        self.mod.search(
            "q",
            num_results=3,
            search_type="neural",
            category="research paper",
            include_domains=["arxiv.org"],
            exclude_domains=["reddit.com"],
            include_text=["transformer"],
            exclude_text=["ad"],
            start_published_date="2025-01-01T00:00:00Z",
            end_published_date="2025-12-31T00:00:00Z",
            text={"maxCharacters": 500},
            highlights={"maxCharacters": 200, "query": "results"},
            summary={"query": "findings"},
        )
        kw = self.captured["kwargs"]
        self.assertEqual(kw["type"], "neural")
        self.assertEqual(kw["num_results"], 3)
        self.assertEqual(kw["category"], "research paper")
        self.assertEqual(kw["include_domains"], ["arxiv.org"])
        self.assertEqual(kw["exclude_domains"], ["reddit.com"])
        self.assertEqual(kw["include_text"], ["transformer"])
        self.assertEqual(kw["exclude_text"], ["ad"])
        self.assertEqual(kw["start_published_date"], "2025-01-01T00:00:00Z")
        self.assertEqual(kw["end_published_date"], "2025-12-31T00:00:00Z")
        self.assertEqual(kw["text"], {"maxCharacters": 500})
        self.assertEqual(kw["highlights"]["maxCharacters"], 200)
        self.assertEqual(kw["summary"], {"query": "findings"})

    def test_highlights_can_be_disabled(self):
        self.captured["return_results"] = []
        self.mod.search("q", highlights=None)
        self.assertNotIn("highlights", self.captured["kwargs"])

    def test_content_types_combine(self):
        """Text + highlights + summary must all be sent together; they're not mutually exclusive."""
        self.captured["return_results"] = []
        self.mod.search("q", text=True, highlights=True, summary={"query": "x"})
        kw = self.captured["kwargs"]
        self.assertTrue(kw["text"])
        self.assertTrue(kw["highlights"])
        self.assertEqual(kw["summary"], {"query": "x"})

    def test_returns_typed_results(self):
        self.captured["return_results"] = [
            _FakeResult(title="T1", url="https://a", highlights=["snip1"]),
            _FakeResult(title="T2", url="https://b", summary="snip2"),
        ]
        out = self.mod.search("q")
        self.assertEqual(len(out), 2)
        self.assertEqual(out[0].title, "T1")
        self.assertEqual(out[0].snippet, "snip1")
        self.assertEqual(out[1].snippet, "snip2")
        self.assertEqual(out[1].title, "T2")

    def test_empty_results(self):
        self.captured["return_results"] = []
        self.assertEqual(self.mod.search("q"), [])

    def test_missing_results_attr(self):
        """If the SDK response has no `results` attr, return []."""
        class NoResults:
            pass

        with patch.object(self.mod, "_get_client") as gc:
            client = MagicMock()
            client.search_and_contents.return_value = NoResults()
            gc.return_value = client
            self.assertEqual(self.mod.search("q"), [])


class TestFindSimilarAndGetContents(_FakeSdkTestCase):
    """find_similar() / get_contents() call the matching SDK method and convert results."""

    def setUp(self):
        super().setUp()
        os.environ["EXA_API_KEY"] = "sk-env"

    def test_find_similar(self):
        self.captured["return_results"] = [_FakeResult(title="sim", url="https://s", text="body")]
        out = self.mod.find_similar("https://seed.com", num_results=5)
        self.assertEqual(self.captured["method"], "find_similar_and_contents")
        self.assertEqual(self.captured["url"], "https://seed.com")
        self.assertEqual(self.captured["kwargs"]["num_results"], 5)
        self.assertEqual(len(out), 1)
        self.assertEqual(out[0].title, "sim")

    def test_get_contents_default_text_true(self):
        self.captured["return_results"] = [_FakeResult(url="https://x", text="full text")]
        out = self.mod.get_contents(["https://x"])
        self.assertEqual(self.captured["method"], "get_contents")
        self.assertEqual(self.captured["urls"], ["https://x"])
        self.assertTrue(self.captured["kwargs"]["text"])
        self.assertEqual(out[0].text, "full text")


if __name__ == "__main__":
    unittest.main()