Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ memory/L4_raw_sessions/*
# Keychain
!memory/keychain.py

# Exa search skill
!memory/exa_search.py
!memory/exa_search_sop.md

# Vision / OCR / UI detection tools
!memory/ocr_utils.py
!memory/vision_sop.md
Expand Down
179 changes: 179 additions & 0 deletions memory/exa_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
"""Exa web search skill. `from exa_search import search; search("query")`. Needs `pip install exa-py` and EXA_API_KEY (env or `keys.exa_api_key`)."""
import os
from dataclasses import dataclass, field
from typing import Any, List, Optional

_INTEGRATION_TAG = "generic-agent"  # value sent as the x-exa-integration header (see _get_client)
_client = None  # lazily created Exa client singleton; set back to None to force re-creation


def _load_api_key() -> str:
key = os.environ.get("EXA_API_KEY")
if key:
return key
# Fall back to local keychain (memory/keychain.py) so users don't have to export env vars
try:
import keychain
return keychain.keys.exa_api_key.use()
except Exception:
pass
raise RuntimeError(
"EXA_API_KEY not set. Either `export EXA_API_KEY=...` or run "
"`from keychain import keys; keys.set('exa_api_key', 'sk-...')` once."
)


def _get_client():
    """Return the module-wide Exa client, creating it on first use.

    Also sets the x-exa-integration header so Exa can attribute usage; that
    step is best-effort and silently skipped if the SDK drops the attribute.
    """
    global _client
    if _client is None:
        try:
            from exa_py import Exa
        except ImportError as e:
            raise ImportError("exa-py not installed. Run `pip install exa-py`.") from e
        client = Exa(_load_api_key())
        try:
            client.headers["x-exa-integration"] = _INTEGRATION_TAG
        except Exception:
            pass  # SDK without a `headers` attribute — attribution is optional
        _client = client
    return _client


@dataclass
class ExaResult:
    """One Exa search result; `snippet` is pre-extracted via highlights → summary → text."""
    title: str
    url: str
    snippet: str
    published_date: Optional[str] = None
    author: Optional[str] = None
    score: Optional[float] = None
    highlights: List[str] = field(default_factory=list)
    summary: Optional[str] = None
    text: Optional[str] = None

    def __repr__(self):
        # Keep the repr compact: snippets longer than 120 chars get an ellipsis.
        preview = self.snippet if len(self.snippet) <= 120 else self.snippet[:117] + "..."
        return f"ExaResult(url={self.url!r}, snippet={preview!r})"


def _extract_snippet(r) -> str:
"""Pick the best short preview: highlights → summary → text. Any may be missing."""
hs = getattr(r, "highlights", None) or []
if hs:
joined = " ".join(h for h in hs if h)
if joined:
return joined[:500]
summ = getattr(r, "summary", None)
if summ:
return summ[:500]
txt = getattr(r, "text", None)
if txt:
return txt[:500]
return ""


def _convert(r) -> ExaResult:
    """Adapt one raw SDK result into an ExaResult; missing attributes become defaults."""
    def attr(name):
        return getattr(r, name, None)

    return ExaResult(
        title=attr("title") or "",
        url=attr("url") or "",
        snippet=_extract_snippet(r),
        published_date=attr("published_date"),
        author=attr("author"),
        score=attr("score"),
        highlights=list(attr("highlights") or []),
        summary=attr("summary"),
        text=attr("text"),
    )


def search(
    query: str,
    *,
    num_results: int = 10,
    search_type: str = "auto",
    category: Optional[str] = None,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    include_text: Optional[List[str]] = None,
    exclude_text: Optional[List[str]] = None,
    start_published_date: Optional[str] = None,
    end_published_date: Optional[str] = None,
    text: Any = None,
    highlights: Any = True,
    summary: Any = None,
) -> List[ExaResult]:
    """
    Semantic web search via Exa. Returns list[ExaResult].

    search_type: 'auto' (default) | 'neural' | 'fast' | 'instant' | 'deep' | 'deep-lite' | 'deep-reasoning'
    category: 'company' | 'research paper' | 'news' | 'personal site' | 'financial report' | 'people'

    Content flags (pass True, a dict like {'maxCharacters': 500}, or None to omit):
      - highlights: short relevant excerpts (default: True, gives compact snippets)
      - text: full page text (use sparingly — large payloads)
      - summary: LLM-distilled summary, dict only, e.g. {'query': 'key findings'}

    Dates are ISO 8601 strings (e.g. '2025-01-01T00:00:00Z').
    """
    kwargs: dict = {"num_results": num_results, "type": search_type}
    # Filter-style options: forwarded only when truthy (None/empty are omitted).
    filters = {
        "category": category,
        "include_domains": include_domains,
        "exclude_domains": exclude_domains,
        "include_text": include_text,
        "exclude_text": exclude_text,
        "start_published_date": start_published_date,
        "end_published_date": end_published_date,
    }
    kwargs.update({name: value for name, value in filters.items() if value})
    # Content flags: forwarded unless None (so explicit False/dict values pass through).
    flags = {"text": text, "highlights": highlights, "summary": summary}
    kwargs.update({name: value for name, value in flags.items() if value is not None})

    resp = _get_client().search_and_contents(query, **kwargs)
    return [_convert(r) for r in (getattr(resp, "results", None) or [])]


def find_similar(
    url: str,
    *,
    num_results: int = 10,
    highlights: Any = True,
    text: Any = None,
    summary: Any = None,
) -> List[ExaResult]:
    """Find pages semantically similar to `url`. Same content flags as search()."""
    kwargs: dict = {"num_results": num_results}
    # Forward content flags unless None (False/dict values are meaningful).
    for name, value in (("highlights", highlights), ("text", text), ("summary", summary)):
        if value is not None:
            kwargs[name] = value
    resp = _get_client().find_similar_and_contents(url, **kwargs)
    return [_convert(r) for r in (getattr(resp, "results", None) or [])]


def get_contents(
    urls: List[str],
    *,
    text: Any = True,
    highlights: Any = None,
    summary: Any = None,
) -> List[ExaResult]:
    """Fetch page contents for known URLs (bypasses search)."""
    kwargs: dict = {}
    # Forward content flags unless None.
    for name, value in (("text", text), ("highlights", highlights), ("summary", summary)):
        if value is not None:
            kwargs[name] = value
    resp = _get_client().get_contents(urls, **kwargs)
    return [_convert(r) for r in (getattr(resp, "results", None) or [])]


if __name__ == "__main__":
    # CLI: python exa_search.py "<query>" [num_results]
    import sys

    args = sys.argv[1:]
    query = args[0] if args else "latest LLM research"
    count = int(args[1]) if len(args) > 1 else 5
    for result in search(query, num_results=count):
        print(f"- {result.title}\n {result.url}\n {result.snippet[:200]}\n")
123 changes: 123 additions & 0 deletions memory/exa_search_sop.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Exa Search SOP

> Semantic web search with content retrieval. Use when `web_scan` / Google scraping is too noisy or when you need structured, typed results fast.

**触发**:需要搜索高质量网页内容、研究论文、公司信息、新闻,或需要"语义相似页面"
**禁用**:只需要浏览器自动化(走 `web_scan` / `web_execute_js`);只需搜本地 skill 库(走 `skill_search`)

## 一次性准备

```bash
pip install exa-py
```

配置 API Key(二选一):
```python
# 方式A:环境变量
# export EXA_API_KEY=sk-xxx

# 方式B:keychain(推荐,跨会话持久化)
import sys; sys.path.append('../memory')
from keychain import keys
keys.set('exa_api_key', 'sk-xxx') # 一次性,后续自动读取
```

从 https://dashboard.exa.ai/api-keys 获取 key。

## 最简调用

```python
import sys; sys.path.append('../memory')
from exa_search import search

results = search("state of the art retrieval augmented generation 2025")
for r in results:
print(f"- {r.title} ({r.url})")
print(f" {r.snippet[:200]}")
```

返回 `list[ExaResult]`,字段:`title / url / snippet / published_date / author / score / highlights / summary / text`。
`snippet` 自动从 highlights → summary → text 级联提取,不用手动 fallback。

## API 签名

```python
search(query, *,
num_results=10,
search_type='auto', # 'auto'|'neural'|'fast'|'instant'|'deep'|'deep-lite'|'deep-reasoning'
category=None, # 'company'|'research paper'|'news'|'personal site'|'financial report'|'people'
include_domains=None, # list[str]
exclude_domains=None,
include_text=None, # list[str] 必含词
exclude_text=None,
start_published_date=None, end_published_date=None, # ISO 8601
text=None, # True/dict/None 完整正文
highlights=True, # True/dict/None 高亮片段(默认开)
summary=None) # dict only, e.g. {'query':'key findings'}

find_similar(url, *, num_results=10, highlights=True, text=None, summary=None)
get_contents(urls, *, text=True, highlights=None, summary=None)
```

## 典型场景

### 1. 研究论文检索
```python
search("contrastive learning for dense retrieval",
category="research paper", num_results=20,
start_published_date="2024-01-01T00:00:00Z")
```

### 2. 公司尽调(限定域名)
```python
search("Anthropic funding rounds",
category="company",
include_domains=["crunchbase.com", "techcrunch.com"])
```

### 3. 要完整正文 + 摘要
```python
# 可同时拿三种内容,不是互斥
search("Kimi K2 benchmarks",
text={"maxCharacters": 3000},
highlights={"maxCharacters": 500, "query": "MMLU scores"},
summary={"query": "benchmark results"})
```

### 4. 已知 URL 拿正文(不走搜索)
```python
from exa_search import get_contents
[page] = get_contents(["https://arxiv.org/abs/2501.00001"])
print(page.text)
```

### 5. 以图搜图式的语义近邻
```python
from exa_search import find_similar
find_similar("https://openai.com/index/gpt-5/", num_results=8)
```

## CLI

```bash
python ../memory/exa_search.py "agent frameworks 2025" 5
```

## 避坑

- ⚠️ **Exa 不再有 `keyword` search type**,老文档里的 `type="keyword"` 会报错,用 `type="fast"` 或 `type="auto"`
- ⚠️ **不要 `text=True` + `num_results=50`**:每个结果 ~几 KB 正文,会直接炸 LLM 上下文。要批量召回先只开 `highlights`,再对感兴趣的 URL 调 `get_contents`
- ⚠️ **`summary` 参数必须是 dict**,传 `True` 会报错(与 `text`/`highlights` 不同)
- ⚠️ **日期要 ISO 8601**,不是 `"2024-01-01"`,要带 `T00:00:00Z`
- ⚠️ **client 单例在首次调用时惰性创建**(不是导入时),切换 key 要重启 Python 或 `import exa_search; exa_search._client = None`
- ⚠️ **网络失败不会自动重试**:在 autonomous flow 里建议包一层 try / 指数退避
- ⚠️ **include_text / exclude_text 按词组匹配**,不是正则;每项建议 ≤5 词以免命中率为零

## 何时用 Exa vs 其他工具

| 场景 | 工具 |
|---|---|
| 需要高相关度的主题检索、论文、研究、公司信息 | **exa_search** |
| 要登录后的页面内容 / 浏览器会话中的交互 | `web_scan` + `web_execute_js` |
| Google 图搜、特定站点的爬取 | `web_scan`(走真实浏览器保留登录态) |
| 检索本地 105K 技能卡 | `skill_search` |
Loading