Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ memory/L4_raw_sessions/*
# Keychain
!memory/keychain.py

# Exa search skill
!memory/exa_search.py
!memory/exa_search_sop.md

# Vision / OCR / UI detection tools
!memory/ocr_utils.py
!memory/vision_sop.md
Expand Down
179 changes: 179 additions & 0 deletions memory/exa_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
"""Exa web search skill. `from exa_search import search; search("query")`. Needs `pip install exa-py` and EXA_API_KEY (env or `keys.exa_api_key`)."""
import os
from dataclasses import dataclass, field
from typing import Any, List, Optional

_INTEGRATION_TAG = "generic-agent"  # value sent as the x-exa-integration header (see _get_client)
_client = None  # lazily created Exa client singleton; set back to None to force re-creation


def _load_api_key() -> str:
key = os.environ.get("EXA_API_KEY")
if key:
return key
# Fall back to local keychain (memory/keychain.py) so users don't have to export env vars
try:
import keychain
return keychain.keys.exa_api_key.use()
except Exception:
pass
raise RuntimeError(
"EXA_API_KEY not set. Either `export EXA_API_KEY=...` or run "
"`from keychain import keys; keys.set('exa_api_key', 'sk-...')` once."
)


def _get_client():
    """Return the module-wide Exa client, creating it on first use.

    Also sets the x-exa-integration header so Exa can attribute usage; that
    step is best-effort and silently skipped if the SDK drops the attribute.
    """
    global _client
    if _client is None:
        try:
            from exa_py import Exa
        except ImportError as e:
            raise ImportError("exa-py not installed. Run `pip install exa-py`.") from e
        client = Exa(_load_api_key())
        try:
            client.headers["x-exa-integration"] = _INTEGRATION_TAG
        except Exception:
            pass  # SDK without a `headers` attribute — attribution is optional
        _client = client
    return _client


@dataclass
class ExaResult:
    """One Exa search result; `snippet` is pre-extracted via highlights → summary → text."""
    title: str
    url: str
    snippet: str
    published_date: Optional[str] = None
    author: Optional[str] = None
    score: Optional[float] = None
    highlights: List[str] = field(default_factory=list)
    summary: Optional[str] = None
    text: Optional[str] = None

    def __repr__(self):
        # Keep the repr compact: snippets longer than 120 chars get an ellipsis.
        preview = self.snippet if len(self.snippet) <= 120 else self.snippet[:117] + "..."
        return f"ExaResult(url={self.url!r}, snippet={preview!r})"


def _extract_snippet(r) -> str:
"""Pick the best short preview: highlights → summary → text. Any may be missing."""
hs = getattr(r, "highlights", None) or []
if hs:
joined = " ".join(h for h in hs if h)
if joined:
return joined[:500]
summ = getattr(r, "summary", None)
if summ:
return summ[:500]
txt = getattr(r, "text", None)
if txt:
return txt[:500]
return ""


def _convert(r) -> ExaResult:
    """Adapt one raw SDK result into an ExaResult; missing attributes become defaults."""
    def attr(name):
        return getattr(r, name, None)

    return ExaResult(
        title=attr("title") or "",
        url=attr("url") or "",
        snippet=_extract_snippet(r),
        published_date=attr("published_date"),
        author=attr("author"),
        score=attr("score"),
        highlights=list(attr("highlights") or []),
        summary=attr("summary"),
        text=attr("text"),
    )


def search(
    query: str,
    *,
    num_results: int = 10,
    search_type: str = "auto",
    category: Optional[str] = None,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    include_text: Optional[List[str]] = None,
    exclude_text: Optional[List[str]] = None,
    start_published_date: Optional[str] = None,
    end_published_date: Optional[str] = None,
    text: Any = None,
    highlights: Any = True,
    summary: Any = None,
) -> List[ExaResult]:
    """
    Semantic web search via Exa. Returns list[ExaResult].

    search_type: 'auto' (default) | 'neural' | 'fast' | 'instant' | 'deep' | 'deep-lite' | 'deep-reasoning'
    category: 'company' | 'research paper' | 'news' | 'personal site' | 'financial report' | 'people'

    Content flags (pass True, a dict like {'maxCharacters': 500}, or None to omit):
      - highlights: short relevant excerpts (default: True, gives compact snippets)
      - text: full page text (use sparingly — large payloads)
      - summary: LLM-distilled summary, dict only, e.g. {'query': 'key findings'}

    Dates are ISO 8601 strings (e.g. '2025-01-01T00:00:00Z').
    """
    kwargs: dict = {"num_results": num_results, "type": search_type}
    # Filter-style options: forwarded only when truthy (None/empty are omitted).
    filters = {
        "category": category,
        "include_domains": include_domains,
        "exclude_domains": exclude_domains,
        "include_text": include_text,
        "exclude_text": exclude_text,
        "start_published_date": start_published_date,
        "end_published_date": end_published_date,
    }
    kwargs.update({name: value for name, value in filters.items() if value})
    # Content flags: forwarded unless None (so explicit False/dict values pass through).
    flags = {"text": text, "highlights": highlights, "summary": summary}
    kwargs.update({name: value for name, value in flags.items() if value is not None})

    resp = _get_client().search_and_contents(query, **kwargs)
    return [_convert(r) for r in (getattr(resp, "results", None) or [])]


def find_similar(
    url: str,
    *,
    num_results: int = 10,
    highlights: Any = True,
    text: Any = None,
    summary: Any = None,
) -> List[ExaResult]:
    """Find pages semantically similar to `url`. Same content flags as search()."""
    kwargs: dict = {"num_results": num_results}
    # Forward content flags unless None (False/dict values are meaningful).
    for name, value in (("highlights", highlights), ("text", text), ("summary", summary)):
        if value is not None:
            kwargs[name] = value
    resp = _get_client().find_similar_and_contents(url, **kwargs)
    return [_convert(r) for r in (getattr(resp, "results", None) or [])]


def get_contents(
    urls: List[str],
    *,
    text: Any = True,
    highlights: Any = None,
    summary: Any = None,
) -> List[ExaResult]:
    """Fetch page contents for known URLs (bypasses search)."""
    kwargs: dict = {}
    # Forward content flags unless None.
    for name, value in (("text", text), ("highlights", highlights), ("summary", summary)):
        if value is not None:
            kwargs[name] = value
    resp = _get_client().get_contents(urls, **kwargs)
    return [_convert(r) for r in (getattr(resp, "results", None) or [])]


if __name__ == "__main__":
    # CLI: python exa_search.py "<query>" [num_results]
    import sys

    args = sys.argv[1:]
    query = args[0] if args else "latest LLM research"
    count = int(args[1]) if len(args) > 1 else 5
    for result in search(query, num_results=count):
        print(f"- {result.title}\n {result.url}\n {result.snippet[:200]}\n")
123 changes: 123 additions & 0 deletions memory/exa_search_sop.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Exa Search SOP

> Semantic web search with content retrieval. Use when `web_scan` / Google scraping is too noisy or when you need structured, typed results fast.

**触发**:需要搜索高质量网页内容、研究论文、公司信息、新闻,或需要"语义相似页面"
**禁用**:只需要浏览器自动化(走 `web_scan` / `web_execute_js`);只需搜本地 skill 库(走 `skill_search`)

## 一次性准备

```bash
pip install exa-py
```

配置 API Key(二选一):
```python
# 方式A:环境变量
# export EXA_API_KEY=sk-xxx

# 方式B:keychain(推荐,跨会话持久化)
import sys; sys.path.append('../memory')
from keychain import keys
keys.set('exa_api_key', 'sk-xxx') # 一次性,后续自动读取
```

从 https://dashboard.exa.ai/api-keys 获取 key。

## 最简调用

```python
import sys; sys.path.append('../memory')
from exa_search import search

results = search("state of the art retrieval augmented generation 2025")
for r in results:
print(f"- {r.title} ({r.url})")
print(f" {r.snippet[:200]}")
```

返回 `list[ExaResult]`,字段:`title / url / snippet / published_date / author / score / highlights / summary / text`。
`snippet` 自动从 highlights → summary → text 级联提取,不用手动 fallback。

## API 签名

```python
search(query, *,
num_results=10,
search_type='auto', # 'auto'|'neural'|'fast'|'instant'|'deep'|'deep-lite'|'deep-reasoning'
category=None, # 'company'|'research paper'|'news'|'personal site'|'financial report'|'people'
include_domains=None, # list[str]
exclude_domains=None,
include_text=None, # list[str] 必含词
exclude_text=None,
start_published_date=None, end_published_date=None, # ISO 8601
text=None, # True/dict/None 完整正文
highlights=True, # True/dict/None 高亮片段(默认开)
summary=None) # dict only, e.g. {'query':'key findings'}

find_similar(url, *, num_results=10, highlights=True, text=None, summary=None)
get_contents(urls, *, text=True, highlights=None, summary=None)
```

## 典型场景

### 1. 研究论文检索
```python
search("contrastive learning for dense retrieval",
category="research paper", num_results=20,
start_published_date="2024-01-01T00:00:00Z")
```

### 2. 公司尽调(限定域名)
```python
search("Anthropic funding rounds",
category="company",
include_domains=["crunchbase.com", "techcrunch.com"])
```

### 3. 要完整正文 + 摘要
```python
# 可同时拿三种内容,不是互斥
search("Kimi K2 benchmarks",
text={"maxCharacters": 3000},
highlights={"maxCharacters": 500, "query": "MMLU scores"},
summary={"query": "benchmark results"})
```

### 4. 已知 URL 拿正文(不走搜索)
```python
from exa_search import get_contents
[page] = get_contents(["https://arxiv.org/abs/2501.00001"])
print(page.text)
```

### 5. 以图搜图式的语义近邻
```python
from exa_search import find_similar
find_similar("https://openai.com/index/gpt-5/", num_results=8)
```

## CLI

```bash
python ../memory/exa_search.py "agent frameworks 2025" 5
```

## 避坑

- ⚠️ **Exa 不再有 `keyword` search type**,老文档里的 `type="keyword"` 会报错,用 `type="fast"` 或 `type="auto"`
- ⚠️ **不要 `text=True` + `num_results=50`**:每个结果 ~几 KB 正文,会直接炸 LLM 上下文。要批量召回先只开 `highlights`,再对感兴趣的 URL 调 `get_contents`
- ⚠️ **`summary` 参数必须是 dict**,传 `True` 会报错(与 `text`/`highlights` 不同)
- ⚠️ **日期要 ISO 8601**,不是 `"2024-01-01"`,要带 `T00:00:00Z`
- ⚠️ **client 单例在首次调用时惰性创建**(不是导入时),切换 key 要重启 Python 或 `import exa_search; exa_search._client = None`
- ⚠️ **网络失败不会自动重试**:在 autonomous flow 里建议包一层 try / 指数退避
- ⚠️ **include_text / exclude_text 按词组匹配**,不是正则;每项建议 ≤5 词以免命中率为零

## 何时用 Exa vs 其他工具

| 场景 | 工具 |
|---|---|
| 需要高相关度的主题检索、论文、研究、公司信息 | **exa_search** |
| 要登录后的页面内容 / 浏览器会话中的交互 | `web_scan` + `web_execute_js` |
| Google 图搜、特定站点的爬取 | `web_scan`(走真实浏览器保留登录态) |
| 检索本地 105K 技能卡 | `skill_search` |
Loading