Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ dependencies = [
"simpleeval>=1.0.3",
"jsonschema>=4.25.1",
"duckduckgo-search>=8.1.1",
"tavily-python>=0.3.0",
"pydantic>=2.12.5",
"scrapegraph-py>=1.44.0",
]
Expand Down
58 changes: 57 additions & 1 deletion scrapegraphai/utils/research_web.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
error handling, validation, and security features.
"""

import os
import random
import re
import time
Expand Down Expand Up @@ -57,13 +58,14 @@ class SearchConfig(BaseModel):
None, description="Proxy configuration"
)
serper_api_key: Optional[str] = Field(None, description="API key for Serper")
tavily_api_key: Optional[str] = Field(None, description="API key for Tavily")
region: Optional[str] = Field(None, description="Country/region code")
language: str = Field("en", description="Language code")

@validator("search_engine")
def validate_search_engine(cls, v):
"""Validate search engine."""
valid_engines = {"duckduckgo", "bing", "searxng", "serper"}
valid_engines = {"duckduckgo", "bing", "searxng", "serper", "tavily"}
if v.lower() not in valid_engines:
raise ValueError(
f"Search engine must be one of: {', '.join(valid_engines)}"
Expand Down Expand Up @@ -166,6 +168,7 @@ def search_on_web(
timeout: int = 10,
proxy: Optional[Union[str, Dict, ProxyConfig]] = None,
serper_api_key: Optional[str] = None,
tavily_api_key: Optional[str] = None,
region: Optional[str] = None,
language: str = "en",
) -> List[str]:
Expand All @@ -180,6 +183,7 @@ def search_on_web(
timeout (int): Request timeout in seconds
proxy (str | dict | ProxyConfig): Proxy configuration
serper_api_key (str): API key for Serper
tavily_api_key (str): API key for Tavily
region (str): Country/region code (e.g., 'mx' for Mexico)
language (str): Language code (e.g., 'es' for Spanish)

Expand All @@ -204,6 +208,7 @@ def search_on_web(
timeout=timeout,
proxy=proxy,
serper_api_key=serper_api_key,
tavily_api_key=tavily_api_key,
region=region,
language=language,
)
Expand Down Expand Up @@ -237,6 +242,11 @@ def search_on_web(
config.query, config.max_results, config.serper_api_key, config.timeout
)

elif config.search_engine == "tavily":
results = _search_tavily(
config.query, config.max_results, config.tavily_api_key, config.timeout
)

return filter_pdf_links(results)

except requests.Timeout:
Expand Down Expand Up @@ -381,6 +391,52 @@ def _search_serper(
raise SearchRequestError(f"Serper search failed: {str(e)}")


def _search_tavily(
    query: str, max_results: int, api_key: Optional[str], timeout: int
) -> List[str]:
    """
    Helper function for Tavily search.

    Args:
        query (str): Search query
        max_results (int): Maximum number of results to return
        api_key (str, optional): API key for Tavily. Falls back to TAVILY_API_KEY env var.
        timeout (int): Request timeout in seconds. NOTE(review): currently
            unused -- the Tavily client manages its own request timeout.
            Kept for signature parity with the other _search_* helpers.

    Returns:
        List[str]: List of URLs from search results

    Raises:
        SearchConfigError: If no API key is available or tavily-python is not installed.
        SearchRequestError: If the search request itself fails.
    """
    # Explicit key takes precedence; fall back to the environment.
    resolved_key = api_key or os.environ.get("TAVILY_API_KEY")
    if not resolved_key:
        raise SearchConfigError(
            "Tavily API key is required. Provide tavily_api_key or set TAVILY_API_KEY env var."
        )

    # Import in its own try-block so a missing package is reported as a
    # configuration problem, and is never confused with an ImportError
    # raised from inside the search call itself.
    try:
        from tavily import TavilyClient
    except ImportError as e:
        raise SearchConfigError(
            "tavily-python package is required for Tavily search. "
            "Install it with: pip install tavily-python"
        ) from e

    try:
        client = TavilyClient(api_key=resolved_key)
        response = client.search(
            query=query,
            max_results=max_results,
            search_depth="basic",
        )

        # Tavily responds with {"results": [{"url": ..., ...}, ...]};
        # extract only entries that actually carry a URL.
        urls = [
            item["url"]
            for item in response.get("results", [])
            if "url" in item
        ]
        # Defensive cap in case the API returns more than requested.
        return urls[:max_results]
    except Exception as e:
        # Chain the original exception so the root cause survives upstream.
        raise SearchRequestError(f"Tavily search failed: {str(e)}") from e


def format_proxy(proxy_config: Union[str, Dict, ProxyConfig]) -> str:
"""
Format proxy configuration into a string.
Expand Down