From 5d545fe8770aa3be70a439159e074f3bcf17dc10 Mon Sep 17 00:00:00 2001 From: Tavily PR Agent Date: Mon, 30 Mar 2026 15:17:10 +0000 Subject: [PATCH] feat: add Tavily as search provider option in research_web utility --- pyproject.toml | 1 + scrapegraphai/utils/research_web.py | 58 ++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6537bbcf..e947c74a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "simpleeval>=1.0.3", "jsonschema>=4.25.1", "duckduckgo-search>=8.1.1", + "tavily-python>=0.3.0", "pydantic>=2.12.5", "scrapegraph-py>=1.44.0", ] diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index d633084d..a2c38430 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -3,6 +3,7 @@ error handling, validation, and security features. """ +import os import random import re import time @@ -57,13 +58,14 @@ class SearchConfig(BaseModel): None, description="Proxy configuration" ) serper_api_key: Optional[str] = Field(None, description="API key for Serper") + tavily_api_key: Optional[str] = Field(None, description="API key for Tavily") region: Optional[str] = Field(None, description="Country/region code") language: str = Field("en", description="Language code") @validator("search_engine") def validate_search_engine(cls, v): """Validate search engine.""" - valid_engines = {"duckduckgo", "bing", "searxng", "serper"} + valid_engines = {"duckduckgo", "bing", "searxng", "serper", "tavily"} if v.lower() not in valid_engines: raise ValueError( f"Search engine must be one of: {', '.join(valid_engines)}" @@ -166,6 +168,7 @@ def search_on_web( timeout: int = 10, proxy: Optional[Union[str, Dict, ProxyConfig]] = None, serper_api_key: Optional[str] = None, + tavily_api_key: Optional[str] = None, region: Optional[str] = None, language: str = "en", ) -> List[str]: @@ -180,6 +183,7 @@ def search_on_web( timeout (int): Request timeout in seconds proxy (str | dict | ProxyConfig): Proxy configuration serper_api_key (str): API key for Serper + tavily_api_key (str): API key for Tavily region (str): Country/region code (e.g., 'mx' for Mexico) language (str): Language code (e.g., 'es' for Spanish) @@ -204,6 +208,7 @@ def search_on_web( timeout=timeout, proxy=proxy, serper_api_key=serper_api_key, + tavily_api_key=tavily_api_key, region=region, language=language, ) @@ -237,6 +242,11 @@ def search_on_web( config.query, config.max_results, config.serper_api_key, config.timeout ) + elif config.search_engine == "tavily": + results = _search_tavily( + config.query, config.max_results, config.tavily_api_key, config.timeout + ) + return filter_pdf_links(results) except requests.Timeout: @@ -381,6 +391,52 @@ def _search_serper( raise SearchRequestError(f"Serper search failed: {str(e)}") +def _search_tavily( + query: str, max_results: int, api_key: Optional[str], timeout: int +) -> List[str]: + """ + Helper function for Tavily search. + + Args: + query (str): Search query + max_results (int): Maximum number of results to return + api_key (str, optional): API key for Tavily. Falls back to TAVILY_API_KEY env var. + timeout (int): Request timeout in seconds + + Returns: + List[str]: List of URLs from search results + """ + resolved_key = api_key or os.environ.get("TAVILY_API_KEY") + if not resolved_key: + raise SearchConfigError( + "Tavily API key is required. Provide tavily_api_key or set TAVILY_API_KEY env var." + ) + + try: + from tavily import TavilyClient + + client = TavilyClient(api_key=resolved_key) + response = client.search( + query=query, + max_results=max_results, + search_depth="basic", + ) + + results = [ + result["url"] + for result in response.get("results", []) + if "url" in result + ] + return results[:max_results] + except ImportError: + raise SearchConfigError( + "tavily-python package is required for Tavily search. " + "Install it with: pip install tavily-python" + ) + except Exception as e: + raise SearchRequestError(f"Tavily search failed: {str(e)}") + + def format_proxy(proxy_config: Union[str, Dict, ProxyConfig]) -> str: """ Format proxy configuration into a string.