From f35e6da5d00b007c09dc4b6908ac5a0db2308125 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Tue, 17 Feb 2026 09:22:54 +0000 Subject: [PATCH 1/2] migrate deepeval away from vertex_ai - use gemini for higher rate limit Signed-off-by: Jack Luar --- evaluation/auto_evaluation/eval_main.py | 4 +- .../auto_evaluation/src/models/gemini.py | 91 ++++++++++++++ .../auto_evaluation/src/models/vertex_ai.py | 100 --------------- .../models/gemini_model.py | 115 ++++++++---------- .../utils/api_utils.py | 8 +- 5 files changed, 146 insertions(+), 172 deletions(-) create mode 100644 evaluation/auto_evaluation/src/models/gemini.py delete mode 100644 evaluation/auto_evaluation/src/models/vertex_ai.py diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py index 7b79bd44..4023dc1c 100644 --- a/evaluation/auto_evaluation/eval_main.py +++ b/evaluation/auto_evaluation/eval_main.py @@ -43,8 +43,7 @@ def __init__(self, base_url: str, dataset: str, reranker_base_url: str = ""): self.qns = preprocess.read_data(self.dataset) self.eval_model = GeminiModel( model_name="gemini-2.5-pro", - project=os.getenv("GOOGLE_PROJECT_ID", ""), - location=os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1"), + api_key=os.getenv("GOOGLE_API_KEY"), ) self.log_dir = "logs" os.makedirs(self.log_dir, exist_ok=True) @@ -108,7 +107,6 @@ def evaluate(self, retriever: str, limit: int | None = None): retrieval_tcs.append(retrieval_tc) response_times.append(response_time) - # parallel evaluate evaluate( test_cases=retrieval_tcs, metrics=[precision, recall, hallucination], diff --git a/evaluation/auto_evaluation/src/models/gemini.py b/evaluation/auto_evaluation/src/models/gemini.py new file mode 100644 index 00000000..5f32d93a --- /dev/null +++ b/evaluation/auto_evaluation/src/models/gemini.py @@ -0,0 +1,91 @@ +""" +Custom DeepEvalLLM wrapper using Google Gemini API (google.genai). +""" + +import os +from typing import Any, Optional, Type + +from google import genai +from google.genai import types +from deepeval.models.base_model import DeepEvalBaseLLM +from pydantic import BaseModel + + +class Response(BaseModel): + content: str + + +class GoogleGeminiLangChain(DeepEvalBaseLLM): + """Class that implements Google Gemini API for DeepEval""" + + def __init__(self, model_name, *args, **kwargs): + self.client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) + super().__init__(model_name, *args, **kwargs) + + def load_model(self, *args, **kwargs): + return self.client.models + + def generate(self, prompt: str, schema: Optional[Type[BaseModel]] = None) -> Any: + if schema is not None: + response = self.client.models.generate_content( + model=self.model_name, + contents=prompt, + config=types.GenerateContentConfig( + response_mime_type="application/json", + response_schema=schema, + ), + ) + return response.parsed, 0 + else: + response = self.client.models.generate_content( + model=self.model_name, + contents=prompt, + ) + return response.text, 0 + + async def a_generate( + self, prompt: str, schema: Optional[Type[BaseModel]] = None + ) -> Any: + if schema is not None: + response = await self.client.aio.models.generate_content( + model=self.model_name, + contents=prompt, + config=types.GenerateContentConfig( + response_mime_type="application/json", + response_schema=schema, + ), + ) + return response.parsed, 0 + else: + response = await self.client.aio.models.generate_content( + model=self.model_name, + contents=prompt, + ) + return response.text, 0 + + def get_model_name(self): + return self.model_name or "model-not-specified" + + +def main(): + model = GoogleGeminiLangChain(model_name="gemini-2.5-pro") + prompt = "Write me a joke" + print(f"Prompt: {prompt}") + response = model.generate(prompt, schema=Response) + print(f"Response: {response}") + + +async def main_async(): + model = GoogleGeminiLangChain(model_name="gemini-2.5-pro") + prompt = "Write me a joke" + print(f"Prompt: {prompt}") + response = await model.a_generate(prompt, schema=Response) + print(f"Response: {response}") + + +if __name__ == "__main__": + import asyncio + from dotenv import load_dotenv + + load_dotenv() + asyncio.run(main_async()) diff --git a/evaluation/auto_evaluation/src/models/vertex_ai.py b/evaluation/auto_evaluation/src/models/vertex_ai.py deleted file mode 100644 index 132f784b..00000000 --- a/evaluation/auto_evaluation/src/models/vertex_ai.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -Code is adapted from https://github.com/meteatamel/genai-beyond-basics/blob/main/samples/evaluation/deepeval/vertex_ai/google_vertex_ai_langchain.py -Custom DeepEvalLLM wrapper. -""" - -import instructor - -from typing import Any, Type -from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory -from deepeval.models.base_model import DeepEvalBaseLLM -from pydantic import BaseModel - - -class Response(BaseModel): - content: str - - -class GoogleVertexAILangChain(DeepEvalBaseLLM): - """Class that implements Vertex AI via LangChain for DeepEval""" - - def __init__(self, model_name, *args, **kwargs): - super().__init__(model_name, *args, **kwargs) - - def load_model(self, *args, **kwargs): - # Initialize safety filters for Vertex AI model - # This is important to ensure no evaluation responses are blocked - safety_settings = { - HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, - } - if not self.model_name: - raise ValueError("Model name must be specified for Google Vertex AI.") - - return GenerativeModel( - model_name=self.model_name, - safety_settings=safety_settings, - ) - - def generate(self, prompt: str, schema: Type[BaseModel]) -> Any: - instructor_client = instructor.from_vertexai( - client=self.load_model(), - mode=instructor.Mode.VERTEXAI_TOOLS, - ) - resp = instructor_client.messages.create( - messages=[ - { - "role": "user", - "content": prompt, - } - ], - response_model=schema, - ) - return resp - - async def a_generate(self, prompt: str, schema: Any) -> Any: - instructor_client = instructor.from_vertexai( - client=self.load_model(), - mode=instructor.Mode.VERTEXAI_TOOLS, - ) - resp = await instructor_client.messages.create( - messages=[ - { - "role": "user", - "content": prompt, - } - ], - response_model=schema, - ) - return resp - - def get_model_name(self): - return self.model_name or "model-not-specified" - - -def main(): - model = GoogleVertexAILangChain(model_name="gemini-2.5-pro") - prompt = "Write me a joke" - print(f"Prompt: {prompt}") - response = model.generate(prompt, schema=Response) - print(f"Response: {response}") - - -async def main_async(): - model = GoogleVertexAILangChain(model_name="gemini-2.5-pro") - prompt = "Write me a joke" - print(f"Prompt: {prompt}") - response = await model.a_generate(prompt, schema=Response) - print(f"Response: {response}") - - -if __name__ == "__main__": - import asyncio - from dotenv import load_dotenv - - load_dotenv() - # main() - asyncio.run(main_async()) diff --git a/evaluation/script_based_evaluation/models/gemini_model.py b/evaluation/script_based_evaluation/models/gemini_model.py index bbd7caf9..f4cb8d49 100644 --- a/evaluation/script_based_evaluation/models/gemini_model.py +++ b/evaluation/script_based_evaluation/models/gemini_model.py @@ -1,51 +1,55 @@ +import os import time import sys import traceback -import vertexai.preview.generative_models as genai -from vertexai.generative_models import ( - HarmCategory, - HarmBlockThreshold, - SafetySetting, -) +from google import genai +from google.genai import types from script_based_evaluation.utils.logging_utils import log_error +_client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) + +_safety_config = [ + types.SafetySetting( + category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH, + threshold=types.HarmBlockThreshold.BLOCK_NONE, + ), + types.SafetySetting( + category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, + threshold=types.HarmBlockThreshold.BLOCK_NONE, + ), + types.SafetySetting( + category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, + threshold=types.HarmBlockThreshold.BLOCK_NONE, + ), + types.SafetySetting( + category=types.HarmCategory.HARM_CATEGORY_HARASSMENT, + threshold=types.HarmBlockThreshold.BLOCK_NONE, + ), +] + def base_gemini_1_5_flash(query: str) -> tuple[str, float]: - safety_config = [ - SafetySetting( - category=HarmCategory.HARM_CATEGORY_HATE_SPEECH, - threshold=HarmBlockThreshold.BLOCK_NONE, - ), - SafetySetting( - category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, - threshold=HarmBlockThreshold.BLOCK_NONE, - ), - SafetySetting( - category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, - threshold=HarmBlockThreshold.BLOCK_NONE, - ), - SafetySetting( - category=HarmCategory.HARM_CATEGORY_HARASSMENT, - threshold=HarmBlockThreshold.BLOCK_NONE, - ), - ] while True: try: - model = genai.GenerativeModel("gemini-2.0-flash") start_time = time.time() - query = " " + query - response = model.generate_content(query, safety_settings=safety_config) + response = _client.models.generate_content( + model="gemini-2.0-flash", + contents=" " + query, + config=types.GenerateContentConfig( + safety_settings=_safety_config, + ), + ) end_time = time.time() - response_time = (end_time - start_time) * 1000 # Convert to milliseconds - return response.text, response_time + response_time = (end_time - start_time) * 1000 + return response.text or "", response_time except Exception as e: if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e): - print("Rate limit exceeded, sleeping for 10 seconds") - time.sleep(10) + print("Rate limit exceeded, sleeping for 60 seconds") + time.sleep(60) else: - error_message = f"Error in base_gemini_1_5_flash: {str(e)}" - error_details = traceback.format_exc() - log_error(error_message, error_details) + log_error( + f"Error in base_gemini_1_5_flash: {str(e)}", traceback.format_exc() + ) print( "An error occurred while sending request to Gemini. Check error_log.txt for details." ) @@ -53,48 +57,29 @@ def base_gemini_1_5_flash(query: str) -> tuple[str, float]: def base_gemini_1_5_pro(query: str) -> tuple[str, float]: - safety_config = [ - SafetySetting( - category=HarmCategory.HARM_CATEGORY_HATE_SPEECH, - threshold=HarmBlockThreshold.BLOCK_NONE, - ), - SafetySetting( - category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, - threshold=HarmBlockThreshold.BLOCK_NONE, - ), - SafetySetting( - category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, - threshold=HarmBlockThreshold.BLOCK_NONE, - ), - SafetySetting( - category=HarmCategory.HARM_CATEGORY_HARASSMENT, - threshold=HarmBlockThreshold.BLOCK_NONE, - ), - ] while True: try: - model = genai.GenerativeModel("gemini-2.5-pro") start_time = time.time() - query = " " + query - response = model.generate_content( - query, - safety_settings=safety_config, - generation_config=genai.GenerationConfig( + response = _client.models.generate_content( + model="gemini-2.5-pro", + contents=" " + query, + config=types.GenerateContentConfig( + safety_settings=_safety_config, max_output_tokens=2000, temperature=0.0, ), ) end_time = time.time() - response_time = (end_time - start_time) * 1000 # Convert to milliseconds - return response.text, response_time + response_time = (end_time - start_time) * 1000 + return response.text or "", response_time except Exception as e: if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e): - print("Rate limit exceeded, sleeping for 10 seconds") - time.sleep(10) + print("Rate limit exceeded, sleeping for 60 seconds") + time.sleep(60) else: - error_message = f"Error in base_gemini_1_5_flash: {str(e)}" - error_details = traceback.format_exc() - log_error(error_message, error_details) + log_error( + f"Error in base_gemini_1_5_pro: {str(e)}", traceback.format_exc() + ) print( "An error occurred while sending request to Gemini. Check error_log.txt for details." ) diff --git a/evaluation/script_based_evaluation/utils/api_utils.py b/evaluation/script_based_evaluation/utils/api_utils.py index 9fea0dc9..4b466045 100644 --- a/evaluation/script_based_evaluation/utils/api_utils.py +++ b/evaluation/script_based_evaluation/utils/api_utils.py @@ -53,8 +53,8 @@ def llm_judge(prompt: str) -> str: return response_text except Exception as e: if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e): - print("Rate limit exceeded, sleeping for 10 seconds") - time.sleep(10) + print("Rate limit exceeded, sleeping for 60 seconds") + time.sleep(60) else: log_error(f"Error in llm_judge: {str(e)}", traceback.format_exc()) print( @@ -70,8 +70,8 @@ def send_request_gemini(prompt: str) -> str: return response_text except Exception as e: if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e): - print("Rate limit exceeded, sleeping for 10 seconds") - time.sleep(10) + print("Rate limit exceeded, sleeping for 60 seconds") + time.sleep(60) else: log_error( f"Error in send_request_gemini: {str(e)}", traceback.format_exc() From 569fb81fcd8bf5d9bd5ea60590d7f48c6f6f6645 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Tue, 17 Feb 2026 09:35:17 +0000 Subject: [PATCH 2/2] cleanups - remove unnecessary prints - remove unused vertexai imports Signed-off-by: Jack Luar --- .../script_based_evaluation/config/config.py | 10 ++++------ evaluation/script_based_evaluation/main.py | 4 ---- .../models/gemini_model.py | 16 ++++------------ .../models/gpt_model.py | 3 --- .../script_based_evaluation/utils/api_utils.py | 18 ++++++------------ 5 files changed, 14 insertions(+), 37 deletions(-) diff --git a/evaluation/script_based_evaluation/config/config.py b/evaluation/script_based_evaluation/config/config.py index 2b4c607d..700303d8 100644 --- a/evaluation/script_based_evaluation/config/config.py +++ b/evaluation/script_based_evaluation/config/config.py @@ -8,13 +8,11 @@ def load_environment(env_path: str): raise FileNotFoundError(f"The specified .env file does not exist at {env_path}") load_dotenv(env_path, override=True) config = dotenv_values(env_path) - google_creds: Optional[str] = config.get("GOOGLE_APPLICATION_CREDENTIALS") - if google_creds is not None: - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_creds + api_key: Optional[str] = config.get("GOOGLE_API_KEY") + if api_key is not None: + os.environ["GOOGLE_API_KEY"] = api_key else: - raise KeyError( - "GOOGLE_APPLICATION_CREDENTIALS not found in .env file or is None" - ) + raise KeyError("GOOGLE_API_KEY not found in .env file or is None") def get_config() -> dict[str, str]: diff --git a/evaluation/script_based_evaluation/main.py b/evaluation/script_based_evaluation/main.py index 442c7964..db28506f 100644 --- a/evaluation/script_based_evaluation/main.py +++ b/evaluation/script_based_evaluation/main.py @@ -11,7 +11,6 @@ from openai import OpenAI from tqdm import tqdm from typing import Any -import vertexai def get_accuracy_value(response_text: str, ground_truth: str, query_text: str) -> str: @@ -256,9 +255,6 @@ def main(): client = OpenAI(api_key=api_key) - # Initialize Vertex AI - vertexai.init() - # Input and Output Files input_file = "data/data.csv" output_file = f"{os.path.splitext(input_file)[0]}_result.csv" diff --git a/evaluation/script_based_evaluation/models/gemini_model.py b/evaluation/script_based_evaluation/models/gemini_model.py index f4cb8d49..81d40b79 100644 --- a/evaluation/script_based_evaluation/models/gemini_model.py +++ b/evaluation/script_based_evaluation/models/gemini_model.py @@ -28,7 +28,7 @@ ] -def base_gemini_1_5_flash(query: str) -> tuple[str, float]: +def base_gemini_flash(query: str) -> tuple[str, float]: while True: try: start_time = time.time() @@ -48,15 +48,12 @@ def base_gemini_1_5_flash(query: str) -> tuple[str, float]: time.sleep(60) else: log_error( - f"Error in base_gemini_1_5_flash: {str(e)}", traceback.format_exc() - ) - print( - "An error occurred while sending request to Gemini. Check error_log.txt for details." + f"Error in base_gemini_flash: {str(e)}", traceback.format_exc() ) sys.exit(1) -def base_gemini_1_5_pro(query: str) -> tuple[str, float]: +def base_gemini_pro(query: str) -> tuple[str, float]: while True: try: start_time = time.time() @@ -77,10 +74,5 @@ def base_gemini_1_5_pro(query: str) -> tuple[str, float]: print("Rate limit exceeded, sleeping for 60 seconds") time.sleep(60) else: - log_error( - f"Error in base_gemini_1_5_pro: {str(e)}", traceback.format_exc() - ) - print( - "An error occurred while sending request to Gemini. Check error_log.txt for details." - ) + log_error(f"Error in base_gemini_pro: {str(e)}", traceback.format_exc()) sys.exit(1) diff --git a/evaluation/script_based_evaluation/models/gpt_model.py b/evaluation/script_based_evaluation/models/gpt_model.py index f91d9a09..a59f6057 100644 --- a/evaluation/script_based_evaluation/models/gpt_model.py +++ b/evaluation/script_based_evaluation/models/gpt_model.py @@ -23,7 +23,4 @@ def base_gpt_4o(query: str, client: OpenAI) -> tuple[str, float]: error_message = f"Error in base_gpt_4o: {str(e)}" error_details = traceback.format_exc() log_error(error_message, error_details) - print( - "An error occurred while sending request to GPT-4. Check error_log.txt for details." - ) sys.exit(1) diff --git a/evaluation/script_based_evaluation/utils/api_utils.py b/evaluation/script_based_evaluation/utils/api_utils.py index 4b466045..25a8e980 100644 --- a/evaluation/script_based_evaluation/utils/api_utils.py +++ b/evaluation/script_based_evaluation/utils/api_utils.py @@ -5,8 +5,8 @@ from script_based_evaluation.utils.logging_utils import log_error from script_based_evaluation.models.gpt_model import base_gpt_4o from script_based_evaluation.models.gemini_model import ( - base_gemini_1_5_flash, - base_gemini_1_5_pro, + base_gemini_flash, + base_gemini_pro, ) from openai import OpenAI @@ -18,8 +18,8 @@ def send_request( print("Sending request to endpoint:", endpoint) if endpoint in agent_retriever_urls: url = f"{agent_retriever_urls[endpoint]}/conversations/agent-retriever" - elif endpoint == "base-gemini-1.5-flash": - response_text, response_time = base_gemini_1_5_flash(query) + elif endpoint == "base-gemini-flash": + response_text, response_time = base_gemini_flash(query) print("Response:", response_text) return response_text, response_time elif endpoint == "base-gpt-4o": @@ -49,7 +49,7 @@ def send_request( def llm_judge(prompt: str) -> str: while True: try: - response_text, _ = base_gemini_1_5_pro(prompt) + response_text, _ = base_gemini_pro(prompt) return response_text except Exception as e: if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e): @@ -57,16 +57,13 @@ def llm_judge(prompt: str) -> str: time.sleep(60) else: log_error(f"Error in llm_judge: {str(e)}", traceback.format_exc()) - print( - "An error occurred while sending request to Gemini. Check error_log.txt for details." - ) sys.exit(1) def send_request_gemini(prompt: str) -> str: while True: try: - response_text, _ = base_gemini_1_5_flash(prompt) + response_text, _ = base_gemini_flash(prompt) return response_text except Exception as e: if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e): @@ -76,7 +73,4 @@ def send_request_gemini(prompt: str) -> str: log_error( f"Error in send_request_gemini: {str(e)}", traceback.format_exc() ) - print( - "An error occurred while sending request to Gemini. Check error_log.txt for details." - ) sys.exit(1)