From 046920057779d7356b05aaf4ea8328b071978724 Mon Sep 17 00:00:00 2001 From: PR Bot Date: Wed, 25 Mar 2026 16:54:28 +0800 Subject: [PATCH] feat: add MiniMax as alternative LLM provider for evaluation - Make GPTEvaluation support configurable LLM providers (OpenAI, MiniMax) - Add MINIMAX_API_KEY environment variable support with auto-detection - Add temperature clamping for MiniMax API constraints - Strip reasoning tags from MiniMax M2.7 responses - Add --provider and --model CLI arguments to evaluation.py - Add 31 unit tests and 3 integration tests - Update README with MiniMax evaluation provider documentation --- README.md | 23 ++- challenge/evaluation.py | 10 +- challenge/gpt_eval.py | 125 +++++++++++-- challenge/tests/test_gpt_eval.py | 269 ++++++++++++++++++++++++++++ challenge/tests/test_integration.py | 61 +++++++ 5 files changed, 470 insertions(+), 18 deletions(-) create mode 100644 challenge/tests/test_gpt_eval.py create mode 100644 challenge/tests/test_integration.py diff --git a/README.md b/README.md index 30527f81..8aa86468 100644 --- a/README.md +++ b/README.md @@ -121,11 +121,32 @@ https://github.com/OpenDriveLab/DriveLM-new/assets/75412366/78c32442-73c8-4f1d-a ## Getting Started -To get started with DriveLM: +To get started with DriveLM: - [Prepare DriveLM-nuScenes](/docs/data_prep_nus.md) - [Challenge devkit](/challenge/) - [More content coming soon](#todolist) +### LLM Provider for Evaluation + +The evaluation pipeline uses an LLM to compute GPT-score. By default it uses OpenAI, but [MiniMax](https://platform.minimax.io) is also supported as an alternative provider. 
+ +| Provider | Env Variable | Default Model | API Docs | +|----------|-------------|---------------|----------| +| OpenAI | `OPENAI_API_KEY` | `gpt-3.5-turbo` | [docs](https://platform.openai.com/docs) | +| MiniMax | `MINIMAX_API_KEY` | `MiniMax-M2.7` | [docs](https://platform.minimax.io/docs/api-reference/text-openai-api) | + +```bash +# Using OpenAI (default) +export OPENAI_API_KEY="your-openai-key" +python challenge/evaluation.py --root_path1 pred.json --root_path2 test.json + +# Using MiniMax +export MINIMAX_API_KEY="your-minimax-key" +python challenge/evaluation.py --root_path1 pred.json --root_path2 test.json --provider minimax + +# Auto-detect (no --provider): EVAL_LLM_PROVIDER is honored first; otherwise MiniMax is used whenever MINIMAX_API_KEY is set (even if OPENAI_API_KEY is also set), else OpenAI +``` +

(back to top)

diff --git a/challenge/evaluation.py b/challenge/evaluation.py index afd646db..4ed1962a 100644 --- a/challenge/evaluation.py +++ b/challenge/evaluation.py @@ -12,9 +12,9 @@ class evaluation_suit(): - def __init__(self): + def __init__(self, provider=None, model=None): self.language_eval = language_evaluation.CocoEvaluator(coco_types=["BLEU", "ROUGE_L", "CIDEr"]) - self.chatgpt_eval = GPTEvaluation() + self.chatgpt_eval = GPTEvaluation(provider=provider, model=model) self.GPT = [] self.accuracy = {"answer": [], "GT": []} self.language = {"answer": [], "GT": []} @@ -153,6 +153,10 @@ def evaluation(self): parser = argparse.ArgumentParser(description='Evaluation') parser.add_argument('--root_path1', type=str, default="./llama-adapter-DriveLM.json", help='path to prediction file') parser.add_argument('--root_path2', type=str, default="./test_v1.json", help='path to test file') + parser.add_argument('--provider', type=str, default=None, choices=["openai", "minimax"], + help='LLM provider for GPT-score evaluation (default: auto-detect from env)') + parser.add_argument('--model', type=str, default=None, + help='Model name override for GPT-score evaluation') args = parser.parse_args() with open(args.root_path1, 'r') as f :#, \ @@ -162,7 +166,7 @@ def evaluation(self): with open(args.root_path2, 'r') as f: test_file = json.load(f) - evaluation = evaluation_suit() + evaluation = evaluation_suit(provider=args.provider, model=args.model) for scene_id in test_file.keys(): scene_data = test_file[scene_id]['key_frames'] diff --git a/challenge/gpt_eval.py b/challenge/gpt_eval.py index d56caad5..fbd9129c 100644 --- a/challenge/gpt_eval.py +++ b/challenge/gpt_eval.py @@ -1,37 +1,117 @@ -import pickle -import pdb +import os +import re import numpy as np -import torch import json import argparse from multiprocessing import Pool from openai import OpenAI +# Supported LLM providers and their default configurations +PROVIDER_CONFIGS = { + "openai": { + "env_key": "OPENAI_API_KEY", +
"base_url": None, # use OpenAI SDK default + "default_model": "gpt-3.5-turbo", + }, + "minimax": { + "env_key": "MINIMAX_API_KEY", + "base_url": "https://api.minimax.io/v1", + "default_model": "MiniMax-M2.7", + }, +} + + +def _resolve_provider(): + """Auto-detect provider from environment variables. + + Priority: EVAL_LLM_PROVIDER env var > MINIMAX_API_KEY presence > OPENAI_API_KEY presence. + """ + explicit = os.environ.get("EVAL_LLM_PROVIDER", "").lower() + if explicit in PROVIDER_CONFIGS: + return explicit + + if os.environ.get("MINIMAX_API_KEY"): + return "minimax" + if os.environ.get("OPENAI_API_KEY"): + return "openai" + + return "openai" + + +def _strip_think_tags(content): + """Strip <think>...</think> reasoning blocks from model output.""" + if content and "<think>" in content: + return re.sub(r"<think>[\s\S]*?</think>\s*", "", content).strip() + return content + + +def _clamp_temperature(temperature, provider): + """Clamp temperature to valid range for the provider.""" + if provider == "minimax" and temperature is not None: + # MiniMax requires temperature in (0.0, 1.0] + return max(0.01, min(temperature, 1.0)) + return temperature + + class GPTEvaluation: - def __init__(self): - self.client = OpenAI(api_key="you need to use your own openai key for evaluation on your local machine") + """LLM-based evaluation scorer supporting multiple providers. + + Supported providers: + - ``openai``: OpenAI API (default) + - ``minimax``: MiniMax API (OpenAI-compatible) - def call_chatgpt(self, chatgpt_messages, max_tokens=40, model="gpt-3.5-turbo"): + The provider is chosen by ``provider`` argument, the ``EVAL_LLM_PROVIDER`` + environment variable, or auto-detected from available API keys. + """ + + def __init__(self, provider=None, api_key=None, base_url=None, model=None): + self.provider = provider or _resolve_provider() + if self.provider not in PROVIDER_CONFIGS: + raise ValueError( + f"Unsupported provider '{self.provider}'.
" + f"Choose from: {', '.join(PROVIDER_CONFIGS)}" + ) + + cfg = PROVIDER_CONFIGS[self.provider] + resolved_key = api_key or os.environ.get(cfg["env_key"]) + if not resolved_key: + raise ValueError( + f"API key not found. Set {cfg['env_key']} environment variable " + f"or pass api_key to GPTEvaluation()." + ) + + resolved_base_url = base_url or cfg["base_url"] + self.default_model = model or cfg["default_model"] + + client_kwargs = {"api_key": resolved_key} + if resolved_base_url: + client_kwargs["base_url"] = resolved_base_url + self.client = OpenAI(**client_kwargs) + + def call_chatgpt(self, chatgpt_messages, max_tokens=40, model=None): + model = model or self.default_model + temperature = _clamp_temperature(0.6, self.provider) response = self.client.chat.completions.create( - model=model, messages=chatgpt_messages, temperature=0.6, max_tokens=max_tokens + model=model, messages=chatgpt_messages, temperature=temperature, max_tokens=max_tokens ) reply = response.choices[0].message.content + reply = _strip_think_tags(reply) total_tokens = response.usage.total_tokens return reply, total_tokens - + def prepare_chatgpt_message(self, prompt): system_message = "an evaluator who rates my answer based on the correct answer" messages = [{"role": "system", "content": system_message}] messages.append({"role": "user", "content": "{}".format(prompt)}) - + return messages - + def forward(self, data): answer, GT = data prompts = "Rate my answer based on the correct answer out of 100, with higher scores indicating that the answer is closer to the correct answer, and you should be accurate to single digits like 62, 78, 41,etc. 
Output the number only" prompts = prompts + "This is the correct answer: " + GT + "This is my answer: " + answer - + output = "" messages = self.prepare_chatgpt_message(prompts) reply, total_tokens = self.call_chatgpt(messages, max_tokens=3000) @@ -42,17 +122,34 @@ def forward(self, data): output = output[:-2] return output - + if __name__ == "__main__": + parser = argparse.ArgumentParser(description="GPT-based Evaluation") + parser.add_argument( + "--provider", + type=str, + default=None, + choices=list(PROVIDER_CONFIGS.keys()), + help="LLM provider for evaluation (default: auto-detect from env)", + ) + parser.add_argument( + "--model", + type=str, + default=None, + help="Model name override (default: provider-specific)", + ) + args = parser.parse_args() + data = [ ("The ego vehicle should notice the bus next, as it is the third object in the image. The bus is stopped at the intersection, and the ego vehicle should be cautious when approaching the intersection to ensure it does not collide with the bus.", "Firstly, notice . The object is a traffic sign, so the ego vehicle should continue at the same speed. Secondly, notice . The object is a traffic sign, so the ego vehicle should accelerate and continue ahead. Thirdly, notice . 
The object is stationary, so the ego vehicle should continue ahead at the same speed."), # Add more data here ] - eval = GPTEvaluation() + evaluator = GPTEvaluation(provider=args.provider, model=args.model) + print(f"Using provider: {evaluator.provider} (model: {evaluator.default_model})") with Pool(5) as p: # Change the number based on your CPU cores - scores = p.map(eval.forward, data) + scores = p.map(evaluator.forward, data) print(scores) diff --git a/challenge/tests/test_gpt_eval.py b/challenge/tests/test_gpt_eval.py new file mode 100644 index 00000000..5ca65998 --- /dev/null +++ b/challenge/tests/test_gpt_eval.py @@ -0,0 +1,269 @@ +"""Unit tests for GPTEvaluation multi-provider support.""" + +import os +import json +import pytest +from unittest.mock import patch, MagicMock + +import sys +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from gpt_eval import GPTEvaluation, PROVIDER_CONFIGS, _resolve_provider, _clamp_temperature, _strip_think_tags + + +# --------------------------------------------------------------------------- +# Provider resolution +# --------------------------------------------------------------------------- + + +class TestResolveProvider: + """Tests for _resolve_provider auto-detection logic.""" + + def test_explicit_env_openai(self): + with patch.dict(os.environ, {"EVAL_LLM_PROVIDER": "openai"}, clear=False): + assert _resolve_provider() == "openai" + + def test_explicit_env_minimax(self): + with patch.dict(os.environ, {"EVAL_LLM_PROVIDER": "minimax"}, clear=False): + assert _resolve_provider() == "minimax" + + def test_auto_detect_minimax_key(self): + env = {"MINIMAX_API_KEY": "test-key"} + with patch.dict(os.environ, env, clear=True): + assert _resolve_provider() == "minimax" + + def test_auto_detect_openai_key(self): + env = {"OPENAI_API_KEY": "test-key"} + with patch.dict(os.environ, env, clear=True): + assert _resolve_provider() == "openai" + + def test_minimax_takes_priority_over_openai(self): + env = 
{"MINIMAX_API_KEY": "mm-key", "OPENAI_API_KEY": "oai-key"} + with patch.dict(os.environ, env, clear=True): + assert _resolve_provider() == "minimax" + + def test_fallback_to_openai_when_no_env(self): + with patch.dict(os.environ, {}, clear=True): + assert _resolve_provider() == "openai" + + def test_explicit_provider_overrides_keys(self): + env = {"EVAL_LLM_PROVIDER": "openai", "MINIMAX_API_KEY": "mm-key"} + with patch.dict(os.environ, env, clear=True): + assert _resolve_provider() == "openai" + + +# --------------------------------------------------------------------------- +# Temperature clamping +# --------------------------------------------------------------------------- + + +class TestClampTemperature: + """Tests for temperature clamping per provider.""" + + def test_minimax_clamps_zero(self): + assert _clamp_temperature(0.0, "minimax") == 0.01 + + def test_minimax_keeps_valid(self): + assert _clamp_temperature(0.6, "minimax") == 0.6 + + def test_minimax_clamps_above_one(self): + assert _clamp_temperature(1.5, "minimax") == 1.0 + + def test_openai_passes_through(self): + assert _clamp_temperature(0.0, "openai") == 0.0 + assert _clamp_temperature(2.0, "openai") == 2.0 + + def test_none_temperature(self): + assert _clamp_temperature(None, "minimax") is None + + +# --------------------------------------------------------------------------- +# GPTEvaluation construction +# --------------------------------------------------------------------------- + + +class TestGPTEvaluationInit: + """Tests for GPTEvaluation initialization.""" + + @patch("gpt_eval.OpenAI") + def test_creates_openai_provider(self, mock_openai_cls): + evaluator = GPTEvaluation(provider="openai", api_key="test-key") + assert evaluator.provider == "openai" + assert evaluator.default_model == "gpt-3.5-turbo" + mock_openai_cls.assert_called_once_with(api_key="test-key") + + @patch("gpt_eval.OpenAI") + def test_creates_minimax_provider(self, mock_openai_cls): + evaluator = 
GPTEvaluation(provider="minimax", api_key="test-key") + assert evaluator.provider == "minimax" + assert evaluator.default_model == "MiniMax-M2.7" + mock_openai_cls.assert_called_once_with( + api_key="test-key", + base_url="https://api.minimax.io/v1", + ) + + @patch("gpt_eval.OpenAI") + def test_custom_base_url(self, mock_openai_cls): + evaluator = GPTEvaluation( + provider="minimax", + api_key="test-key", + base_url="https://custom.endpoint/v1", + ) + mock_openai_cls.assert_called_once_with( + api_key="test-key", + base_url="https://custom.endpoint/v1", + ) + + @patch("gpt_eval.OpenAI") + def test_custom_model(self, mock_openai_cls): + evaluator = GPTEvaluation( + provider="minimax", api_key="test-key", model="MiniMax-M2.7-highspeed" + ) + assert evaluator.default_model == "MiniMax-M2.7-highspeed" + + def test_unsupported_provider_raises(self): + with pytest.raises(ValueError, match="Unsupported provider"): + GPTEvaluation(provider="unsupported", api_key="key") + + def test_missing_api_key_raises(self): + with patch.dict(os.environ, {}, clear=True): + with pytest.raises(ValueError, match="API key not found"): + GPTEvaluation(provider="minimax") + + @patch("gpt_eval.OpenAI") + def test_reads_api_key_from_env(self, mock_openai_cls): + with patch.dict(os.environ, {"MINIMAX_API_KEY": "env-key"}, clear=False): + evaluator = GPTEvaluation(provider="minimax") + mock_openai_cls.assert_called_once_with( + api_key="env-key", + base_url="https://api.minimax.io/v1", + ) + + +# --------------------------------------------------------------------------- +# GPTEvaluation call_chatgpt +# --------------------------------------------------------------------------- + + +class TestCallChatGPT: + """Tests for the call_chatgpt method.""" + + @patch("gpt_eval.OpenAI") + def test_uses_default_model(self, mock_openai_cls): + mock_client = MagicMock() + mock_openai_cls.return_value = mock_client + + mock_response = MagicMock() + mock_response.choices = 
[MagicMock(message=MagicMock(content="75"))] + mock_response.usage = MagicMock(total_tokens=100) + mock_client.chat.completions.create.return_value = mock_response + + evaluator = GPTEvaluation(provider="minimax", api_key="test-key") + reply, tokens = evaluator.call_chatgpt([{"role": "user", "content": "test"}]) + + call_args = mock_client.chat.completions.create.call_args + assert call_args.kwargs["model"] == "MiniMax-M2.7" + assert reply == "75" + assert tokens == 100 + + @patch("gpt_eval.OpenAI") + def test_minimax_temperature_clamped(self, mock_openai_cls): + mock_client = MagicMock() + mock_openai_cls.return_value = mock_client + + mock_response = MagicMock() + mock_response.choices = [MagicMock(message=MagicMock(content="80"))] + mock_response.usage = MagicMock(total_tokens=50) + mock_client.chat.completions.create.return_value = mock_response + + evaluator = GPTEvaluation(provider="minimax", api_key="test-key") + evaluator.call_chatgpt([{"role": "user", "content": "test"}]) + + call_args = mock_client.chat.completions.create.call_args + assert call_args.kwargs["temperature"] == 0.6 # 0.6 is already valid + + +# --------------------------------------------------------------------------- +# GPTEvaluation forward +# --------------------------------------------------------------------------- + + +class TestForward: + """Tests for the forward method.""" + + @patch("gpt_eval.OpenAI") + def test_forward_returns_score(self, mock_openai_cls): + mock_client = MagicMock() + mock_openai_cls.return_value = mock_client + + mock_response = MagicMock() + mock_response.choices = [MagicMock(message=MagicMock(content="82"))] + mock_response.usage = MagicMock(total_tokens=200) + mock_client.chat.completions.create.return_value = mock_response + + evaluator = GPTEvaluation(provider="openai", api_key="test-key") + result = evaluator.forward(("my answer", "correct answer")) + assert result == "82" + + @patch("gpt_eval.OpenAI") + def test_prepare_chatgpt_message(self, 
mock_openai_cls): + evaluator = GPTEvaluation(provider="openai", api_key="test-key") + messages = evaluator.prepare_chatgpt_message("test prompt") + assert len(messages) == 2 + assert messages[0]["role"] == "system" + assert messages[1]["role"] == "user" + assert messages[1]["content"] == "test prompt" + + +# --------------------------------------------------------------------------- +# PROVIDER_CONFIGS integrity +# --------------------------------------------------------------------------- + + +class TestProviderConfigs: + """Tests for PROVIDER_CONFIGS structure.""" + + def test_openai_config(self): + cfg = PROVIDER_CONFIGS["openai"] + assert cfg["env_key"] == "OPENAI_API_KEY" + assert cfg["default_model"] == "gpt-3.5-turbo" + + def test_minimax_config(self): + cfg = PROVIDER_CONFIGS["minimax"] + assert cfg["env_key"] == "MINIMAX_API_KEY" + assert cfg["base_url"] == "https://api.minimax.io/v1" + assert cfg["default_model"] == "MiniMax-M2.7" + + def test_minimax_base_url_not_minimax_chat(self): + """Ensure we don't use the deprecated api.minimax.chat domain.""" + assert "minimax.chat" not in PROVIDER_CONFIGS["minimax"]["base_url"] + + +# --------------------------------------------------------------------------- +# Think-tag stripping +# --------------------------------------------------------------------------- + + +class TestStripThinkTags: + """Tests for _strip_think_tags helper.""" + + def test_strips_think_block(self): + content = "<think>\nSome reasoning here.\n</think>\n\n78" + assert _strip_think_tags(content) == "78" + + def test_no_think_tags(self): + assert _strip_think_tags("82") == "82" + + def test_none_input(self): + assert _strip_think_tags(None) is None + + def test_empty_string(self): + assert _strip_think_tags("") == "" + + def test_multiline_think(self): + content = "<think>\nLine 1\nLine 2\n</think>\n\nResult text" + assert _strip_think_tags(content) == "Result text" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git
a/challenge/tests/test_integration.py b/challenge/tests/test_integration.py new file mode 100644 index 00000000..adb57124 --- /dev/null +++ b/challenge/tests/test_integration.py @@ -0,0 +1,61 @@ +"""Integration tests for GPTEvaluation with live LLM APIs. + +These tests are skipped unless the corresponding API key is set. +Run with: pytest challenge/tests/test_integration.py -v +""" + +import os +import pytest + +import sys +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from gpt_eval import GPTEvaluation + +MINIMAX_API_KEY = os.environ.get("MINIMAX_API_KEY") +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") + +SAMPLE_DATA = ( + "The ego vehicle should slow down because there is a pedestrian crossing.", + "The ego vehicle should decelerate and yield to the pedestrian on the crosswalk.", +) + + +@pytest.mark.skipif(not MINIMAX_API_KEY, reason="MINIMAX_API_KEY not set") +class TestMiniMaxIntegration: + """Live integration tests against MiniMax API.""" + + def test_basic_evaluation(self): + evaluator = GPTEvaluation(provider="minimax") + result = evaluator.forward(SAMPLE_DATA) + score = float(result.strip()) + assert 0 <= score <= 100, f"Score out of range: {score}" + + def test_call_chatgpt_returns_content(self): + evaluator = GPTEvaluation(provider="minimax") + messages = [{"role": "user", "content": "Reply with the number 42 only."}] + reply, tokens = evaluator.call_chatgpt(messages, max_tokens=200) + assert reply is not None + assert len(reply) > 0 + assert tokens > 0 + + def test_highspeed_model(self): + evaluator = GPTEvaluation(provider="minimax", model="MiniMax-M2.7-highspeed") + result = evaluator.forward(SAMPLE_DATA) + score = float(result.strip()) + assert 0 <= score <= 100 + + +@pytest.mark.skipif(not OPENAI_API_KEY, reason="OPENAI_API_KEY not set") +class TestOpenAIIntegration: + """Live integration tests against OpenAI API.""" + + def test_basic_evaluation(self): + evaluator = GPTEvaluation(provider="openai") + result = 
evaluator.forward(SAMPLE_DATA) + score = float(result.strip()) + assert 0 <= score <= 100, f"Score out of range: {score}" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])