Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,32 @@ https://github.com/OpenDriveLab/DriveLM-new/assets/75412366/78c32442-73c8-4f1d-a


## Getting Started <a name="gettingstarted"></a>
To get started with DriveLM:
- [Prepare DriveLM-nuScenes](/docs/data_prep_nus.md)
- [Challenge devkit](/challenge/)
- [More content coming soon](#todolist)

### LLM Provider for Evaluation

The evaluation pipeline uses an LLM to compute GPT-score. By default it uses OpenAI, but [MiniMax](https://platform.minimax.io) is also supported as an alternative provider.

| Provider | Env Variable | Default Model | API Docs |
|----------|-------------|---------------|----------|
| OpenAI | `OPENAI_API_KEY` | `gpt-3.5-turbo` | [docs](https://platform.openai.com/docs) |
| MiniMax | `MINIMAX_API_KEY` | `MiniMax-M2.7` | [docs](https://platform.minimax.io/docs/api-reference/text-openai-api) |

```bash
# Using OpenAI (default)
export OPENAI_API_KEY="your-openai-key"
python challenge/evaluation.py --root_path1 pred.json --root_path2 test.json

# Using MiniMax
export MINIMAX_API_KEY="your-minimax-key"
python challenge/evaluation.py --root_path1 pred.json --root_path2 test.json --provider minimax

# Auto-detect: if MINIMAX_API_KEY is set, MiniMax is used automatically
# (it takes priority over OPENAI_API_KEY; override with --provider or EVAL_LLM_PROVIDER)
```

<p align="right">(<a href="#top">back to top</a>)</p>


Expand Down
10 changes: 7 additions & 3 deletions challenge/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@


class evaluation_suit():
def __init__(self):
def __init__(self, provider=None, model=None):
self.language_eval = language_evaluation.CocoEvaluator(coco_types=["BLEU", "ROUGE_L", "CIDEr"])
self.chatgpt_eval = GPTEvaluation()
self.chatgpt_eval = GPTEvaluation(provider=provider, model=model)
self.GPT = []
self.accuracy = {"answer": [], "GT": []}
self.language = {"answer": [], "GT": []}
Expand Down Expand Up @@ -153,6 +153,10 @@ def evaluation(self):
parser = argparse.ArgumentParser(description='Evaluation')
parser.add_argument('--root_path1', type=str, default="./llama-adapter-DriveLM.json", help='path to prediction file')
parser.add_argument('--root_path2', type=str, default="./test_v1.json", help='path to test file')
parser.add_argument('--provider', type=str, default=None, choices=["openai", "minimax"],
help='LLM provider for GPT-score evaluation (default: auto-detect from env)')
parser.add_argument('--model', type=str, default=None,
help='Model name override for GPT-score evaluation')
args = parser.parse_args()

with open(args.root_path1, 'r') as f :#, \
Expand All @@ -162,7 +166,7 @@ def evaluation(self):
with open(args.root_path2, 'r') as f:
test_file = json.load(f)

evaluation = evaluation_suit()
evaluation = evaluation_suit(provider=args.provider, model=args.model)
for scene_id in test_file.keys():
scene_data = test_file[scene_id]['key_frames']

Expand Down
125 changes: 111 additions & 14 deletions challenge/gpt_eval.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,117 @@
import pickle
import pdb
import os
import re
import numpy as np
import torch
import json
import argparse
from multiprocessing import Pool
from openai import OpenAI


# Supported LLM providers and their default configurations
PROVIDER_CONFIGS = {
"openai": {
"env_key": "OPENAI_API_KEY",
"base_url": None, # use OpenAI SDK default
"default_model": "gpt-3.5-turbo",
},
"minimax": {
"env_key": "MINIMAX_API_KEY",
"base_url": "https://api.minimax.io/v1",
"default_model": "MiniMax-M2.7",
},
}


def _resolve_provider():
"""Auto-detect provider from environment variables.

Priority: EVAL_LLM_PROVIDER env var > MINIMAX_API_KEY presence > OPENAI_API_KEY presence.
"""
explicit = os.environ.get("EVAL_LLM_PROVIDER", "").lower()
if explicit in PROVIDER_CONFIGS:
return explicit

if os.environ.get("MINIMAX_API_KEY"):
return "minimax"
if os.environ.get("OPENAI_API_KEY"):
return "openai"

return "openai"


def _strip_think_tags(content):
"""Strip <think>...</think> reasoning blocks from model output."""
if content and "<think>" in content:
return re.sub(r"<think>[\s\S]*?</think>\s*", "", content).strip()
return content


def _clamp_temperature(temperature, provider):
"""Clamp temperature to valid range for the provider."""
if provider == "minimax" and temperature is not None:
# MiniMax requires temperature in (0.0, 1.0]
return max(0.01, min(temperature, 1.0))
return temperature


class GPTEvaluation:
def __init__(self):
self.client = OpenAI(api_key="you need to use your own openai key for evaluation on your local machine")
"""LLM-based evaluation scorer supporting multiple providers.

Supported providers:
- ``openai``: OpenAI API (default)
- ``minimax``: MiniMax API (OpenAI-compatible)

def call_chatgpt(self, chatgpt_messages, max_tokens=40, model="gpt-3.5-turbo"):
The provider is chosen by ``provider`` argument, the ``EVAL_LLM_PROVIDER``
environment variable, or auto-detected from available API keys.
"""

def __init__(self, provider=None, api_key=None, base_url=None, model=None):
    """Configure the chat-completions client for the chosen provider.

    Args:
        provider: ``"openai"`` or ``"minimax"``; ``None`` auto-detects via
            ``EVAL_LLM_PROVIDER`` / available API keys.
        api_key: explicit key; falls back to the provider's env variable.
        base_url: explicit endpoint; falls back to the provider default.
        model: default model name; falls back to the provider default.

    Raises:
        ValueError: for an unknown provider or a missing API key.
    """
    self.provider = provider or _resolve_provider()
    if self.provider not in PROVIDER_CONFIGS:
        raise ValueError(
            f"Unsupported provider '{self.provider}'. "
            f"Choose from: {', '.join(PROVIDER_CONFIGS)}"
        )
    settings = PROVIDER_CONFIGS[self.provider]

    key = api_key or os.environ.get(settings["env_key"])
    if not key:
        raise ValueError(
            f"API key not found. Set {settings['env_key']} environment variable "
            f"or pass api_key to GPTEvaluation()."
        )

    self.default_model = model or settings["default_model"]

    # Only pass base_url when one is configured; otherwise keep the SDK default.
    endpoint = base_url or settings["base_url"]
    if endpoint:
        self.client = OpenAI(api_key=key, base_url=endpoint)
    else:
        self.client = OpenAI(api_key=key)

def call_chatgpt(self, chatgpt_messages, max_tokens=40, model=None):
    """Send one chat-completion request and return ``(reply, total_tokens)``.

    Args:
        chatgpt_messages: list of ``{"role", "content"}`` message dicts.
        max_tokens: completion-length cap forwarded to the API.
        model: model name override; defaults to ``self.default_model``.

    Returns:
        Tuple of the reply text (with any ``<think>`` reasoning blocks
        stripped) and the total token count reported by the API.
    """
    model = model or self.default_model
    # Some providers (MiniMax) reject temperatures outside (0.0, 1.0].
    temperature = _clamp_temperature(0.6, self.provider)
    # Fix: the previous version carried a duplicated kwargs line (diff
    # residue) that passed the hard-coded temperature; use the clamped one.
    response = self.client.chat.completions.create(
        model=model, messages=chatgpt_messages, temperature=temperature, max_tokens=max_tokens
    )
    reply = _strip_think_tags(response.choices[0].message.content)
    total_tokens = response.usage.total_tokens
    return reply, total_tokens

def prepare_chatgpt_message(self, prompt):
    """Wrap *prompt* in the system+user message list the chat API expects."""
    system_message = "an evaluator who rates my answer based on the correct answer"
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": "{}".format(prompt)},
    ]

def forward(self, data):
answer, GT = data
prompts = "Rate my answer based on the correct answer out of 100, with higher scores indicating that the answer is closer to the correct answer, and you should be accurate to single digits like 62, 78, 41,etc. Output the number only"
prompts = prompts + "This is the correct answer: " + GT + "This is my answer: " + answer

output = ""
messages = self.prepare_chatgpt_message(prompts)
reply, total_tokens = self.call_chatgpt(messages, max_tokens=3000)
Expand All @@ -42,17 +122,34 @@ def forward(self, data):
output = output[:-2]

return output


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="GPT-based Evaluation")
    parser.add_argument(
        "--provider",
        type=str,
        default=None,
        choices=list(PROVIDER_CONFIGS.keys()),
        help="LLM provider for evaluation (default: auto-detect from env)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help="Model name override (default: provider-specific)",
    )
    args = parser.parse_args()

    # Sample (answer, ground-truth) pairs to score.
    data = [
        ("The ego vehicle should notice the bus next, as it is the third object in the image. The bus is stopped at the intersection, and the ego vehicle should be cautious when approaching the intersection to ensure it does not collide with the bus.", "Firstly, notice <c3,CAM_FRONT_LEFT,1075.5,382.8>. The object is a traffic sign, so the ego vehicle should continue at the same speed. Secondly, notice <c2,CAM_FRONT,836.3,398.3>. The object is a traffic sign, so the ego vehicle should accelerate and continue ahead. Thirdly, notice <c1,CAM_BACK,991.7,603.0>. The object is stationary, so the ego vehicle should continue ahead at the same speed."),
        # Add more data here
    ]

    # Fix: a leftover pre-rename line still mapped over `eval.forward`
    # (a variable removed by the rename, which also shadowed the builtin
    # `eval`); only the renamed `evaluator` instance is used now.
    evaluator = GPTEvaluation(provider=args.provider, model=args.model)
    print(f"Using provider: {evaluator.provider} (model: {evaluator.default_model})")

    with Pool(5) as p:  # Change the number based on your CPU cores
        scores = p.map(evaluator.forward, data)

    print(scores)
Loading