Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,32 @@ https://github.com/OpenDriveLab/DriveLM-new/assets/75412366/78c32442-73c8-4f1d-a


## Getting Started <a name="gettingstarted"></a>
To get started with DriveLM:
- [Prepare DriveLM-nuScenes](/docs/data_prep_nus.md)
- [Challenge devkit](/challenge/)
- [More content coming soon](#todolist)

### LLM Provider for Evaluation

The evaluation pipeline uses an LLM to compute GPT-score. By default it uses OpenAI, but [MiniMax](https://platform.minimax.io) is also supported as an alternative provider.

| Provider | Env Variable | Default Model | API Docs |
|----------|-------------|---------------|----------|
| OpenAI | `OPENAI_API_KEY` | `gpt-3.5-turbo` | [docs](https://platform.openai.com/docs) |
| MiniMax | `MINIMAX_API_KEY` | `MiniMax-M2.7` | [docs](https://platform.minimax.io/docs/api-reference/text-openai-api) |

```bash
# Using OpenAI (default)
export OPENAI_API_KEY="your-openai-key"
python challenge/evaluation.py --root_path1 pred.json --root_path2 test.json

# Using MiniMax
export MINIMAX_API_KEY="your-minimax-key"
python challenge/evaluation.py --root_path1 pred.json --root_path2 test.json --provider minimax

# Auto-detect: if MINIMAX_API_KEY is set, MiniMax is used automatically
# (it takes priority over OPENAI_API_KEY; override with --provider or EVAL_LLM_PROVIDER)
```

<p align="right">(<a href="#top">back to top</a>)</p>


Expand Down
10 changes: 7 additions & 3 deletions challenge/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@


class evaluation_suit():
def __init__(self):
def __init__(self, provider=None, model=None):
self.language_eval = language_evaluation.CocoEvaluator(coco_types=["BLEU", "ROUGE_L", "CIDEr"])
self.chatgpt_eval = GPTEvaluation()
self.chatgpt_eval = GPTEvaluation(provider=provider, model=model)
self.GPT = []
self.accuracy = {"answer": [], "GT": []}
self.language = {"answer": [], "GT": []}
Expand Down Expand Up @@ -153,6 +153,10 @@ def evaluation(self):
parser = argparse.ArgumentParser(description='Evaluation')
parser.add_argument('--root_path1', type=str, default="./llama-adapter-DriveLM.json", help='path to prediction file')
parser.add_argument('--root_path2', type=str, default="./test_v1.json", help='path to test file')
parser.add_argument('--provider', type=str, default=None, choices=["openai", "minimax"],
help='LLM provider for GPT-score evaluation (default: auto-detect from env)')
parser.add_argument('--model', type=str, default=None,
help='Model name override for GPT-score evaluation')
args = parser.parse_args()

with open(args.root_path1, 'r') as f :#, \
Expand All @@ -162,7 +166,7 @@ def evaluation(self):
with open(args.root_path2, 'r') as f:
test_file = json.load(f)

evaluation = evaluation_suit()
evaluation = evaluation_suit(provider=args.provider, model=args.model)
for scene_id in test_file.keys():
scene_data = test_file[scene_id]['key_frames']

Expand Down
125 changes: 111 additions & 14 deletions challenge/gpt_eval.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,117 @@
import pickle
import pdb
import os
import re
import numpy as np
import torch
import json
import argparse
from multiprocessing import Pool
from openai import OpenAI


# Supported LLM providers and their default configurations
PROVIDER_CONFIGS = {
"openai": {
"env_key": "OPENAI_API_KEY",
"base_url": None, # use OpenAI SDK default
"default_model": "gpt-3.5-turbo",
},
"minimax": {
"env_key": "MINIMAX_API_KEY",
"base_url": "https://api.minimax.io/v1",
"default_model": "MiniMax-M2.7",
},
}


def _resolve_provider():
"""Auto-detect provider from environment variables.

Priority: EVAL_LLM_PROVIDER env var > MINIMAX_API_KEY presence > OPENAI_API_KEY presence.
"""
explicit = os.environ.get("EVAL_LLM_PROVIDER", "").lower()
if explicit in PROVIDER_CONFIGS:
return explicit

if os.environ.get("MINIMAX_API_KEY"):
return "minimax"
if os.environ.get("OPENAI_API_KEY"):
return "openai"

return "openai"


def _strip_think_tags(content):
"""Strip <think>...</think> reasoning blocks from model output."""
if content and "<think>" in content:
return re.sub(r"<think>[\s\S]*?</think>\s*", "", content).strip()
return content


def _clamp_temperature(temperature, provider):
"""Clamp temperature to valid range for the provider."""
if provider == "minimax" and temperature is not None:
# MiniMax requires temperature in (0.0, 1.0]
return max(0.01, min(temperature, 1.0))
return temperature


class GPTEvaluation:
def __init__(self):
self.client = OpenAI(api_key="you need to use your own openai key for evaluation on your local machine")
"""LLM-based evaluation scorer supporting multiple providers.

Supported providers:
- ``openai``: OpenAI API (default)
- ``minimax``: MiniMax API (OpenAI-compatible)

def call_chatgpt(self, chatgpt_messages, max_tokens=40, model="gpt-3.5-turbo"):
The provider is chosen by ``provider`` argument, the ``EVAL_LLM_PROVIDER``
environment variable, or auto-detected from available API keys.
"""

def __init__(self, provider=None, api_key=None, base_url=None, model=None):
    """Configure the chat-completions client for the chosen provider.

    Args:
        provider: ``"openai"`` or ``"minimax"``; ``None`` auto-detects via
            ``EVAL_LLM_PROVIDER`` / available API keys.
        api_key: explicit key; falls back to the provider's env variable.
        base_url: explicit endpoint; falls back to the provider default.
        model: default model name; falls back to the provider default.

    Raises:
        ValueError: for an unknown provider or a missing API key.
    """
    self.provider = provider or _resolve_provider()
    if self.provider not in PROVIDER_CONFIGS:
        raise ValueError(
            f"Unsupported provider '{self.provider}'. "
            f"Choose from: {', '.join(PROVIDER_CONFIGS)}"
        )
    settings = PROVIDER_CONFIGS[self.provider]

    key = api_key or os.environ.get(settings["env_key"])
    if not key:
        raise ValueError(
            f"API key not found. Set {settings['env_key']} environment variable "
            f"or pass api_key to GPTEvaluation()."
        )

    self.default_model = model or settings["default_model"]

    # Only pass base_url when one is configured; otherwise keep the SDK default.
    endpoint = base_url or settings["base_url"]
    if endpoint:
        self.client = OpenAI(api_key=key, base_url=endpoint)
    else:
        self.client = OpenAI(api_key=key)

def call_chatgpt(self, chatgpt_messages, max_tokens=40, model=None):
    """Send one chat-completion request and return ``(reply, total_tokens)``.

    Args:
        chatgpt_messages: list of ``{"role", "content"}`` message dicts.
        max_tokens: completion-length cap forwarded to the API.
        model: model name override; defaults to ``self.default_model``.

    Returns:
        Tuple of the reply text (with any ``<think>`` reasoning blocks
        stripped) and the total token count reported by the API.
    """
    model = model or self.default_model
    # Some providers (MiniMax) reject temperatures outside (0.0, 1.0].
    temperature = _clamp_temperature(0.6, self.provider)
    # Fix: the previous version carried a duplicated kwargs line (diff
    # residue) that passed the hard-coded temperature; use the clamped one.
    response = self.client.chat.completions.create(
        model=model, messages=chatgpt_messages, temperature=temperature, max_tokens=max_tokens
    )
    reply = _strip_think_tags(response.choices[0].message.content)
    total_tokens = response.usage.total_tokens
    return reply, total_tokens

def prepare_chatgpt_message(self, prompt):
    """Wrap *prompt* in the system+user message list the chat API expects."""
    system_message = "an evaluator who rates my answer based on the correct answer"
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": "{}".format(prompt)},
    ]

def forward(self, data):
answer, GT = data
prompts = "Rate my answer based on the correct answer out of 100, with higher scores indicating that the answer is closer to the correct answer, and you should be accurate to single digits like 62, 78, 41,etc. Output the number only"
prompts = prompts + "This is the correct answer: " + GT + "This is my answer: " + answer

output = ""
messages = self.prepare_chatgpt_message(prompts)
reply, total_tokens = self.call_chatgpt(messages, max_tokens=3000)
Expand All @@ -42,17 +122,34 @@ def forward(self, data):
output = output[:-2]

return output


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="GPT-based Evaluation")
    parser.add_argument(
        "--provider",
        type=str,
        default=None,
        choices=list(PROVIDER_CONFIGS.keys()),
        help="LLM provider for evaluation (default: auto-detect from env)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help="Model name override (default: provider-specific)",
    )
    args = parser.parse_args()

    # Sample (answer, ground-truth) pairs to score.
    data = [
        ("The ego vehicle should notice the bus next, as it is the third object in the image. The bus is stopped at the intersection, and the ego vehicle should be cautious when approaching the intersection to ensure it does not collide with the bus.", "Firstly, notice <c3,CAM_FRONT_LEFT,1075.5,382.8>. The object is a traffic sign, so the ego vehicle should continue at the same speed. Secondly, notice <c2,CAM_FRONT,836.3,398.3>. The object is a traffic sign, so the ego vehicle should accelerate and continue ahead. Thirdly, notice <c1,CAM_BACK,991.7,603.0>. The object is stationary, so the ego vehicle should continue ahead at the same speed."),
        # Add more data here
    ]

    # Fix: a leftover pre-rename line still mapped over `eval.forward`
    # (a variable removed by the rename, which also shadowed the builtin
    # `eval`); only the renamed `evaluator` instance is used now.
    evaluator = GPTEvaluation(provider=args.provider, model=args.model)
    print(f"Using provider: {evaluator.provider} (model: {evaluator.default_model})")

    with Pool(5) as p:  # Change the number based on your CPU cores
        scores = p.map(evaluator.forward, data)

    print(scores)
Loading