-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy patheval_agentdam.sh
More file actions
200 lines (173 loc) · 6.15 KB
/
eval_agentdam.sh
File metadata and controls
200 lines (173 loc) · 6.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/bin/bash
#SBATCH --job-name=eval_agentdam
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=12
#SBATCH --mem=100G
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --gres=gpu:1
#SBATCH --partition=gpu-vram-48gb
#SBATCH --time=12:00:00
#SBATCH --output=./slurm_logs/%j_%x.log
# Load variables from a local .env file when one is present. Auto-export is
# enabled only around the `source`, so every assignment made in .env is
# exported to child processes; it is switched off again straight afterwards.
if [[ -f .env ]]; then
  set -a
  . .env
  set +a
fi
# --- Runtime configuration --------------------------------------------------
export NCCL_P2P_DISABLE=1
export USE_AZURE='false'
export VLLM_ALLOW_LONG_MAX_MODEL_LEN='1'

# Random port in the range 49152-65535 for the local model server.
export port=$((RANDOM % 16384 + 49152))

# Positional arguments (all optional, defaults shown after ":-"):
#   $1 model             model identifier served / evaluated
#   $2 website           used in the result path and the test-config directory
#   $3 instruction_path  prompt/instruction JSON file
#   $4 ngpu              tensor-parallel size passed to the model server
#   $5 version           first component of the result path
#   $6 IP                host used to build the site URLs below
#   $7 model_type        last component of the result path
# NOTE(review): the default ngpu is 2 while the SBATCH header requests
# --gres=gpu:1 — confirm which one is intended.
export model=${1:-deepseek-ai/DeepSeek-R1-Distill-Llama-8B}
export website=${2:-shopping}
export instruction_path=${3:-agentdam/configs/p_cot_id_actree_3s.json}
export ngpu=${4:-2}
export version=${5:-debug}
export IP=${6:-0.0.0.0}
export model_type=${7:-vanilla}

export DATASET=webarena

# Site endpoints, all on the host given by $IP.
# NOTE(review): GITLAB, WIKIPEDIA and MAP all point at port 8023 — confirm
# that sharing one port is intended.
export SHOPPING="http://${IP}:7770"
export SHOPPING_ADMIN="http://${IP}:7780/admin"
export REDDIT="http://${IP}:9999"
export GITLAB="http://${IP}:8023"
export WIKIPEDIA="http://${IP}:8023"
export MAP="http://${IP}:8023"
export HOMEPAGE="http://${IP}:4993"

# Drop the seven positional parameters consumed above, or all of them when
# fewer than seven were supplied.
# NOTE(review): nothing visible in this script reads the remaining "$@".
shift "$(($# < 7 ? $# : 7))"

# Results live under results_ad/<version>/<website>/<model>/<model_type>,
# with every "/" in the model name replaced by "_" so it is one path component.
model_sanitized=${model//\//_}
result_dir=results_ad/${version}/${website}/${model_sanitized}/${model_type}
mkdir -p -- "$result_dir"
# Choose the OpenAI-compatible endpoint. DeepSeek-V3 and DeepSeek-R1 are
# reached through OpenRouter; every other model uses the server on
# localhost:$port.
case "$model" in
  deepseek-ai/DeepSeek-V3 | deepseek-ai/DeepSeek-R1)
    export OPENAI_API_BASE="https://openrouter.ai/api/v1"
    export USE_OPENROUTER="true"
    ;;
  *)
    export OPENAI_API_BASE="http://localhost:${port}/v1"
    export USE_OPENROUTER="false"
    ;;
esac

# Print the effective settings, one per line, so they appear in the job log.
printf '%s\n' \
  "USE_AZURE: $USE_AZURE" \
  "OPENAI_API_BASE: $OPENAI_API_BASE" \
  "Model: $model" \
  "website: $website" \
  "model_type: $model_type" \
  "instruction_path: $instruction_path" \
  "number of gpus: $ngpu" \
  "result_dir: $result_dir" \
  "IP Address: $IP"

# PID of the model server; stays empty until a server is started or found.
vllm_server_pid=""
# Seed handed to the model server.
seed=221097
# Probe the model's generation config for `temperature` and `top_p`.
# Sets has_temp_and_top_p to exactly one of:
#   both | temp_only | top_p_only | none
# "none" is also the answer whenever the probe cannot be completed
# (transformers missing, config not loadable, interpreter failed to start).
echo "Checking generation config for temperature and top_p..."

# The model name is handed to Python as argv[1] instead of being pasted into
# the source text, so a name containing quotes cannot break or alter the code.
gen_config_probe='
import sys

try:
    from transformers import GenerationConfig
    gen_conf = GenerationConfig.from_pretrained(sys.argv[1]).to_diff_dict()
except Exception:
    print("none")
else:
    has_temp = "temperature" in gen_conf
    has_top_p = "top_p" in gen_conf
    if has_temp and has_top_p:
        print("both")
    elif has_temp:
        print("temp_only")
    elif has_top_p:
        print("top_p_only")
    else:
        print("none")
'
has_temp_and_top_p=$(uv run python -c "$gen_config_probe" "$model")

# Keep only the last stdout line (the verdict is printed last), then map
# anything that is not a known verdict to "none".
has_temp_and_top_p=${has_temp_and_top_p##*$'\n'}
case "$has_temp_and_top_p" in
  both | temp_only | top_p_only | none) ;;
  *) has_temp_and_top_p=none ;;
esac
echo "Generation config check result: $has_temp_and_top_p"
#######################################
# Succeeds when something accepts TCP connections on localhost:<port>.
# The probe runs in a subshell so the descriptor opened through bash's
# /dev/tcp redirection is closed as soon as the check is done.
# Arguments: $1 - TCP port
#######################################
vllm_port_open() {
  (echo > "/dev/tcp/localhost/$1") 2>/dev/null
}

#######################################
# Blocks until localhost:<port> accepts connections, polling every 15 s.
# Arguments: $1 - TCP port
#            $2 - optional PID of a server launched by this script
# Outputs:   "... still offline" after every unsuccessful poll
# Returns:   0 once the port is open; 1 if PID $2 is gone before that
#######################################
wait_for_vllm_server() {
  local server_port=$1
  local launched_pid=${2:-}
  until vllm_port_open "$server_port"; do
    # Without this check a crashed server would keep the loop spinning until
    # the job hits its time limit.
    if [[ -n "$launched_pid" ]] && ! kill -0 "$launched_pid" 2>/dev/null; then
      return 1
    fi
    sleep 15
    echo "... still offline"
  done
  return 0
}

# Start (or reuse) the local VLLM server. Skipped when the model is reached
# through OpenRouter.
if [[ "$USE_OPENROUTER" == "false" ]]; then
  launched_pid=""
  if ! vllm_port_open "$port"; then
    random_id=$((RANDOM % 100000))
    server_log="$result_dir/vllm_server_logs_$random_id.out"
    echo "Server output is redirected to: $server_log"

    # Supply a sampling default on the command line only for the settings the
    # generation-config probe did not find.
    sampling_args=()
    case "$has_temp_and_top_p" in
      none | top_p_only)
        echo "Adding temperature parameter to vllm command"
        sampling_args+=(--temperature 0.6)
        ;;
    esac
    case "$has_temp_and_top_p" in
      none | temp_only)
        echo "Adding top-p parameter to vllm command"
        sampling_args+=(--top-p 0.95)
        ;;
    esac

    nohup uv run vllm serve --port "$port" \
      "$model" \
      --dtype auto \
      --seed "$seed" \
      --api-key EMPTY \
      --tensor-parallel-size "$ngpu" \
      --gpu-memory-utilization 0.8 \
      --generation-config auto \
      --max-model-len 24000 \
      --max-num-seqs 8 \
      --trust-remote-code \
      --enable-prefix-caching \
      "${sampling_args[@]}" > "$server_log" &
    vllm_server_pid=$!
    launched_pid=$vllm_server_pid
    echo "VLLM server started with PID: $vllm_server_pid"
  else
    echo "Server is already running"
    # Look up only the process LISTENing on the port. A plain `-i :port`
    # also reports client processes connected to it, and whatever ends up in
    # vllm_server_pid is sent SIGTERM at the end of the script.
    vllm_server_pid=$(lsof -t -iTCP:"$port" -sTCP:LISTEN 2>/dev/null)
    if [[ -n "$vllm_server_pid" ]]; then
      echo "Found existing server on port $port with PID: $vllm_server_pid"
    fi
  fi

  if ! wait_for_vllm_server "$port" "$launched_pid"; then
    echo "VLLM server (PID $launched_pid) exited before port $port came up" >&2
    exit 1
  fi
  # Grace period after the port opens, before the first request is sent.
  sleep 10
else
  echo "Using OpenRouter API for DeepSeek model, skipping VLLM server setup"
fi
# Derive the agent's action set and observation type from the instruction
# file name: any path containing "som" selects the SoM/image variant,
# everything else the accessibility-tree variant.
case "$instruction_path" in
  *som*)
    action_set_tag=som
    observation_type=image_som
    ;;
  *)
    action_set_tag=id_accessibility_tree
    observation_type=accessibility_tree
    ;;
esac
printf '%s\n' \
  "action_set_tag: $action_set_tag" \
  "observation_type: $observation_type"
# Run reset_dockers.sh and then start_dockers.sh on the remote host over SSH,
# passing $IP to both. $IP is expanded locally, before the command is sent.
# A failed SSH call is reported but does not stop the script.
echo "Connecting to EC2 server with IP: $IP..."
remote_cmd="
echo \"Resetting dockers...\"
bash reset_dockers.sh $IP
echo \"Starting dockers...\"
bash start_dockers.sh $IP
echo \"Docker operations completed.\"
"
if ! ssh "root@$IP" -i ec2_ssh_key.pem "$remote_cmd"; then
  echo "SSH connection failed"
fi
echo "SSH connection terminated."
# Provider-specific arguments for run_agentdam.py. Kept in an array so each
# flag and each value reaches the program as its own word without relying on
# word splitting of a string.
if [[ "$USE_OPENROUTER" == "true" ]]; then
  provider_param=(--provider openrouter --top_p 0.95 --temperature 0.6)
else
  provider_param=(--provider vllm)
fi

echo "Running prepare.sh"
bash agentdam/prepare.sh

# Run the evaluation. The last argument deliberately has no trailing
# backslash: a dangling line continuation would splice whatever line follows
# into this command.
uv run python -u agentdam/run_agentdam.py \
  --instruction_path "$instruction_path" \
  --result_dir "$result_dir" \
  --test_config_base_dir "agentdam/data/wa_format/${website}_privacy/" \
  --privacy_config_path agentdam/configs/p_cot_privacy_judge_3s.json \
  --privacy_config_path_reasoning agentdam/configs/p_cot_privacy_judge_3s_reasoning.json \
  --model "$model" \
  --action_set_tag "$action_set_tag" \
  --observation_type "$observation_type" \
  --max_steps 10 \
  "${provider_param[@]}" \
  --privacy_test
# Shut the VLLM server down (whether it was launched by this script or found
# already listening) unless the run went through OpenRouter.
# $vllm_server_pid is intentionally left unquoted in the kill: when the value
# came from lsof it may hold several whitespace-separated PIDs.
if [[ "$USE_OPENROUTER" == "false" && -n "$vllm_server_pid" ]]; then
  echo "Terminating VLLM server with PID: $vllm_server_pid"
  # shellcheck disable=SC2086
  kill -TERM $vllm_server_pid
fi