Skip to content

Commit e00ff2b

Browse files
committed
[build] update dockerfile for deepep, deep_gemm and ci proxy speed fix
1 parent 3bf2916 commit e00ff2b

3 files changed

Lines changed: 102 additions & 63 deletions

File tree

.dev_scripts/xtuner_rl_path.pth

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
import os
import sys
import warnings

# Conditionally inject an inference-engine site directory into sys.path.
# Installed as a .pth file so the hook runs at interpreter startup, before
# any user code imports lmdeploy/sglang.
# NOTE(review): multi-line conditional code in a .pth file is non-standard --
# site.py only executes .pth lines that begin with "import"; confirm this
# file is consumed the way the image build expects.

# Use a '' default: os.getenv() returns None when the variable is unset, and
# None.lower() would crash every Python startup with AttributeError.
if os.getenv('XTUNER_USE_LMDEPLOY', '').lower() in ['1', 'on', 'true']:
    lmdeploy_envs_dir = os.getenv('XTUNER_LMDEPLOY_ENVS_DIR', '/envs/lmdeploy')
    if lmdeploy_envs_dir not in sys.path:
        sys.path.insert(0, lmdeploy_envs_dir)
        warnings.warn(
            f"XTUNER_USE_LMDEPLOY is set to true. Injected {lmdeploy_envs_dir} into sys.path for lmdeploy imports."
        )

elif os.getenv('XTUNER_USE_SGLANG', '').lower() in ['1', 'on', 'true']:
    sglang_envs_dir = os.getenv('XTUNER_SGLANG_ENVS_DIR', '/envs/sglang')
    if sglang_envs_dir not in sys.path:
        sys.path.insert(0, sglang_envs_dir)
        warnings.warn(
            f"XTUNER_USE_SGLANG is set to true. Injected {sglang_envs_dir} into sys.path for sglang imports."
        )

# else:
#     warnings.warn(
#         "Neither XTUNER_USE_LMDEPLOY nor XTUNER_USE_SGLANG is set to true. No custom paths will be injected."
#     )

Dockerfile

Lines changed: 65 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -2,39 +2,38 @@
22
# builder
33
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.03-py3
44

5-
## build args
5+
## build base env
66
FROM ${BASE_IMAGE} AS setup_env
77

8-
ARG TORCH_VERSION
98
ARG PPA_SOURCE
10-
11-
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
12-
sed -i "s@http://.*.ubuntu.com@${PPA_SOURCE}@g" /etc/apt/sources.list.d/ubuntu.sources && \
9+
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
10+
RUN sed -i "s@http://.*.ubuntu.com@${PPA_SOURCE}@g" /etc/apt/sources.list.d/ubuntu.sources && \
1311
apt update && \
1412
apt install --no-install-recommends ca-certificates -y && \
1513
apt install --no-install-recommends bc wget -y && \
1614
apt install --no-install-recommends build-essential sudo -y && \
1715
apt install --no-install-recommends git curl pkg-config tree unzip tmux \
1816
openssh-server openssh-client dnsutils iproute2 lsof net-tools zsh rclone \
19-
iputils-ping telnet netcat-openbsd -y && \
17+
iputils-ping telnet netcat-openbsd htop bubblewrap socat -y && \
2018
apt clean && rm -rf /var/lib/apt/lists/*
2119

2220
RUN if [ -d /etc/pip ] && [ -f /etc/pip/constraint.txt ]; then echo > /etc/pip/constraint.txt; fi
23-
RUN pip install pystack py-spy --no-cache-dir
21+
RUN pip uninstall flash_attn opencv -y && rm -rf /usr/local/lib/python3.12/dist-packages/cv2
2422
RUN git config --system --add safe.directory "*"
2523

24+
# torch
25+
ARG TORCH_VERSION
26+
ARG PYTORCH_WHEELS_URL
2627
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
28+
--mount=type=secret,id=NO_PROXY,env=no_proxy \
2729
if [ -n "${TORCH_VERSION}" ]; then \
2830
pip install torchvision torch==${TORCH_VERSION} \
29-
--index-url https://download.pytorch.org/whl/cu128 \
30-
--extra-index-url https://download.pytorch.org/whl/cu126 \
31+
-i ${PYTORCH_WHEELS_URL}/cu128 \
32+
--extra-index-url ${PYTORCH_WHEELS_URL}/cu126 \
3133
--no-cache-dir; \
3234
fi
33-
3435
# set reasonable default for CUDA architectures when building ngc image
35-
ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 9.0 10.0"
36-
37-
RUN pip uninstall flash_attn opencv -y && rm -rf /usr/local/lib/python3.12/dist-packages/cv2
36+
ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"
3837

3938
ARG FLASH_ATTN_DIR=/tmp/flash-attn
4039
ARG CODESPACE=/root/codespace
@@ -56,6 +55,9 @@ ARG CODESPACE
5655
ARG FLASH_ATTN_DIR
5756
ARG FLASH_ATTN3_DIR
5857
ARG FLASH_ATTN_URL
58+
# force Hopper (sm90) for now; you can change it through build args
59+
ARG FLASH_ATTN_CUDA_ARCHS="90"
60+
ARG FLASH_ATTENTION_DISABLE_SM80="TRUE"
5961

6062
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
6163
git clone $(echo ${FLASH_ATTN_URL} | cut -d '@' -f 1) && \
@@ -119,42 +121,41 @@ WORKDIR ${CODESPACE}/causal-conv1d
119121

120122
RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip wheel -w ${CAUSAL_CONV1D_DIR} -v --no-deps --no-build-isolation .
121123

122-
# pypi install nvshmem and compile deepep
124+
# compile nvshmem and deepep
123125
FROM setup_env AS deep_ep
124126

125127
ARG CODESPACE
126128
ARG DEEP_EP_DIR
127129
ARG DEEP_EP_URL
128-
# build sm90 and sm100 for deep_ep for now
129-
ARG TORCH_CUDA_ARCH_LIST="9.0 10.0"
130130

131+
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
132+
# curl -LO https://github.com/NVIDIA/nvshmem/releases/download/v3.4.5-0/nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
133+
# tar -zxvf nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
134+
# cd ${CODESPACE}/nvshmem_src && \
135+
# NVSHMEM_SHMEM_SUPPORT=0 \
136+
# NVSHMEM_UCX_SUPPORT=0 \
137+
# NVSHMEM_USE_NCCL=0 \
138+
# NVSHMEM_MPI_SUPPORT=0 \
139+
# NVSHMEM_IBGDA_SUPPORT=1 \
140+
# NVSHMEM_USE_GDRCOPY=0 \
141+
# NVSHMEM_PMIX_SUPPORT=0 \
142+
# NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
143+
# NVSHMEM_BUILD_TESTS=0 \
144+
# NVSHMEM_BUILD_EXAMPLES=0 \
145+
# NVSHMEM_BUILD_HYDRA_LAUNCHER=0 \
146+
# NVSHMEM_BUILD_TXZ_PACKAGE=0 \
147+
# NVSHMEM_BUILD_PYTHON_LIB=OFF \
148+
# cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_PREFIX} -DMLX5_lib=/lib/x86_64-linux-gnu/libmlx5.so.1 && \
149+
# cmake --build build --target install --parallel 32 && \
131150
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
132-
curl -LO https://github.com/NVIDIA/nvshmem/releases/download/v3.4.5-0/nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
133-
tar -zxvf nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
134-
cd ${CODESPACE}/nvshmem_src && \
135-
NVSHMEM_SHMEM_SUPPORT=0 \
136-
NVSHMEM_UCX_SUPPORT=0 \
137-
NVSHMEM_USE_NCCL=0 \
138-
NVSHMEM_MPI_SUPPORT=0 \
139-
NVSHMEM_IBGDA_SUPPORT=1 \
140-
NVSHMEM_USE_GDRCOPY=0 \
141-
NVSHMEM_PMIX_SUPPORT=0 \
142-
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
143-
NVSHMEM_BUILD_TESTS=0 \
144-
NVSHMEM_BUILD_EXAMPLES=0 \
145-
NVSHMEM_BUILD_HYDRA_LAUNCHER=0 \
146-
NVSHMEM_BUILD_TXZ_PACKAGE=0 \
147-
NVSHMEM_BUILD_PYTHON_LIB=OFF \
148-
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_PREFIX} -DMLX5_lib=/lib/x86_64-linux-gnu/libmlx5.so.1 && \
149-
cmake --build build --target install --parallel 32 && \
150151
cd ${CODESPACE} && git clone $(echo ${DEEP_EP_URL} | cut -d '@' -f 1) && \
151152
cd ${CODESPACE}/DeepEP && \
152153
git checkout $(echo ${DEEP_EP_URL} | cut -d '@' -f 2) && \
153154
git submodule update --init --recursive --force
154155

155156
WORKDIR ${CODESPACE}/DeepEP
156157

157-
RUN NVSHMEM_DIR=${NVSHMEM_PREFIX} pip wheel -w ${DEEP_EP_DIR} -v --no-deps .
158+
RUN pip wheel -w ${DEEP_EP_DIR} -v --no-deps .
158159

159160
# compile deep_gemm
160161
FROM setup_env AS deep_gemm
@@ -192,7 +193,7 @@ COPY --from=flash_attn ${FLASH_ATTN_DIR} ${FLASH_ATTN_DIR}
192193
COPY --from=adaptive_gemm ${ADAPTIVE_GEMM_DIR} ${ADAPTIVE_GEMM_DIR}
193194
COPY --from=grouped_gemm ${GROUPED_GEMM_DIR} ${GROUPED_GEMM_DIR}
194195
COPY --from=deep_ep ${DEEP_EP_DIR} ${DEEP_EP_DIR}
195-
COPY --from=deep_ep ${NVSHMEM_PREFIX} ${NVSHMEM_PREFIX}
196+
# COPY --from=deep_ep ${NVSHMEM_PREFIX} ${NVSHMEM_PREFIX}
196197
COPY --from=deep_gemm ${DEEP_GEMM_DIR} ${DEEP_GEMM_DIR}
197198
COPY --from=causal_conv1d ${CAUSAL_CONV1D_DIR} ${CAUSAL_CONV1D_DIR}
198199

@@ -204,11 +205,16 @@ RUN unzip ${DEEP_EP_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
204205
RUN unzip ${DEEP_GEMM_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
205206
RUN unzip ${CAUSAL_CONV1D_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
206207

207-
# install sglang and its runtime requirements
208+
ARG DEFAULT_PYPI_URL
209+
210+
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
211+
RUN pip install pystack py-spy --no-cache-dir -i ${DEFAULT_PYPI_URL}
208212

213+
# install sglang and its runtime requirements
209214
ENV XTUNER_SGLANG_ENVS_DIR=/envs/sglang
210215

211-
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
216+
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
217+
RUN \
212218
pip install --target ${XTUNER_SGLANG_ENVS_DIR} \
213219
sglang==0.5.9 sgl-kernel==0.3.21 \
214220
apache-tvm-ffi==0.1.9 \
@@ -229,7 +235,7 @@ RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
229235
torchao==0.9.0 \
230236
torchaudio==2.9.1 \
231237
torchcodec==0.8.0 \
232-
xgrammar==0.1.27 \
238+
xgrammar==0.1.32 \
233239
smg-grpc-proto==0.4.5 \
234240
grpcio==1.78.1 \
235241
grpcio-reflection==1.78.1 \
@@ -250,54 +256,60 @@ RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
250256
llguidance==0.7.11 blobfile==3.0.0 \
251257
pybase64 orjson uvloop setproctitle msgspec \
252258
compressed_tensors python-multipart \
253-
hf_transfer interegular --no-cache-dir --no-deps
259+
hf_transfer interegular --no-cache-dir --no-deps -i ${DEFAULT_PYPI_URL}
254260

255261
# install lmdeploy and its missing runtime requirements
256262
ARG LMDEPLOY_VERSION
257263
ARG LMDEPLOY_URL
258264
ENV XTUNER_LMDEPLOY_ENVS_DIR=/envs/lmdeploy
259265

266+
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
267+
ARG LMDEPLOY_WHEELS=https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu128-cp312-cp312-manylinux2014_x86_64.whl
260268
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
269+
--mount=type=secret,id=NO_PROXY,env=no_proxy \
261270
pip install fastapi fire openai outlines \
262-
partial_json_parser ray[default] shortuuid uvicorn \
263-
'pydantic>2' openai_harmony dlblas --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-cache-dir && \
271+
partial_json_parser 'ray[default]<3' shortuuid uvicorn \
272+
'pydantic>2' openai_harmony dlblas --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-cache-dir -i ${DEFAULT_PYPI_URL} && \
264273
if [ -n "${LMDEPLOY_VERSION}" ]; then \
265-
pip install lmdeploy==${LMDEPLOY_VERSION} --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir; \
274+
# pip install lmdeploy==${LMDEPLOY_VERSION} --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
275+
echo pip install ${LMDEPLOY_WHEELS} --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
276+
pip install ${LMDEPLOY_WHEELS} --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
266277
else \
267278
git clone $(echo ${LMDEPLOY_URL} | cut -d '@' -f 1) && \
268279
cd ${CODESPACE}/lmdeploy && \
269280
git checkout $(echo ${LMDEPLOY_URL} | cut -d '@' -f 2) && \
270-
pip install . -v --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir; \
281+
pip install . -v --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
271282
fi
272283

273284
## install xtuner
274285
ARG XTUNER_URL
275286
ARG XTUNER_COMMIT
276-
#RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
287+
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
277288
# git clone $(echo ${XTUNER_URL} | cut -d '@' -f 1) && \
278289
# cd ${CODESPACE}/xtuner && \
279290
# git checkout $(echo ${XTUNER_URL} | cut -d '@' -f 2)
280291
COPY . ${CODESPACE}/xtuner
281292

282293
WORKDIR ${CODESPACE}/xtuner
283-
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
284-
pip install .[all] -v --no-cache-dir
285294

286295
# Install custom .pth file for conditional lmdeploy and sglang path injection
287296
RUN cp .dev_scripts/xtuner_rl_path.pth ${PYTHON_SITE_PACKAGE_PATH}/xtuner_rl_path.pth
288297

298+
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
299+
RUN pip install .[all] -v --no-cache-dir -i ${DEFAULT_PYPI_URL}
300+
289301
WORKDIR ${CODESPACE}
290302

291303
# nccl update for torch 2.6.0
292-
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
293-
if [ "x${TORCH_VERSION}" = "x2.6.0" ]; then \
294-
pip install nvidia-nccl-cu12==2.25.1 --no-cache-dir; \
304+
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
305+
RUN if [ "x${TORCH_VERSION}" = "x2.6.0" ]; then \
306+
pip install nvidia-nccl-cu12==2.25.1 --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
295307
fi
296308

297309
# cudnn update for torch 2.9.1
298-
RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
299-
if [ "x${TORCH_VERSION}" = "x2.9.1" ]; then \
300-
pip install nvidia-cudnn-cu12==9.15.1.9 --no-cache-dir; \
310+
# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
311+
RUN if [ "x${TORCH_VERSION}" = "x2.9.1" ]; then \
312+
pip install nvidia-cudnn-cu12==9.15.1.9 --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
301313
fi
302314

303315
# setup sysctl

image_build.sh

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,22 +10,27 @@ export DEEP_EP_URL=https://github.com/deepseek-ai/DeepEP@9af0e0d0e74f3577af1979c
1010
export DEEP_GEMM_URL=https://github.com/deepseek-ai/DeepGEMM@c9f8b34dcdacc20aa746b786f983492c51072870 # v2.1.1.post3
1111
export CAUSAL_CONV1D_URL=https://github.com/Dao-AILab/causal-conv1d@da6dbaa9fd5a919967f14d3fd031da1288ad5025 # v1.6.0
1212

13-
export TORCH_VERSION=${TORCH_VERSION:-"2.8.0"}
14-
export LMDEPLOY_VERSION="0.11.0"
15-
# export LMDEPLOY_URL=https://github.com/InternLM/lmdeploy@a9a24fbd8985374cb01ecb6021d1ce9668253c9c
13+
export TORCH_VERSION=${TORCH_VERSION:-"2.9.0"}
14+
export LMDEPLOY_VERSION="0.12.2"
15+
# export LMDEPLOY_URL=https://github.com/InternLM/lmdeploy@9a50f1f4eaf1e4fbe45892bc8017a7359237160c
1616
export PPA_SOURCE="https://mirrors.aliyun.com"
17-
export SGLANG_VERSION="0.5.3"
17+
export DEFAULT_PYPI_URL=${DEFAULT_PYPI_URL:-"https://mirrors.aliyun.com/pypi/simple"}
18+
# mirror https://download.pytorch.org/whl
19+
export PYTORCH_WHEELS_URL=${PYTORCH_WHEELS_URL:-"https://download.pytorch.org/whl"}
1820

1921
image_name=${IMAGE_NAME:-"xtuner"}
2022
image_tag=${IMAGE_TAG:-"pt$(echo ${TORCH_VERSION} | awk -F. '{print $1$2}')_$(date +%Y%m%d)_${XTUNER_COMMIT:0:7}"}
2123

2224
docker build . \
2325
-t "$image_name:$image_tag" \
2426
--secret id=HTTPS_PROXY \
27+
--secret id=NO_PROXY \
2528
--build-arg TORCH_VERSION=$TORCH_VERSION\
2629
--build-arg BASE_IMAGE=$BASE_IMAGE \
27-
--build-arg PPA_SOURCE=$PPA_SOURCE \
28-
--build-arg ADAPTIVE_GEMM_URL=$ADAPTIVE_GEMM_URL \
30+
--build-arg PPA_SOURCE="$PPA_SOURCE" \
31+
--build-arg DEFAULT_PYPI_URL="$DEFAULT_PYPI_URL" \
32+
--build-arg PYTORCH_WHEELS_URL="$PYTORCH_WHEELS_URL" \
33+
--build-arg ADAPTIVE_GEMM_URL="$ADAPTIVE_GEMM_URL" \
2934
--build-arg FLASH_ATTN_URL=$FLASH_ATTN_URL \
3035
--build-arg GROUPED_GEMM_URL=$GROUPED_GEMM_URL \
3136
--build-arg CAUSAL_CONV1D_URL=$CAUSAL_CONV1D_URL \
@@ -34,8 +39,6 @@ docker build . \
3439
--build-arg XTUNER_URL=$XTUNER_URL \
3540
--build-arg XTUNER_COMMIT=$XTUNER_COMMIT \
3641
--build-arg LMDEPLOY_VERSION=$LMDEPLOY_VERSION \
37-
--build-arg LMDEPLOY_URL=$LMDEPLOY_URL \
38-
--build-arg SGLANG_VERSION=$SGLANG_VERSION \
3942
--progress=plain \
4043
--label "BASE_IMAGE=$BASE_IMAGE" \
4144
--label "XTUNER_URL=${XTUNER_URL/@/\/tree\/}" \
@@ -46,5 +49,4 @@ docker build . \
4649
--label "CAUSAL_CONV1D_URL=${CAUSAL_CONV1D_URL/@/\/tree\/}" \
4750
--label "DEEP_EP_URL=${DEEP_EP_URL/@/\/tree\/}" \
4851
--label "DEEP_GEMM_URL=${DEEP_GEMM_URL/@/\/tree\/}" \
49-
--label "LMDEPLOY_VERSION=$LMDEPLOY_VERSION" \
50-
--label "SGLANG_VERSION=$SGLANG_VERSION"
52+
--label "LMDEPLOY_VERSION=$LMDEPLOY_VERSION"

0 commit comments

Comments
 (0)