Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8099b8a
Untested files scripts for lumi
mbanon Jul 10, 2025
a55c8d4
Add files via upload
mbanon Jul 10, 2025
69fa7f1
Added script for register labels
mbanon Jul 11, 2025
42758a3
Merge branch 'lumify' of https://github.com/hplt-project/data-analyti…
mbanon Jul 11, 2025
b6de5d4
Added script for samplings
mbanon Jul 11, 2025
564023b
Analytics with HyperQueue implementation
ZJaume Aug 19, 2025
2b6fc42
Remove map and reduce separated scripts
ZJaume Aug 19, 2025
6f5a597
Add write_yaml step
ZJaume Aug 20, 2025
82c51c0
Add missing reduce hardrules
ZJaume Aug 20, 2025
6792c62
Lumi fixes and README
ZJaume Aug 22, 2025
388f737
More README
ZJaume Aug 22, 2025
c3795df
ignore vim swap files
ZJaume Sep 9, 2025
24856af
Do not use TLDextract cache
ZJaume Sep 9, 2025
58d74f9
Write intermediate sorted ngrams to disk
ZJaume Sep 9, 2025
15a45af
Refactor job submission into a single jobdef file per lang
ZJaume Sep 9, 2025
7030097
Remove output workdir at the end if not debug
ZJaume Sep 9, 2025
d0918f8
Pre-download nltk stuff when building container
ZJaume Sep 9, 2025
1da3a2e
add orjson dep
ZJaume Sep 9, 2025
dc04325
Fix stopwords for Macedonian
ZJaume Sep 29, 2025
03b3cc9
Raise exception if tokenization fails
ZJaume Sep 29, 2025
e35ce4e
Add re-use of register labels and sample extraction
ZJaume Sep 29, 2025
ff3327b
Reduce token counts without eating all ram
ZJaume Sep 29, 2025
e7d0d12
Merge branch 'main' into lumify
ZJaume Oct 27, 2025
80bb848
Add srclang2
ZJaume Oct 27, 2025
01de7d3
Add support for finepdfs
ZJaume Oct 28, 2025
10049eb
Add memory restriction in the readme allocation example
ZJaume Nov 11, 2025
81b60ed
Fix langcode matching in finepdfs
ZJaume Nov 11, 2025
b2ac79c
Avoid OOM with finepdfs
ZJaume Nov 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.sif
*.swp
__pycache__/
39 changes: 21 additions & 18 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#No GPU support needed by now...
#from nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
#FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
FROM ubuntu:22.04@sha256:1ec65b2719518e27d4d25f104d93f9fac60dc437f81452302406825c46fcc9cb


ENV BINPATH=/opt/bin
Expand Down Expand Up @@ -28,7 +29,8 @@ RUN apt-get update && \
apt-get install -y wget unzip joe gcc libboost-all-dev cmake && \
apt-get install -y python3.10 python3-dev python3.10-dev python3-pip python3.10-venv && \
apt-get install -y git build-essential autoconf autopoint libtool parallel &&\
apt-get install -y hunspell libhunspell-dev jq zstd curl cuda-nvvm-12-2 gawk
apt-get install -y hunspell libhunspell-dev jq zstd curl gawk time
# apt-get install -y cuda-nvvm-12-2

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y --default-toolchain=1.77.2
ENV PATH="/root/.cargo/bin:${PATH}"
Expand Down Expand Up @@ -76,13 +78,13 @@ RUN . /work/venvs/venv-bc/bin/activate && \
python3.10 -m pip install --config-settings="--build-option=--max_order=7" https://github.com/kpu/kenlm/archive/master.zip && \
python3.10 -m pip install bicleaner==0.17.2 && python3.10 -m pip install numpy==1.26.4 && deactivate

RUN . /work/venvs/venv-bcai/bin/activate && \
python3.10 -m pip install -U pip && \
python3.10 -m pip install -U wheel && \
python3.10 -m pip install -U setuptools && \
python3.10 -m pip install git+https://github.com/MSeal/cython_hunspell@2.0.3 &&\
python3.10 -m pip install --config-settings="--build-option=--max_order=7" https://github.com/kpu/kenlm/archive/master.zip && \
python3.10 -m pip install bicleaner-ai==3.1.0 && deactivate
# RUN . /work/venvs/venv-bcai/bin/activate && \
# python3.10 -m pip install -U pip && \
# python3.10 -m pip install -U wheel && \
# python3.10 -m pip install -U setuptools && \
# python3.10 -m pip install git+https://github.com/MSeal/cython_hunspell@2.0.3 &&\
# python3.10 -m pip install --config-settings="--build-option=--max_order=7" https://github.com/kpu/kenlm/archive/master.zip && \
# python3.10 -m pip install bicleaner-ai==3.1.0 && deactivate

RUN . /work/venvs/venv-bnlp/bin/activate && \
python3.10 -m pip install -U pip && \
Expand All @@ -94,21 +96,22 @@ RUN . /work/venvs/venv-bnlp/bin/activate && \
python3.10 -m pip install bnlp-toolkit==4.0.3 &&\
echo "import nltk; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('stopwords');" | python3.10

RUN . /work/venvs/venv-rl/bin/activate && \
python3.10 -m pip install -U pip && \
python3.10 -m pip install -U wheel && \
python3.10 -m pip install -U setuptools && \
python3.10 -m pip install -r /work/deployment/requirements-rl.txt
# RUN . /work/venvs/venv-rl/bin/activate && \
# python3.10 -m pip install -U pip && \
# python3.10 -m pip install -U wheel && \
# python3.10 -m pip install -U setuptools && \
# python3.10 -m pip install -r /work/deployment/requirements-rl.txt

RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download TurkuNLP/web-register-classification-multilingual
RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download FacebookAI/xlm-roberta-large
RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download nvidia/multilingual-domain-classifier
# RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download TurkuNLP/web-register-classification-multilingual
# RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download FacebookAI/xlm-roberta-large
# RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download nvidia/multilingual-domain-classifier



RUN python3.10 -m pip install git+https://github.com/MSeal/cython_hunspell@2.0.3 &&\
python3.10 -m pip install -r /work/deployment/requirements.txt &&\
echo "import nltk; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('stopwords');" | python3.10
echo "import nltk; nltk.download('punkt',download_dir='/usr/share/nltk_data'); nltk.download('punkt_tab',download_dir='/usr/share/nltk_data'); nltk.download('stopwords',download_dir='/usr/share/nltk_data');" | python3.10
RUN fastspell-download



Expand Down
2 changes: 2 additions & 0 deletions data-analytics.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
bootstrap: docker-daemon
from: data-analytics-lumi:latest
3 changes: 2 additions & 1 deletion deployment/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@ igbo-text==0.1.3
tldextract==5.1.3
heliport==0.8.1
pandas==2.3.0
fastparquet==2024.11.0
fastparquet==2024.11.0
orjson==3.11
114 changes: 114 additions & 0 deletions scripts/lumi/01.map.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/bin/bash
# Map step: run the per-shard analytics pipeline for one language.
#
# Usage: 01.map.sh <srclang> <format> <inputfile>
#
# Arguments:
#   $1  srclang    language code of the shard (e.g. "bn" / "ben")
#   $2  format     input format passed to readdocuments.py (e.g. "hplt-v3")
#   $3  inputfile  zstd-compressed JSONL shard
#
# Environment:
#   OUT_DIR   (required) directory where all outputs are written
#   HQ_CPUS   (optional) comma-separated CPU id list exported by HyperQueue
#
# IN:  a zstd file, a language code
# OUT: a tsv, a docproc and a proc file (plus sample/rl/hardrules/langids)
set -euo pipefail

srclang=$1
format=$2
inputfile=$3
outfile="$OUT_DIR/$(basename -- "$inputfile")"
srclang2=$(python3 /work/scripts/lang_equivalent.py "$srclang")

# Determine parallelism. HyperQueue exports HQ_CPUS as a comma-separated
# list of allocated CPU ids (e.g. "0,1,2,3"), so the CPU count is the
# number of commas plus one, not the comma count itself.
# NOTE: the original tested `-z "${HQ_CPUS+x}"` (true when HQ_CPUS is
# UNSET) and then dereferenced $HQ_CPUS in that branch, which aborts
# under `set -u` whenever HyperQueue is not driving the job — and fell
# back to nproc when it was. The test is un-inverted here.
if [ -n "${HQ_CPUS+x}" ]; then
    echo "HQ_CPUS=$HQ_CPUS" >&2
    JOBS=$(( $(printf '%s' "$HQ_CPUS" | tr -cd ',' | wc -c) + 1 ))
else
    JOBS=$(nproc)
fi
echo "Running with $JOBS cpus" >&2
# Memory-heavy stages run with half the slots (at least one).
JOBS_LOW=$(( JOBS / 2 ))
JOBS_LOW=$(( JOBS_LOW > 1 ? JOBS_LOW : 1 ))
echo "Running with $JOBS_LOW cpus for high memory processes" >&2

# Resume support: the langids file is written last (via tmp+mv), so a
# non-trivially-sized one (>13 disk blocks per `du`) means this shard
# already finished on a previous run.
if [ -f "$outfile.langids.zst" ] && [ "$(du -- "$outfile.langids.zst" | cut -f1)" -gt 13 ]; then
    echo "Task already completed, skipping..." >&2
    exit 0
fi

echo "##### Read samples #####" >&2
# Keep 20 random raw-text samples for later inspection.
zstdcat "$inputfile" \
    | shuf -n20 \
    | jq .text \
    | zstdmt \
    >"$outfile.sample.zst"

echo "##### Read documents #####" >&2
# Document-level extraction; PIPESTATUS is reported before exiting so the
# failing stage can be identified in the job log.
zstdcat "$inputfile" \
    | parallel --pipe -j"$JOBS_LOW" --block 20M --halt now,fail=1 \
        python3 /work/scripts/readdocuments.py "$srclang" --format "$format" \
    | zstdmt -10 \
    >"$outfile.docproc.zst" \
    || {
        echo "Error in pipeline: ${PIPESTATUS[@]}" >&2
        exit 1
    }

echo "#### Read registerlabels #####" >&2
# Probe the last record only: web-register labels are assumed present for
# the whole shard when the final document carries them.
has_registers=$(zstdcat "$inputfile" | tail -1 | jq '."web-register" != null')
echo "Has registers: $has_registers" >&2
if [ "$format" == "hplt-v3" ] && [ "$has_registers" == "true" ]; then
    zstdcat "$inputfile" \
        | python3 /work/scripts/reuse-registerlabels.py \
        | zstdmt \
        >"$outfile.rl.zst" \
        || {
            echo "Error in pipeline: ${PIPESTATUS[@]}" >&2
            exit 1
        }
fi

echo "##### Read corpus #####" >&2
# Column 7 of the docproc TSV holds the segment text; drop empty lines.
zstdcat "$outfile.docproc.zst" \
    | cut -f 7 \
    | awk 'length() == 0{next;} {print;}' \
    | zstdmt -10 \
    > "$outfile.tsv.zst"

# Bengali needs the bnlp virtualenv for tokenization.
if [ "$srclang" = "bn" ] || [ "$srclang" = "ben" ]; then
    source /work/venvs/venv-bnlp/bin/activate
fi
zstdcat "$outfile.tsv.zst" \
    | parallel --pipe -j"$JOBS_LOW" --block 10M --halt now,fail=1 \
        python3 /work/scripts/readcorpus_mono.py "$srclang" "$srclang2" --quiet \
    | zstdmt -10 \
    >"$outfile.proc.zst" \
    || {
        echo "Error in pipeline: ${PIPESTATUS[@]}" >&2
        exit 1
    }

if [ "$srclang" = "bn" ] || [ "$srclang" = "ben" ]; then
    deactivate
fi

echo "##### Monocleaner #####" >&2
source /work/venvs/venv-mc/bin/activate
# `cache -k 1` deduplicates identical segments so hardrules scores each
# distinct line only once.
zstdcat "$outfile.tsv.zst" \
    | /work/preprocess/build/bin/cache -k 1 parallel -k -j "$JOBS" --block 10M --pipe --halt now,fail=1 \
        monocleaner-hardrules \
            --score_only --annotated_output \
            --run_all_rules --disable_lang_ident \
            "$srclang" - - \
    | zstdmt \
    > "$outfile.hardrules.zst" \
    || {
        echo "Error in pipeline: ${PIPESTATUS[@]}" >&2
        exit 1
    }
deactivate

echo "##### Fastspell #####" >&2
# Written via tmp+mv so the resume check above never sees a partial file.
zstdcat "$outfile.tsv.zst" \
    | parallel --pipe -j"$JOBS" --block 20M --halt now,fail=1 \
        fastspell --aggr "$srclang" --quiet \
    | cut -f2 \
    | zstdmt \
    >"$outfile.langids.zst.tmp" \
    || {
        echo "Error in pipeline: ${PIPESTATUS[@]}" >&2
        exit 1
    }
mv -- "$outfile.langids.zst.tmp" "$outfile.langids.zst"

# Intermediate TSV is only needed within this task.
rm -- "$outfile.tsv.zst"
Loading