Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8099b8a
Untested files scripts for lumi
mbanon Jul 10, 2025
a55c8d4
Add files via upload
mbanon Jul 10, 2025
69fa7f1
Added script for register labels
mbanon Jul 11, 2025
42758a3
Merge branch 'lumify' of https://github.com/hplt-project/data-analyti…
mbanon Jul 11, 2025
b6de5d4
Added script for samplings
mbanon Jul 11, 2025
564023b
Analytics with HyperQueue implementation
ZJaume Aug 19, 2025
2b6fc42
Remove map and reduce separated scripts
ZJaume Aug 19, 2025
6f5a597
Add write_yaml step
ZJaume Aug 20, 2025
82c51c0
Add missing reduce hardrules
ZJaume Aug 20, 2025
6792c62
Lumi fixes and README
ZJaume Aug 22, 2025
388f737
More README
ZJaume Aug 22, 2025
c3795df
ignore vim swap files
ZJaume Sep 9, 2025
24856af
Do not use TLDextract cache
ZJaume Sep 9, 2025
58d74f9
Write intermediate sorted ngrams to disk
ZJaume Sep 9, 2025
15a45af
Refactor job submission into a single jobdef file per lang
ZJaume Sep 9, 2025
7030097
Remove output workdir at the end if not debug
ZJaume Sep 9, 2025
d0918f8
Pre-download nltk stuff when building container
ZJaume Sep 9, 2025
1da3a2e
add orjson dep
ZJaume Sep 9, 2025
dc04325
Fix stopwords for Macedonian
ZJaume Sep 29, 2025
03b3cc9
Raise exception if tokenization fails
ZJaume Sep 29, 2025
e35ce4e
Add re-use of register labels and sample extraction
ZJaume Sep 29, 2025
ff3327b
Reduce token counts without eating all ram
ZJaume Sep 29, 2025
e7d0d12
Merge branch 'main' into lumify
ZJaume Oct 27, 2025
80bb848
Add srclang2
ZJaume Oct 27, 2025
01de7d3
Add support for finepdfs
ZJaume Oct 28, 2025
10049eb
Add memory restriction in the readme allocation example
ZJaume Nov 11, 2025
81b60ed
Fix langcode matching in finepdfs
ZJaume Nov 11, 2025
b2ac79c
Avoid OOM with finepdfs
ZJaume Nov 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.sif
*.swp
__pycache__/
39 changes: 21 additions & 18 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#No GPU support needed by now...
#from nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
#FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
FROM ubuntu:22.04@sha256:1ec65b2719518e27d4d25f104d93f9fac60dc437f81452302406825c46fcc9cb


ENV BINPATH=/opt/bin
Expand Down Expand Up @@ -28,7 +29,8 @@ RUN apt-get update && \
apt-get install -y wget unzip joe gcc libboost-all-dev cmake && \
apt-get install -y python3.10 python3-dev python3.10-dev python3-pip python3.10-venv && \
apt-get install -y git build-essential autoconf autopoint libtool parallel &&\
apt-get install -y hunspell libhunspell-dev jq zstd curl cuda-nvvm-12-2 gawk
apt-get install -y hunspell libhunspell-dev jq zstd curl gawk time
# apt-get install -y cuda-nvvm-12-2

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y --default-toolchain=1.77.2
ENV PATH="/root/.cargo/bin:${PATH}"
Expand Down Expand Up @@ -76,13 +78,13 @@ RUN . /work/venvs/venv-bc/bin/activate && \
python3.10 -m pip install --config-settings="--build-option=--max_order=7" https://github.com/kpu/kenlm/archive/master.zip && \
python3.10 -m pip install bicleaner==0.17.2 && python3.10 -m pip install numpy==1.26.4 && deactivate

RUN . /work/venvs/venv-bcai/bin/activate && \
python3.10 -m pip install -U pip && \
python3.10 -m pip install -U wheel && \
python3.10 -m pip install -U setuptools && \
python3.10 -m pip install git+https://github.com/MSeal/cython_hunspell@2.0.3 &&\
python3.10 -m pip install --config-settings="--build-option=--max_order=7" https://github.com/kpu/kenlm/archive/master.zip && \
python3.10 -m pip install bicleaner-ai==3.1.0 && deactivate
# RUN . /work/venvs/venv-bcai/bin/activate && \
# python3.10 -m pip install -U pip && \
# python3.10 -m pip install -U wheel && \
# python3.10 -m pip install -U setuptools && \
# python3.10 -m pip install git+https://github.com/MSeal/cython_hunspell@2.0.3 &&\
# python3.10 -m pip install --config-settings="--build-option=--max_order=7" https://github.com/kpu/kenlm/archive/master.zip && \
# python3.10 -m pip install bicleaner-ai==3.1.0 && deactivate

RUN . /work/venvs/venv-bnlp/bin/activate && \
python3.10 -m pip install -U pip && \
Expand All @@ -94,21 +96,22 @@ RUN . /work/venvs/venv-bnlp/bin/activate && \
python3.10 -m pip install bnlp-toolkit==4.0.3 &&\
echo "import nltk; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('stopwords');" | python3.10

RUN . /work/venvs/venv-rl/bin/activate && \
python3.10 -m pip install -U pip && \
python3.10 -m pip install -U wheel && \
python3.10 -m pip install -U setuptools && \
python3.10 -m pip install -r /work/deployment/requirements-rl.txt
# RUN . /work/venvs/venv-rl/bin/activate && \
# python3.10 -m pip install -U pip && \
# python3.10 -m pip install -U wheel && \
# python3.10 -m pip install -U setuptools && \
# python3.10 -m pip install -r /work/deployment/requirements-rl.txt

RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download TurkuNLP/web-register-classification-multilingual
RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download FacebookAI/xlm-roberta-large
RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download nvidia/multilingual-domain-classifier
# RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download TurkuNLP/web-register-classification-multilingual
# RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download FacebookAI/xlm-roberta-large
# RUN . /work/venvs/venv-rl/bin/activate && huggingface-cli download nvidia/multilingual-domain-classifier



RUN python3.10 -m pip install git+https://github.com/MSeal/cython_hunspell@2.0.3 &&\
python3.10 -m pip install -r /work/deployment/requirements.txt &&\
echo "import nltk; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('stopwords');" | python3.10
echo "import nltk; nltk.download('punkt',download_dir='/usr/share/nltk_data'); nltk.download('punkt_tab',download_dir='/usr/share/nltk_data'); nltk.download('stopwords',download_dir='/usr/share/nltk_data');" | python3.10
RUN fastspell-download



Expand Down
2 changes: 2 additions & 0 deletions data-analytics.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
bootstrap: docker-daemon
from: data-analytics-lumi:latest
3 changes: 2 additions & 1 deletion deployment/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@ igbo-text==0.1.3
tldextract==5.1.3
heliport==0.8.1
pandas==2.3.0
fastparquet==2024.11.0
fastparquet==2024.11.0
orjson==3.11
114 changes: 114 additions & 0 deletions scripts/lumi/01.map.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/bin/bash
# Map step: run the per-shard analytics pipeline for one language.
#
# Usage: 01.map.sh <srclang> <format> <inputfile>
#
# Arguments:
#   $1  srclang    language code of the shard (e.g. "bn" / "ben")
#   $2  format     input format passed to readdocuments.py (e.g. "hplt-v3")
#   $3  inputfile  zstd-compressed JSONL shard
#
# Environment:
#   OUT_DIR   (required) directory where all outputs are written
#   HQ_CPUS   (optional) comma-separated CPU id list exported by HyperQueue
#
# IN:  a zstd file, a language code
# OUT: a tsv, a docproc and a proc file (plus sample/rl/hardrules/langids)
set -euo pipefail

srclang=$1
format=$2
inputfile=$3
outfile="$OUT_DIR/$(basename -- "$inputfile")"
srclang2=$(python3 /work/scripts/lang_equivalent.py "$srclang")

# Determine parallelism. HyperQueue exports HQ_CPUS as a comma-separated
# list of allocated CPU ids (e.g. "0,1,2,3"), so the CPU count is the
# number of commas plus one, not the comma count itself.
# NOTE: the original tested `-z "${HQ_CPUS+x}"` (true when HQ_CPUS is
# UNSET) and then dereferenced $HQ_CPUS in that branch, which aborts
# under `set -u` whenever HyperQueue is not driving the job — and fell
# back to nproc when it was. The test is un-inverted here.
if [ -n "${HQ_CPUS+x}" ]; then
    echo "HQ_CPUS=$HQ_CPUS" >&2
    JOBS=$(( $(printf '%s' "$HQ_CPUS" | tr -cd ',' | wc -c) + 1 ))
else
    JOBS=$(nproc)
fi
echo "Running with $JOBS cpus" >&2
# Memory-heavy stages run with half the slots (at least one).
JOBS_LOW=$(( JOBS / 2 ))
JOBS_LOW=$(( JOBS_LOW > 1 ? JOBS_LOW : 1 ))
echo "Running with $JOBS_LOW cpus for high memory processes" >&2

# Resume support: the langids file is written last (via tmp+mv), so a
# non-trivially-sized one (>13 disk blocks per `du`) means this shard
# already finished on a previous run.
if [ -f "$outfile.langids.zst" ] && [ "$(du -- "$outfile.langids.zst" | cut -f1)" -gt 13 ]; then
    echo "Task already completed, skipping..." >&2
    exit 0
fi

echo "##### Read samples #####" >&2
# Keep 20 random raw-text samples for later inspection.
zstdcat "$inputfile" \
    | shuf -n20 \
    | jq .text \
    | zstdmt \
    >"$outfile.sample.zst"

echo "##### Read documents #####" >&2
# Document-level extraction; PIPESTATUS is reported before exiting so the
# failing stage can be identified in the job log.
zstdcat "$inputfile" \
    | parallel --pipe -j"$JOBS_LOW" --block 20M --halt now,fail=1 \
        python3 /work/scripts/readdocuments.py "$srclang" --format "$format" \
    | zstdmt -10 \
    >"$outfile.docproc.zst" \
    || {
        echo "Error in pipeline: ${PIPESTATUS[@]}" >&2
        exit 1
    }

echo "#### Read registerlabels #####" >&2
# Probe the last record only: web-register labels are assumed present for
# the whole shard when the final document carries them.
has_registers=$(zstdcat "$inputfile" | tail -1 | jq '."web-register" != null')
echo "Has registers: $has_registers" >&2
if [ "$format" == "hplt-v3" ] && [ "$has_registers" == "true" ]; then
    zstdcat "$inputfile" \
        | python3 /work/scripts/reuse-registerlabels.py \
        | zstdmt \
        >"$outfile.rl.zst" \
        || {
            echo "Error in pipeline: ${PIPESTATUS[@]}" >&2
            exit 1
        }
fi

echo "##### Read corpus #####" >&2
# Column 7 of the docproc TSV holds the segment text; drop empty lines.
zstdcat "$outfile.docproc.zst" \
    | cut -f 7 \
    | awk 'length() == 0{next;} {print;}' \
    | zstdmt -10 \
    > "$outfile.tsv.zst"

# Bengali needs the bnlp virtualenv for tokenization.
if [ "$srclang" = "bn" ] || [ "$srclang" = "ben" ]; then
    source /work/venvs/venv-bnlp/bin/activate
fi
zstdcat "$outfile.tsv.zst" \
    | parallel --pipe -j"$JOBS_LOW" --block 10M --halt now,fail=1 \
        python3 /work/scripts/readcorpus_mono.py "$srclang" "$srclang2" --quiet \
    | zstdmt -10 \
    >"$outfile.proc.zst" \
    || {
        echo "Error in pipeline: ${PIPESTATUS[@]}" >&2
        exit 1
    }

if [ "$srclang" = "bn" ] || [ "$srclang" = "ben" ]; then
    deactivate
fi

echo "##### Monocleaner #####" >&2
source /work/venvs/venv-mc/bin/activate
# `cache -k 1` deduplicates identical segments so hardrules scores each
# distinct line only once.
zstdcat "$outfile.tsv.zst" \
    | /work/preprocess/build/bin/cache -k 1 parallel -k -j "$JOBS" --block 10M --pipe --halt now,fail=1 \
        monocleaner-hardrules \
            --score_only --annotated_output \
            --run_all_rules --disable_lang_ident \
            "$srclang" - - \
    | zstdmt \
    > "$outfile.hardrules.zst" \
    || {
        echo "Error in pipeline: ${PIPESTATUS[@]}" >&2
        exit 1
    }
deactivate

echo "##### Fastspell #####" >&2
# Written via tmp+mv so the resume check above never sees a partial file.
zstdcat "$outfile.tsv.zst" \
    | parallel --pipe -j"$JOBS" --block 20M --halt now,fail=1 \
        fastspell --aggr "$srclang" --quiet \
    | cut -f2 \
    | zstdmt \
    >"$outfile.langids.zst.tmp" \
    || {
        echo "Error in pipeline: ${PIPESTATUS[@]}" >&2
        exit 1
    }
mv -- "$outfile.langids.zst.tmp" "$outfile.langids.zst"

# Intermediate TSV is only needed within this task.
rm -- "$outfile.tsv.zst"
Loading