dataforgoodfr · hugros-93 · Mar 3, 2026 · Feb 16, 2026 · Feb 17, 2026 · Feb 18, 2026
diff --git a/.gitignore b/.gitignore
@@ -165,3 +165,6 @@ s3/
 
 # Parsing generated artifacts
 ingestion/parsing/output/extracted_texts/
+eu_fact_force/exploration/docling/results/html/
+eu_fact_force/exploration/docling/results/json/
+eu_fact_force/exploration/docling/results/md/
diff --git a/eu_fact_force/exploration/docling/docling_experiment.py b/eu_fact_force/exploration/docling/docling_experiment.py
@@ -0,0 +1,116 @@
+import json
+import time
+from pathlib import Path
+from tqdm import tqdm
+from docling.document_converter import DocumentConverter
+from hierarchical.postprocessor import ResultPostprocessor
+from PyPDF2 import PdfReader
+
+
+def get_doc_list(doc_path):
+    """Return the names of the regular files directly inside ``doc_path``.
+
+    Args:
+        doc_path: ``pathlib.Path`` of the directory to scan (not recursive).
+
+    Returns:
+        A list of file names (without the directory part), sorted
+        alphabetically. Subdirectories are skipped.
+    """
+    # Path.iterdir() yields entries in arbitrary, filesystem-dependent order;
+    # sorting keeps the benchmark order reproducible across runs and machines.
+    return sorted(entry.name for entry in doc_path.iterdir() if entry.is_file())
+
+
+def run_mini_benchmark_docling():
+    """Benchmark Docling on every file in ``docs/`` and export the results.
+
+    For each file, the document is converted with ``DocumentConverter`` and
+    then passed through ``ResultPostprocessor``; the measured time covers both
+    steps. Per-file metrics (total time, page count, time per page, character
+    count of the text export) are collected in
+    ``results/mini_benchmark_results_docling.json``, which is rewritten after
+    every file so completed entries are on disk even if a later file fails.
+    The converted document is also exported to ``results/md/``,
+    ``results/html/`` and ``results/json/``.
+
+    All paths are relative to the current working directory.
+    """
+    # Get list of files
+    doc_path = Path("docs/")
+    list_files = get_doc_list(doc_path)
+    print(f"> number of files: {len(list_files)}")
+
+    # The export folders are git-ignored, so they do not exist on a fresh
+    # checkout; create them up front instead of failing on the first write.
+    for subfolder in ("md", "html", "json"):
+        Path("results", subfolder).mkdir(parents=True, exist_ok=True)
+
+    # Define Docling converter
+    converter = DocumentConverter()
+
+    # Run experiment
+    print("### Starting - docling benchmark ###")
+    experiment_results = {}
+    for filename in tqdm(list_files):
+        # Select file
+        print(f"> selected file: {filename}")
+        file_path = Path(filename)
+        doc_name = file_path.stem
+
+        # Convert; perf_counter() is monotonic and high-resolution, so the
+        # measurement is not disturbed by system clock adjustments.
+        start_time = time.perf_counter()
+        result = converter.convert(doc_path / file_path)
+        ResultPostprocessor(result).process()
+        total_time = time.perf_counter() - start_time
+
+        # Save results
+        total_pages = len(result.pages)
+        experiment_results[filename] = {
+            "total_time": total_time,
+            "total_pages": total_pages,
+            # None (JSON null) rather than a ZeroDivisionError when the
+            # conversion produced no pages.
+            "time_per_page": total_time / total_pages if total_pages else None,
+            "total_chars": len(result.document.export_to_text()),
+        }
+        with open("results/mini_benchmark_results_docling.json", "w") as f:
+            json.dump(experiment_results, f)
+
+        # Export files
+        doc_markdown = result.document.export_to_markdown()
+        with open(f"results/md/{doc_name}_docling.md", "w", encoding="utf-8") as f:
+            f.write(doc_markdown)
+
+        doc_html = result.document.export_to_html()
+        with open(f"results/html/{doc_name}_docling.html", "w", encoding="utf-8") as f:
+            f.write(doc_html)
+
+        doc_json = result.document.export_to_dict()
+        with open(f"results/json/{doc_name}_docling.json", "w", encoding="utf-8") as f:
+            json.dump(doc_json, f, ensure_ascii=False)
+
+
+def run_mini_benchmark_pypdf2():
+    """Benchmark PyPDF2 text extraction on every file in ``docs/``.
+
+    For each file, the PDF is opened with ``PdfReader`` and the text of every
+    page is extracted once; the measured time covers both opening and
+    extraction. Per-file metrics (total time, page count, time per page,
+    character count of the joined text) are collected in
+    ``results/mini_benchmark_results_pypdf2.json``, which is rewritten after
+    every file so completed entries are on disk even if a later file fails.
+    The per-page text is also exported to ``results/json/``.
+
+    All paths are relative to the current working directory.
+    """
+    # Get list of files
+    doc_path = Path("docs/")
+    list_files = get_doc_list(doc_path)
+    print(f"> number of files: {len(list_files)}")
+
+    # The export folder is git-ignored, so it does not exist on a fresh
+    # checkout; create it up front instead of failing on the first write.
+    Path("results", "json").mkdir(parents=True, exist_ok=True)
+
+    # Run experiment
+    print("### Starting - pypdf2 benchmark ###")
+    experiment_results = {}
+    for filename in tqdm(list_files):
+        # Select file
+        print(f"> selected file: {filename}")
+        file_path = Path(filename)
+        doc_name = file_path.stem
+
+        # Open and extract inside the timed region: the text only comes from
+        # page.extract_text(), so timing the reader construction alone would
+        # leave out the work being benchmarked. Each page is extracted once
+        # and the result is reused for both the metrics and the export.
+        start_time = time.perf_counter()
+        result = PdfReader(doc_path / file_path)
+        page_texts = [page.extract_text() or "" for page in result.pages]
+        total_time = time.perf_counter() - start_time
+
+        # Get full text (pages without any text are left out of the join)
+        full_text = "\n".join(text for text in page_texts if text)
+
+        # Save results
+        total_pages = len(page_texts)
+        experiment_results[filename] = {
+            "total_time": total_time,
+            "total_pages": total_pages,
+            # None (JSON null) rather than a ZeroDivisionError when the
+            # document has no pages.
+            "time_per_page": total_time / total_pages if total_pages else None,
+            "total_chars": len(full_text),
+        }
+        with open("results/mini_benchmark_results_pypdf2.json", "w") as f:
+            json.dump(experiment_results, f)
+
+        # Export files (page numbers are 1-based)
+        pages = [
+            {"page": number, "text": text}
+            for number, text in enumerate(page_texts, start=1)
+        ]
+
+        doc_json = {"num_pages": total_pages, "pages": pages}
+        with open(f"results/json/{doc_name}_pypdf2.json", "w", encoding="utf-8") as f:
+            json.dump(doc_json, f, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    # Script entry point: run the two benchmarks back to back, Docling first
+    # and PyPDF2 second.
+    for benchmark in (run_mini_benchmark_docling, run_mini_benchmark_pypdf2):
+        benchmark()
diff --git a/eu_fact_force/exploration/docling/docs/1-s2.0-S2352250X23001574-main.pdf b/eu_fact_force/exploration/docling/docs/1-s2.0-S2352250X23001574-main.pdf
diff --git a/eu_fact_force/exploration/docling/docs/40359_2023_Article_1210.pdf b/eu_fact_force/exploration/docling/docs/40359_2023_Article_1210.pdf
diff --git a/eu_fact_force/exploration/docling/docs/jhab032.pdf b/eu_fact_force/exploration/docling/docs/jhab032.pdf