From ed2fe1d1c54b84a62612e9c4832d73258c321a3a Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 2 Feb 2026 18:35:13 -0600 Subject: [PATCH 1/6] feat: use pypdfium2 to split pdf instead of pypdf --- pyproject.toml | 1 + .../_hooks/custom/split_pdf_hook.py | 92 +++++++++++-------- 2 files changed, 56 insertions(+), 37 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8d2c879f..33848792 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "httpx >=0.27.0", "pydantic >=2.11.2", "pypdf >= 6.2.0", + "pypdfium2 >= 5.0.0", "requests-toolbelt >=1.0.0", ] diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 524e36cc..712c5573 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -13,11 +13,13 @@ from functools import partial from pathlib import Path from typing import Any, Coroutine, Optional, Tuple, Union, cast, Generator, BinaryIO +from threading import Lock import aiofiles import httpx from httpx import AsyncClient from pypdf import PdfReader, PdfWriter +import pypdfium2 as pdfium from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME @@ -488,23 +490,30 @@ def _get_pdf_chunks_in_memory( The list of temporary file paths. 
""" - offset = page_start - 1 - offset_end = page_end or len(pdf.pages) + with pdfium.PdfDocument(pdf_bytes) as pdf: - chunk_no = 0 - while offset < offset_end: - chunk_no += 1 - new_pdf = PdfWriter() - chunk_buffer = io.BytesIO() + offset = page_start - 1 + offset_end = page_end if page_end else len(pdf) - end = min(offset + split_size, offset_end) + while offset < offset_end: + end = min(offset + split_size, offset_end) - for page in list(pdf.pages[offset:end]): - new_pdf.add_page(page) - new_pdf.write(chunk_buffer) - chunk_buffer.seek(0) - yield chunk_buffer, offset - offset += split_size + # Create new PDF + new_pdf = pdfium.PdfDocument.new() + + # Import pages + page_indices = list(range(offset, end)) + new_pdf.import_pages(pdf, pages=page_indices) + + # Save to buffer + chunk_buffer = io.BytesIO() + new_pdf.save(chunk_buffer) + chunk_buffer.seek(0) + + new_pdf.close() + + yield chunk_buffer, offset + offset += split_size def _get_pdf_chunk_paths( self, @@ -530,30 +539,39 @@ def _get_pdf_chunk_paths( The list of temporary file paths. 
""" - offset = page_start - 1 - offset_end = page_end or len(pdf.pages) + with pdfium.PdfDocument(pdf_bytes) as pdf: + offset = page_start - 1 + offset_end = page_end if page_end else len(pdf) - tempdir = tempfile.TemporaryDirectory( # pylint: disable=consider-using-with - dir=self.cache_tmp_data_dir, - prefix="unstructured_client_" - ) - self.tempdirs[operation_id] = tempdir - tempdir_path = Path(tempdir.name) - pdf_chunk_paths: list[Tuple[Path, int]] = [] - chunk_no = 0 - while offset < offset_end: - chunk_no += 1 - new_pdf = PdfWriter() - - end = min(offset + split_size, offset_end) - - for page in list(pdf.pages[offset:end]): - new_pdf.add_page(page) - with open(tempdir_path / f"chunk_{chunk_no}.pdf", "wb") as pdf_chunk: - new_pdf.write(pdf_chunk) - pdf_chunk_paths.append((Path(pdf_chunk.name), offset)) - offset += split_size - return pdf_chunk_paths + # Create temporary directory + tempdir = tempfile.TemporaryDirectory( # pylint: disable=consider-using-with + dir=self.cache_tmp_data_dir, + prefix="unstructured_client_" + ) + self.tempdirs[operation_id] = tempdir + tempdir_path = Path(tempdir.name) + + pdf_chunk_paths: list[Tuple[Path, int]] = [] + chunk_no = 0 + + while offset < offset_end: + chunk_no += 1 + end = min(offset + split_size, offset_end) + + # Create new PDF with selected pages + new_pdf = pdfium.PdfDocument.new() + page_indices = list(range(offset, end)) + new_pdf.import_pages(pdf, pages=page_indices) + + # Save to file + chunk_path = tempdir_path / f"chunk_{chunk_no}.pdf" + new_pdf.save(str(chunk_path)) # Convert Path to string + new_pdf.close() + + pdf_chunk_paths.append((chunk_path, offset)) + offset += split_size + + return pdf_chunk_paths def _get_pdf_chunk_files( self, pdf_chunks: list[Tuple[Path, int]] From 164eaee5b2f167b328dd91a8489b47a5c2cec8e4 Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 2 Feb 2026 18:41:20 -0600 Subject: [PATCH 2/6] remove unused import --- src/unstructured_client/_hooks/custom/split_pdf_hook.py | 1 - 1 file 
changed, 1 deletion(-) diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 712c5573..59cc8bce 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -13,7 +13,6 @@ from functools import partial from pathlib import Path from typing import Any, Coroutine, Optional, Tuple, Union, cast, Generator, BinaryIO -from threading import Lock import aiofiles import httpx From 3ca2839b0335c047401e2e0cbee3b6be030a3615 Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 2 Feb 2026 18:58:02 -0600 Subject: [PATCH 3/6] add deps --- poetry.lock | 38 +++++++++++++++++-- pyproject.toml | 2 +- .../_hooks/custom/split_pdf_hook.py | 12 +++--- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/poetry.lock b/poetry.lock index 09d31f3d..772d6bae 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.2 and should not be changed by hand. 
[[package]] name = "aiofiles" @@ -959,7 +959,7 @@ files = [ ] [package.dependencies] -astroid = ">=3.2.2,<=3.3.0-dev0" +astroid = ">=3.2.2,<=3.3.0.dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ {version = ">=0.2", markers = "python_version < \"3.11\""}, @@ -1000,6 +1000,38 @@ docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] full = ["Pillow (>=8.0.0)", "cryptography"] image = ["Pillow (>=8.0.0)"] +[[package]] +name = "pypdfium2" +version = "5.3.0" +description = "Python bindings to PDFium" +optional = false +python-versions = ">=3.6" +groups = ["main"] +files = [ + {file = "pypdfium2-5.3.0-py3-none-android_23_arm64_v8a.whl", hash = "sha256:885df6c78d41600cb086dc0c76b912d165b5bd6931ca08138329ea5a991b3540"}, + {file = "pypdfium2-5.3.0-py3-none-android_23_armeabi_v7a.whl", hash = "sha256:6e53dee6b333ee77582499eff800300fb5aa0c7eb8f52f95ccb5ca35ebc86d48"}, + {file = "pypdfium2-5.3.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:ce4466bdd62119fe25a5f74d107acc9db8652062bf217057630c6ff0bb419523"}, + {file = "pypdfium2-5.3.0-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:cc2647fd03db42b8a56a8835e8bc7899e604e2042cd6fedeea53483185612907"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35e205f537ddb4069e4b4e22af7ffe84fcf2d686c3fee5e5349f73268a0ef1ca"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b5795298f44050797ac030994fc2525ea35d2d714efe70058e0ee22e5f613f27"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7cd43dfceb77137e69e74c933d41506da1dddaff70f3a794fb0ad0d73e90d75"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5956867558fd3a793e58691cf169718864610becb765bfe74dd83f05cbf1ae3"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:b3ff1071e9a782625822658dfe6e29e3a644a66960f8713bb17819f5a0ac5987"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_27_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f319c46ead49d289ab8c1ed2ea63c91e684f35bdc4cf4dc52191c441182ac481"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6dc67a186da0962294321cace6ccc0a4d212dbc5e9522c640d35725a812324b8"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:0ad0afd3d2b5b54d86287266fd6ae3fef0e0a1a3df9d2c4984b3e3f8f70e6330"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1afe35230dc3951b3e79b934c0c35a2e79e2372d06503fce6cf1926d2a816f47"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:00385793030cadce08469085cd21b168fd8ff981b009685fef3103bdc5fc4686"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:d911e82676398949697fef80b7f412078df14d725a91c10e383b727051530285"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:ca1dc625ed347fac3d9002a3ed33d521d5803409bd572e7b3f823c12ab2ef58f"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:ea4f9db2d3575f22cd41f4c7a855240ded842f135e59a961b5b1351a65ce2b6e"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:0ea24409613df350223c6afc50911c99dca0d43ddaf2616c5a1ebdffa3e1bcb5"}, + {file = "pypdfium2-5.3.0-py3-none-win32.whl", hash = "sha256:5bf695d603f9eb8fdd7c1786add5cf420d57fbc81df142ed63c029ce29614df9"}, + {file = "pypdfium2-5.3.0-py3-none-win_amd64.whl", hash = "sha256:8365af22a39d4373c265f8e90e561cd64d4ddeaf5e6a66546a8caed216ab9574"}, + {file = "pypdfium2-5.3.0-py3-none-win_arm64.whl", hash = "sha256:0b2c6bf825e084d91d34456be54921da31e9199d9530b05435d69d1a80501a12"}, + {file = "pypdfium2-5.3.0.tar.gz", hash = "sha256:2873ffc95fcb01f329257ebc64a5fdce44b36447b6b171fe62f7db5dc3269885"}, +] + [[package]] name = "pytest" version = 
"8.4.2" @@ -1368,4 +1400,4 @@ test = ["aiohttp (>=3.10.5)", "flake8 (>=6.1,<7.0)", "mypy (>=0.800)", "psutil", [metadata] lock-version = "2.1" python-versions = ">=3.9.2" -content-hash = "0d9a1d9d32b8e25de1671f5e7054443cca1565b8ad5c3e56027c69669ba6b3af" +content-hash = "0eb1752526d57b4ac4366653d6aa9b4c6c79f9b37f85a5fea9c48101b5e09b79" diff --git a/pyproject.toml b/pyproject.toml index 33848792..4ac72016 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "unstructured-client" -version = "0.42.9" +version = "0.42.10" description = "Python Client SDK for Unstructured API" authors = [{ name = "Unstructured" },] readme = "README-PYPI.md" diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 59cc8bce..3caf7b99 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -18,7 +18,7 @@ import httpx from httpx import AsyncClient from pypdf import PdfReader, PdfWriter -import pypdfium2 as pdfium +import pypdfium2 as pdfium # type: ignore[import-untyped] from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME @@ -350,9 +350,11 @@ def before_request( pdf = self._trim_large_pages(pdf, form_data) + pdf_file.seek(0) + pdf_bytes = pdf_file.read() if self.cache_tmp_data_feature: pdf_chunk_paths = self._get_pdf_chunk_paths( - pdf, + pdf_bytes, operation_id=operation_id, split_size=split_size, page_start=page_range_start, @@ -363,7 +365,7 @@ def before_request( pdf_chunks = self._get_pdf_chunk_files(pdf_chunk_paths) else: pdf_chunks = self._get_pdf_chunks_in_memory( - pdf, + pdf_bytes, split_size=split_size, page_start=page_range_start, page_end=page_range_end @@ -468,7 +470,7 @@ def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfRea def _get_pdf_chunks_in_memory( 
self, - pdf: PdfReader, + pdf_bytes: bytes, split_size: int = 1, page_start: int = 1, page_end: Optional[int] = None @@ -516,7 +518,7 @@ def _get_pdf_chunks_in_memory( def _get_pdf_chunk_paths( self, - pdf: PdfReader, + pdf_bytes: bytes, operation_id: str, split_size: int = 1, page_start: int = 1, From 5ad23d790f1f8fa499e69de91c3d11d28b84825d Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 2 Feb 2026 19:28:41 -0600 Subject: [PATCH 4/6] update gen.yaml --- gen.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gen.yaml b/gen.yaml index 1f073287..4bc280dd 100644 --- a/gen.yaml +++ b/gen.yaml @@ -23,7 +23,7 @@ generation: schemas: allOfMergeStrategy: shallowMerge python: - version: 0.42.9 + version: 0.42.10 additionalDependencies: dev: deepdiff: '>=6.0' @@ -40,6 +40,7 @@ python: cryptography: '>=3.1' httpx: '>=0.27.0' pypdf: '>= 6.2.0' + pypdfium2: '>= 5.0.0' requests-toolbelt: '>=1.0.0' allowedRedefinedBuiltins: - id From 8cd5af882cd854298d33d0e76493ea356efaccba Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 2 Feb 2026 20:24:48 -0600 Subject: [PATCH 5/6] check type before read --- src/unstructured_client/_hooks/custom/split_pdf_hook.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 3caf7b99..8be32895 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -350,8 +350,12 @@ def before_request( pdf = self._trim_large_pages(pdf, form_data) - pdf_file.seek(0) - pdf_bytes = pdf_file.read() + if isinstance(pdf_file, bytes): + pdf_bytes = pdf_file + else: + pdf_file.seek(0) + pdf_bytes = pdf_file.read() + if self.cache_tmp_data_feature: pdf_chunk_paths = self._get_pdf_chunk_paths( pdf_bytes, From 0cdb3383e9ba4c4e7ef60d23defdcf6a9d3def0c Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 2 Feb 2026 21:48:04 -0600 Subject: [PATCH 
6/6] use trimmed pdf --- src/unstructured_client/_hooks/custom/split_pdf_hook.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 8be32895..605a1f54 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -350,11 +350,8 @@ def before_request( pdf = self._trim_large_pages(pdf, form_data) - if isinstance(pdf_file, bytes): - pdf_bytes = pdf_file - else: - pdf_file.seek(0) - pdf_bytes = pdf_file.read() + pdf.stream.seek(0) + pdf_bytes = pdf.stream.read() if self.cache_tmp_data_feature: pdf_chunk_paths = self._get_pdf_chunk_paths(