diff --git a/gen.yaml b/gen.yaml index 1f073287..4bc280dd 100644 --- a/gen.yaml +++ b/gen.yaml @@ -23,7 +23,7 @@ generation: schemas: allOfMergeStrategy: shallowMerge python: - version: 0.42.9 + version: 0.42.10 additionalDependencies: dev: deepdiff: '>=6.0' @@ -40,6 +40,7 @@ python: cryptography: '>=3.1' httpx: '>=0.27.0' pypdf: '>= 6.2.0' + pypdfium2: '>= 5.0.0' requests-toolbelt: '>=1.0.0' allowedRedefinedBuiltins: - id diff --git a/poetry.lock b/poetry.lock index 09d31f3d..772d6bae 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.2 and should not be changed by hand. [[package]] name = "aiofiles" @@ -959,7 +959,7 @@ files = [ ] [package.dependencies] -astroid = ">=3.2.2,<=3.3.0-dev0" +astroid = ">=3.2.2,<=3.3.0.dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ {version = ">=0.2", markers = "python_version < \"3.11\""}, @@ -1000,6 +1000,38 @@ docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] full = ["Pillow (>=8.0.0)", "cryptography"] image = ["Pillow (>=8.0.0)"] +[[package]] +name = "pypdfium2" +version = "5.3.0" +description = "Python bindings to PDFium" +optional = false +python-versions = ">=3.6" +groups = ["main"] +files = [ + {file = "pypdfium2-5.3.0-py3-none-android_23_arm64_v8a.whl", hash = "sha256:885df6c78d41600cb086dc0c76b912d165b5bd6931ca08138329ea5a991b3540"}, + {file = "pypdfium2-5.3.0-py3-none-android_23_armeabi_v7a.whl", hash = "sha256:6e53dee6b333ee77582499eff800300fb5aa0c7eb8f52f95ccb5ca35ebc86d48"}, + {file = "pypdfium2-5.3.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:ce4466bdd62119fe25a5f74d107acc9db8652062bf217057630c6ff0bb419523"}, + {file = "pypdfium2-5.3.0-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:cc2647fd03db42b8a56a8835e8bc7899e604e2042cd6fedeea53483185612907"}, + {file = 
"pypdfium2-5.3.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35e205f537ddb4069e4b4e22af7ffe84fcf2d686c3fee5e5349f73268a0ef1ca"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b5795298f44050797ac030994fc2525ea35d2d714efe70058e0ee22e5f613f27"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7cd43dfceb77137e69e74c933d41506da1dddaff70f3a794fb0ad0d73e90d75"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5956867558fd3a793e58691cf169718864610becb765bfe74dd83f05cbf1ae3"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3ff1071e9a782625822658dfe6e29e3a644a66960f8713bb17819f5a0ac5987"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_27_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f319c46ead49d289ab8c1ed2ea63c91e684f35bdc4cf4dc52191c441182ac481"}, + {file = "pypdfium2-5.3.0-py3-none-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6dc67a186da0962294321cace6ccc0a4d212dbc5e9522c640d35725a812324b8"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:0ad0afd3d2b5b54d86287266fd6ae3fef0e0a1a3df9d2c4984b3e3f8f70e6330"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1afe35230dc3951b3e79b934c0c35a2e79e2372d06503fce6cf1926d2a816f47"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:00385793030cadce08469085cd21b168fd8ff981b009685fef3103bdc5fc4686"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:d911e82676398949697fef80b7f412078df14d725a91c10e383b727051530285"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:ca1dc625ed347fac3d9002a3ed33d521d5803409bd572e7b3f823c12ab2ef58f"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_s390x.whl", hash = 
"sha256:ea4f9db2d3575f22cd41f4c7a855240ded842f135e59a961b5b1351a65ce2b6e"}, + {file = "pypdfium2-5.3.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:0ea24409613df350223c6afc50911c99dca0d43ddaf2616c5a1ebdffa3e1bcb5"}, + {file = "pypdfium2-5.3.0-py3-none-win32.whl", hash = "sha256:5bf695d603f9eb8fdd7c1786add5cf420d57fbc81df142ed63c029ce29614df9"}, + {file = "pypdfium2-5.3.0-py3-none-win_amd64.whl", hash = "sha256:8365af22a39d4373c265f8e90e561cd64d4ddeaf5e6a66546a8caed216ab9574"}, + {file = "pypdfium2-5.3.0-py3-none-win_arm64.whl", hash = "sha256:0b2c6bf825e084d91d34456be54921da31e9199d9530b05435d69d1a80501a12"}, + {file = "pypdfium2-5.3.0.tar.gz", hash = "sha256:2873ffc95fcb01f329257ebc64a5fdce44b36447b6b171fe62f7db5dc3269885"}, +] + [[package]] name = "pytest" version = "8.4.2" @@ -1368,4 +1400,4 @@ test = ["aiohttp (>=3.10.5)", "flake8 (>=6.1,<7.0)", "mypy (>=0.800)", "psutil", [metadata] lock-version = "2.1" python-versions = ">=3.9.2" -content-hash = "0d9a1d9d32b8e25de1671f5e7054443cca1565b8ad5c3e56027c69669ba6b3af" +content-hash = "0eb1752526d57b4ac4366653d6aa9b4c6c79f9b37f85a5fea9c48101b5e09b79" diff --git a/pyproject.toml b/pyproject.toml index 8d2c879f..4ac72016 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "unstructured-client" -version = "0.42.9" +version = "0.42.10" description = "Python Client SDK for Unstructured API" authors = [{ name = "Unstructured" },] readme = "README-PYPI.md" @@ -13,6 +13,7 @@ dependencies = [ "httpx >=0.27.0", "pydantic >=2.11.2", "pypdf >= 6.2.0", + "pypdfium2 >= 5.0.0", "requests-toolbelt >=1.0.0", ] diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 524e36cc..605a1f54 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -18,6 +18,7 @@ import httpx from httpx import AsyncClient from pypdf import PdfReader, PdfWriter +import 
pypdfium2 as pdfium # type: ignore[import-untyped] from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME @@ -349,9 +350,12 @@ def before_request( pdf = self._trim_large_pages(pdf, form_data) + pdf.stream.seek(0) + pdf_bytes = pdf.stream.read() + if self.cache_tmp_data_feature: pdf_chunk_paths = self._get_pdf_chunk_paths( - pdf, + pdf_bytes, operation_id=operation_id, split_size=split_size, page_start=page_range_start, @@ -362,7 +366,7 @@ def before_request( pdf_chunks = self._get_pdf_chunk_files(pdf_chunk_paths) else: pdf_chunks = self._get_pdf_chunks_in_memory( - pdf, + pdf_bytes, split_size=split_size, page_start=page_range_start, page_end=page_range_end @@ -467,7 +471,7 @@ def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfRea def _get_pdf_chunks_in_memory( self, - pdf: PdfReader, + pdf_bytes: bytes, split_size: int = 1, page_start: int = 1, page_end: Optional[int] = None @@ -488,27 +492,34 @@ def _get_pdf_chunks_in_memory( The list of temporary file paths. 
""" - offset = page_start - 1 - offset_end = page_end or len(pdf.pages) + with pdfium.PdfDocument(pdf_bytes) as pdf: + + offset = page_start - 1 + offset_end = page_end if page_end else len(pdf) + + while offset < offset_end: + end = min(offset + split_size, offset_end) + + # Create new PDF + new_pdf = pdfium.PdfDocument.new() - chunk_no = 0 - while offset < offset_end: - chunk_no += 1 - new_pdf = PdfWriter() - chunk_buffer = io.BytesIO() + # Import pages + page_indices = list(range(offset, end)) + new_pdf.import_pages(pdf, pages=page_indices) - end = min(offset + split_size, offset_end) + # Save to buffer + chunk_buffer = io.BytesIO() + new_pdf.save(chunk_buffer) + chunk_buffer.seek(0) - for page in list(pdf.pages[offset:end]): - new_pdf.add_page(page) - new_pdf.write(chunk_buffer) - chunk_buffer.seek(0) - yield chunk_buffer, offset - offset += split_size + new_pdf.close() + + yield chunk_buffer, offset + offset += split_size def _get_pdf_chunk_paths( self, - pdf: PdfReader, + pdf_bytes: bytes, operation_id: str, split_size: int = 1, page_start: int = 1, @@ -530,30 +541,39 @@ def _get_pdf_chunk_paths( The list of temporary file paths. 
""" - offset = page_start - 1 - offset_end = page_end or len(pdf.pages) + with pdfium.PdfDocument(pdf_bytes) as pdf: + offset = page_start - 1 + offset_end = page_end if page_end else len(pdf) - tempdir = tempfile.TemporaryDirectory( # pylint: disable=consider-using-with - dir=self.cache_tmp_data_dir, - prefix="unstructured_client_" - ) - self.tempdirs[operation_id] = tempdir - tempdir_path = Path(tempdir.name) - pdf_chunk_paths: list[Tuple[Path, int]] = [] - chunk_no = 0 - while offset < offset_end: - chunk_no += 1 - new_pdf = PdfWriter() - - end = min(offset + split_size, offset_end) - - for page in list(pdf.pages[offset:end]): - new_pdf.add_page(page) - with open(tempdir_path / f"chunk_{chunk_no}.pdf", "wb") as pdf_chunk: - new_pdf.write(pdf_chunk) - pdf_chunk_paths.append((Path(pdf_chunk.name), offset)) - offset += split_size - return pdf_chunk_paths + # Create temporary directory + tempdir = tempfile.TemporaryDirectory( # pylint: disable=consider-using-with + dir=self.cache_tmp_data_dir, + prefix="unstructured_client_" + ) + self.tempdirs[operation_id] = tempdir + tempdir_path = Path(tempdir.name) + + pdf_chunk_paths: list[Tuple[Path, int]] = [] + chunk_no = 0 + + while offset < offset_end: + chunk_no += 1 + end = min(offset + split_size, offset_end) + + # Create new PDF with selected pages + new_pdf = pdfium.PdfDocument.new() + page_indices = list(range(offset, end)) + new_pdf.import_pages(pdf, pages=page_indices) + + # Save to file + chunk_path = tempdir_path / f"chunk_{chunk_no}.pdf" + new_pdf.save(str(chunk_path)) # Convert Path to string + new_pdf.close() + + pdf_chunk_paths.append((chunk_path, offset)) + offset += split_size + + return pdf_chunk_paths def _get_pdf_chunk_files( self, pdf_chunks: list[Tuple[Path, int]]