From 8fb7a53f0e2d0fa85ccdc6dd7ccda44dec6ee0d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Mon, 12 Jan 2026 12:05:47 +0100
Subject: [PATCH 01/36] GH-48827: [CI][Python] Add required xz dependency to emscripten dockerfile (#48828)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

The emscripten job has been failing in the nightly builds.

### What changes are included in this PR?

Install dependencies slightly earlier in the Dockerfile and add xz, which is now required by `install_emscripten.sh`.

### Are these changes tested?

Yes, via archery.

### Are there any user-facing changes?

No

* GitHub Issue: #48827

Authored-by: Raúl Cumplido
Signed-off-by: Raúl Cumplido
---
 ci/docker/conda-python-emscripten.dockerfile | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/ci/docker/conda-python-emscripten.dockerfile b/ci/docker/conda-python-emscripten.dockerfile
index 47ff550cd59..878f918710f 100644
--- a/ci/docker/conda-python-emscripten.dockerfile
+++ b/ci/docker/conda-python-emscripten.dockerfile
@@ -39,6 +39,11 @@ RUN python -m pip install --no-cache-dir selenium==${selenium_version} && \
 RUN pyodide_dist_url="https://github.com/pyodide/pyodide/releases/download/${pyodide_version}/pyodide-${pyodide_version}.tar.bz2" && \
     wget -q "${pyodide_dist_url}" -O- | tar -xj -C /
 
+# install node 20 (needed for async call support)
+# and pthread-stubs for build, and unzip needed for chrome build to work
+# xz is needed by emsdk to extract node tarballs
+RUN conda install nodejs=20 unzip pthread-stubs make xz -c conda-forge
+
 # install correct version of emscripten for this pyodide
 COPY ci/scripts/install_emscripten.sh /arrow/ci/scripts/
 RUN bash /arrow/ci/scripts/install_emscripten.sh ~ /pyodide
@@ -46,10 +51,6 @@ RUN bash /arrow/ci/scripts/install_emscripten.sh ~ /pyodide
 # make sure zlib is cached in the EMSDK folder
 RUN source ~/emsdk/emsdk_env.sh && embuilder --pic build zlib
 
-# install node 20 (needed for async call support)
-# and pthread-stubs for build, and unzip needed for chrome build to work
-RUN conda install nodejs=20 unzip pthread-stubs make -c conda-forge
-
 # install chrome for testing browser based runner
 COPY ci/scripts/install_chromedriver.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_chromedriver.sh "${chrome_version}"

From b6362d09204136dbf2167fee8ec8ec2af0efe42d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Mon, 12 Jan 2026 12:12:07 +0100
Subject: [PATCH 02/36] GH-48582: [CI][GPU][C++][Python] Add new CUDA jobs using the new self-hosted runners (#48583)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

The CUDA jobs stopped working when the Voltron Data infrastructure went down. Together with ASF Infra, we have set up a [runs-on](https://runs-on.com/runners/gpu/) solution to run CUDA runners.

### What changes are included in this PR?

Add a new workflow, `cuda_extra.yml`, with CI jobs that use the runs-on CUDA runners. Because the underlying instances have CUDA 12.9, the jobs to be run are:

- AMD64 Ubuntu 22 CUDA 11.7.1
- AMD64 Ubuntu 24 CUDA 12.9.0
- AMD64 Ubuntu 22 CUDA 11.7.1 Python
- AMD64 Ubuntu 24 CUDA 12.9.0 Python

A follow-up issue has been created to add jobs for CUDA 13, see: https://github.com/apache/arrow/issues/48783

A new label `CI: Extra: CUDA` has also been created.

### Are these changes tested?

Yes, via CI.

### Are there any user-facing changes?
No * GitHub Issue: #48582 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- .github/workflows/cuda_extra.yml | 136 +++++++++++++++++++++++++ dev/tasks/docker-tests/github.cuda.yml | 52 ---------- dev/tasks/tasks.yml | 26 ----- 3 files changed, 136 insertions(+), 78 deletions(-) create mode 100644 .github/workflows/cuda_extra.yml delete mode 100644 dev/tasks/docker-tests/github.cuda.yml diff --git a/.github/workflows/cuda_extra.yml b/.github/workflows/cuda_extra.yml new file mode 100644 index 00000000000..1700d6a8456 --- /dev/null +++ b/.github/workflows/cuda_extra.yml @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: CUDA Extra + +on: + push: + tags: + - '**' + pull_request: + types: + - labeled + - opened + - reopened + - synchronize + schedule: + - cron: | + 0 6 * * * + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +permissions: + actions: read + contents: read + pull-requests: read + +jobs: + check-labels: + if: github.event_name != 'schedule' || github.repository == 'apache/arrow' + uses: ./.github/workflows/check_labels.yml + secrets: inherit + with: + parent-workflow: cuda_extra + + docker: + needs: check-labels + name: ${{ matrix.title }} + runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64/spot=capacity-optimized" + if: >- + needs.check-labels.outputs.force == 'true' || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra') || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra: CUDA') + timeout-minutes: 75 + strategy: + fail-fast: false + matrix: + include: + - cuda: 12.9.0 + ubuntu: 24.04 + image: ubuntu-cuda-cpp + title: AMD64 Ubuntu 24 CUDA 12.9.0 + - cuda: 11.7.1 + ubuntu: 22.04 + image: ubuntu-cuda-cpp + title: AMD64 Ubuntu 22 CUDA 11.7.1 + - cuda: 12.9.0 + ubuntu: 24.04 + image: ubuntu-cuda-python + title: AMD64 Ubuntu 24 CUDA 12.9.0 Python + - cuda: 11.7.1 + ubuntu: 22.04 + image: ubuntu-cuda-python + title: AMD64 Ubuntu 22 CUDA 11.7.1 Python + env: + ARCHERY_DEBUG: 1 + ARROW_ENABLE_TIMING_TESTS: OFF + DOCKER_VOLUME_PREFIX: ".docker/" + steps: + - name: Checkout Arrow + uses: actions/checkout@v6 + with: + fetch-depth: 0 + submodules: recursive + - name: Cache Docker Volumes + uses: actions/cache@v5 + with: + path: .docker + key: extra-${{ matrix.image }}-${{ hashFiles('cpp/**') }} + restore-keys: extra-${{ matrix.image }}- + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: 3 + - name: Setup Archery + run: python3 -m pip install -e dev/archery[docker] + - name: Display NVIDIA SMI details + run: | + nvidia-smi + nvidia-smi -L + nvidia-smi -q -d Memory + - name: Execute Docker Build + 
continue-on-error: ${{ matrix.continue-on-error || false }} + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + CUDA: ${{ matrix.cuda }} + UBUNTU: ${{ matrix.ubuntu }} + run: | + # GH-40558: reduce ASLR to avoid ASAN/LSAN crashes + sudo sysctl -w vm.mmap_rnd_bits=28 + source ci/scripts/util_enable_core_dumps.sh + archery docker run ${{ matrix.run-options || '' }} ${{ matrix.image }} + - name: Docker Push + if: >- + success() && + github.event_name == 'push' && + github.repository == 'apache/arrow' && + github.ref_name == 'main' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true + run: archery docker push ${{ matrix.image }} + + report-extra-cpp: + if: github.event_name == 'schedule' && always() + needs: + - docker + uses: ./.github/workflows/report_ci.yml + secrets: inherit diff --git a/dev/tasks/docker-tests/github.cuda.yml b/dev/tasks/docker-tests/github.cuda.yml deleted file mode 100644 index e65ac457b2e..00000000000 --- a/dev/tasks/docker-tests/github.cuda.yml +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-
-{% import 'macros.jinja' as macros with context %}
-
-{{ macros.github_header() }}
-
-jobs:
-  test:
-    name: |
-      Docker Test {{ flags|default("") }} {{ image }} {{ command|default("") }}
-    runs-on: ['self-hosted', 'cuda']
-{{ macros.github_set_env(env) }}
-    timeout-minutes: {{ timeout|default(60) }}
-    steps:
-      {{ macros.github_checkout_arrow(fetch_depth=fetch_depth|default(1))|indent }}
-      # python 3.10 is installed on the runner, no need to install
-      - name: Install pip
-        run: sudo apt update && sudo apt install python3-pip -y
-      - name: Install archery
-        run: python3 -m pip install -e arrow/dev/archery[docker]
-      - name: Execute Docker Build
-        shell: bash
-        env:
-        {{ macros.github_set_sccache_envvars()|indent(8) }}
-        run: |
-          source arrow/ci/scripts/util_enable_core_dumps.sh
-          archery docker run \
-            -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \
-            {{ flags|default("") }} \
-            {{ image }} \
-            {{ command|default("") }}
-    {% if arrow.is_default_branch() %}
-      {{ macros.github_login_dockerhub()|indent }}
-      - name: Push Docker Image
-        shell: bash
-        run: archery docker push {{ image }}
-    {% endif %}
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index 266073daff6..2667aa1fb5e 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -41,9 +41,6 @@ groups:
 
 {############################# Testing tasks #################################}
 
-  cuda:
-    - test-cuda-*
-
   test:
     - test-*
 
@@ -762,35 +759,12 @@ tasks:
     artifacts:
       - docs.tar.gz
 
-  ############################## CUDA tests #################################
-
-{% for ubuntu, cuda in [("22.04", "11.7.1"), ("24.04", "13.0.2")] %}
-  test-cuda-cpp-ubuntu-{{ ubuntu }}-cuda-{{ cuda }}:
-    ci: github
-    template: docker-tests/github.cuda.yml
-    params:
-      env:
-        CUDA: {{ cuda }}
-        UBUNTU: {{ ubuntu }}
-      image: ubuntu-cuda-cpp
-
-  test-cuda-python-ubuntu-{{ ubuntu }}-cuda-{{ cuda }}:
-    ci: github
-    template: docker-tests/github.cuda.yml
-    params:
-      env:
-        CUDA: {{ cuda }}
-        UBUNTU: {{ ubuntu }}
-      image: ubuntu-cuda-python
-{% endfor %}
-
   ############################## Fuzz tests #################################
 
   test-build-cpp-fuzz:
     ci: github
     template: fuzz-tests/github.oss-fuzz.yml
 
-
   ############################## vcpkg tests ##################################
 
   test-build-vcpkg-win:

From e33512d1d82c28e753004d0d7a76c1dca542b1cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Tue, 13 Jan 2026 13:01:18 +0100
Subject: [PATCH 03/36] GH-48838: [Release] Use gh cli to download sources for Linux packages and publish draft release before verification (#48839)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

With the change we made for immutable releases, we require draft releases to be able to keep uploading artifacts during the release process. This means that the interim URL for downloading assets isn't the one some of our scripts expect.

### What changes are included in this PR?

Update the `download_rc_archive` task to use the GitHub CLI instead of manually building the download URL for the source tar.gz from the release.

Update the order of the release scripts to publish the release before running the verification tasks, so that the URL is the final one.

### Are these changes tested?

I have manually tested both the `gh release download` call and that the final URL will be the expected one once we move from draft to published release.
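For reference, a minimal sketch of the gh-based download that the Rakefile change performs (the tag and file names below are illustrative placeholders, not taken from an actual release):

```console
# Hypothetical example: fetch the de-rc'd source archive from the (possibly
# still draft) RC release via the GitHub CLI, then restore the rc-suffixed
# name that the packaging tasks expect locally.
gh release download apache-arrow-23.0.0-rc0 \
  --repo apache/arrow \
  --pattern 'apache-arrow-23.0.0.tar.gz' \
  --clobber
mv apache-arrow-23.0.0.tar.gz apache-arrow-23.0.0-rc0.tar.gz
```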
I've tested creating a new release on my own fork here: https://github.com/raulcd/arrow/releases/tag/test-release-rc2

### Are there any user-facing changes?

No

* GitHub Issue: #48838

Authored-by: Raúl Cumplido
Signed-off-by: Raúl Cumplido
---
 .pre-commit-config.yaml                              |  2 +-
 ...ublish-gh-release.sh => 07-publish-gh-release.sh} |  0
 .../{07-binary-verify.sh => 08-binary-verify.sh}     |  0
 dev/tasks/linux-packages/apache-arrow/Rakefile       | 12 ++++++++----
 docs/source/developers/release.rst                   | 12 ++++++++----
 5 files changed, 17 insertions(+), 9 deletions(-)
 rename dev/release/{08-publish-gh-release.sh => 07-publish-gh-release.sh} (100%)
 rename dev/release/{07-binary-verify.sh => 08-binary-verify.sh} (100%)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 186277edf40..da84abed0d9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -353,7 +353,7 @@ repos:
           ?^cpp/examples/minimal_build/run\.sh$|
           ?^cpp/examples/tutorial_examples/run\.sh$|
           ?^dev/release/05-binary-upload\.sh$|
-          ?^dev/release/07-binary-verify\.sh$|
+          ?^dev/release/08-binary-verify\.sh$|
           ?^dev/release/binary-recover\.sh$|
           ?^dev/release/post-03-binary\.sh$|
           ?^dev/release/post-08-docs\.sh$|
diff --git a/dev/release/08-publish-gh-release.sh b/dev/release/07-publish-gh-release.sh
similarity index 100%
rename from dev/release/08-publish-gh-release.sh
rename to dev/release/07-publish-gh-release.sh
diff --git a/dev/release/07-binary-verify.sh b/dev/release/08-binary-verify.sh
similarity index 100%
rename from dev/release/07-binary-verify.sh
rename to dev/release/08-binary-verify.sh
diff --git a/dev/tasks/linux-packages/apache-arrow/Rakefile b/dev/tasks/linux-packages/apache-arrow/Rakefile
index 7644d2d23fb..cdb77108452 100644
--- a/dev/tasks/linux-packages/apache-arrow/Rakefile
+++ b/dev/tasks/linux-packages/apache-arrow/Rakefile
@@ -59,11 +59,15 @@ class ApacheArrowPackageTask < PackageTask
   end
 
   def download_rc_archive
-    base_url = "https://github.com/#{github_repository}"
-    base_url += "/releases/download/apache-arrow-#{@version}"
     archive_name_no_rc = @archive_name.gsub(/-rc\d+(\.tar\.gz)\z/, "\\1")
-    url = "#{base_url}/#{archive_name_no_rc}"
-    download(url, @archive_name)
+    sh("gh",
+       "release",
+       "download",
+       "apache-arrow-#{@version}",
+       "--clobber",
+       "--repo", github_repository,
+       "--pattern", archive_name_no_rc)
+    mv(archive_name_no_rc, @archive_name)
   end
 
   def download_released_archive
diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst
index c5efc5f30fc..0ec81c1e6c8 100644
--- a/docs/source/developers/release.rst
+++ b/docs/source/developers/release.rst
@@ -246,7 +246,8 @@ Build source and binaries and submit them
     archery crossbow status
 
     # Download the produced binaries
-    # This will download packages to a directory called packages/release--rc
+    # This will download packages generated from the archery tasks
+    # to a directory called packages/release--rc
     dev/release/04-binary-download.sh
 
     # Sign and upload the binaries
@@ -263,11 +264,14 @@
     # NOTE: You need to have GitHub CLI installed to run this script.
     dev/release/06-matlab-upload.sh
 
+    # Move the Release Candidate GitHub Release from draft to published state
+    # This will update the artifacts download URL which will be available for the
+    # verification step.
+    dev/release/07-publish-gh-release.sh
+
     # Start verifications for binaries and wheels
-    dev/release/07-binary-verify.sh
+    dev/release/08-binary-verify.sh
 
-    # Move the Release Candidate GitHub Release from draft to published state
-    dev/release/08-publish-gh-release.sh
 
 Verify the Release
 ------------------

From 45781e854cc9248013349decc4639644871d9f5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Tue, 13 Jan 2026 14:03:11 +0100
Subject: [PATCH 04/36] GH-48841: [Release][Package] Add GH_TOKEN to rake build step on Linux Packaging jobs (#48842)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

With:
- https://github.com/apache/arrow/pull/48839

we use `gh release download`, which requires `GH_TOKEN` to be available.

### What changes are included in this PR?

Add an `env` entry with `GH_TOKEN`. I've validated that Rake's `sh` inherits the environment variables defined in the shell.

### Are these changes tested?

No

### Are there any user-facing changes?

No

* GitHub Issue: #48841

Authored-by: Raúl Cumplido
Signed-off-by: Raúl Cumplido
---
 .github/workflows/package_linux.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/package_linux.yml b/.github/workflows/package_linux.yml
index 3e4b7592153..c59784d7f58 100644
--- a/.github/workflows/package_linux.yml
+++ b/.github/workflows/package_linux.yml
@@ -230,6 +230,8 @@ jobs:
           ${GITHUB_REF_NAME} \
           release_candidate.yml
       - name: Build
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           pushd dev/tasks/linux-packages
           rake docker:pull || :

From 365e5aca5a1ae2cdc31e6bd4ed640bfba00d5cc5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Tue, 13 Jan 2026 14:05:23 +0100
Subject: [PATCH 05/36] MINOR: [Release] Update CHANGELOG.md for 23.0.0

---
 CHANGELOG.md | 352 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 352 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6101f5d3cac..7bd105ebc59 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,356 @@
 
+# Apache Arrow 23.0.0 (2026-01-12 00:00:00+00:00)
+
+## Bug Fixes
+
+* [GH-33473](https://github.com/apache/arrow/issues/33473) - [Python] Fix KeyError on Pandas roundtrip with RangeIndex in MultiIndex (#39983)
+* [GH-35957](https://github.com/apache/arrow/issues/35957) - [C++][Compute] Graceful error for decimal binary arithmetic and comparison instead of firing confusing assertion (#48639)
+* [GH-41246](https://github.com/apache/arrow/issues/41246) - [C++][Python] Simplify nested field encryption configuration (#45462)
+* [GH-42173](https://github.com/apache/arrow/issues/42173) - [R][C++] Writing partitioned dataset on S3 fails if ListBucket is not allowed for the user (#47599)
+* [GH-43660](https://github.com/apache/arrow/issues/43660) - [C++][Compute] Avoid ZeroCopyCastExec when casting Binary offset -> Binary offset types (#48171)
+* [GH-44318](https://github.com/apache/arrow/issues/44318) - [C++][Python] Fix RecordBatch::FromStructArray for sliced arrays with offset = 0 (#47843)
+* [GH-45260](https://github.com/apache/arrow/issues/45260) - [R][Docs] Improve documentation on GCS support
+* [GH-45867](https://github.com/apache/arrow/issues/45867) - [Python] Fix `SetuptoolsDeprecationWarning` (#47141)
+* [GH-46063](https://github.com/apache/arrow/issues/46063) - [C++][Compute] Fix the issue that MinMax kernel emits -inf/inf for all-NaN input (#48459)
+* [GH-46584](https://github.com/apache/arrow/issues/46584) - [C++][FlightRPC] Iterate over
endpoints in ODBC driver (#47991) +* [GH-47000](https://github.com/apache/arrow/issues/47000) - [R] concat_tables on a record_batch causes segfault (#47885) +* [GH-47022](https://github.com/apache/arrow/issues/47022) - [Python] Support unsigned dictionary indices in pandas conversion (#48451) +* [GH-47099](https://github.com/apache/arrow/issues/47099) - [C++][Parquet] Add missing `pragma warning(pop)` to `parquet/platform.h` (#47114) +* [GH-47371](https://github.com/apache/arrow/issues/47371) - , GH-48281: [Python][CI] Fix Numba-CUDA interop (#48284) +* [GH-47559](https://github.com/apache/arrow/issues/47559) - [Python] Fix missing argument in pyarrow fs (#47497) +* [GH-47564](https://github.com/apache/arrow/issues/47564) - [C++] Update expected L2 CPU cache range to 32KiB-64MiB (#47563) +* [GH-47664](https://github.com/apache/arrow/issues/47664) - [C++][Parquet] add num_rows_ before each call to RowGroupWriter::Close in FileSerializer (#47665) +* [GH-47734](https://github.com/apache/arrow/issues/47734) - [Python] Fix hypothesis timedelta bounds for duration/interval types (#48460) +* [GH-47751](https://github.com/apache/arrow/issues/47751) - [CI] Fix check for job to ignore on reporting (#47755) +* [GH-47778](https://github.com/apache/arrow/issues/47778) - [CI][Python] Remove ORC alias timezone for US/Pacific on test_orc.py::test_timezone_absent (#47956) +* [GH-47781](https://github.com/apache/arrow/issues/47781) - [C++] Cleaned up type-limit warning in sink_node.cc (#47782) +* [GH-47807](https://github.com/apache/arrow/issues/47807) - [C++][Compute] Fix the issue that null count is not updated when setting slice on an array span (#47808) +* [GH-47812](https://github.com/apache/arrow/issues/47812) - [R][CI] Fix lint for new version of styler (#47813) +* [GH-47821](https://github.com/apache/arrow/issues/47821) - [CI][Release][R] Fix test repository path in release (#47929) +* [GH-47823](https://github.com/apache/arrow/issues/47823) - [Python] Use PyWeakref_GetRef instead of PyWeakref_GET_OBJECT (Python 3.15) (#48027) +* [GH-47825](https://github.com/apache/arrow/issues/47825) - [C++] Fix the issue that bitmap ops overriding partial leading byte (#47912) +* [GH-47830](https://github.com/apache/arrow/issues/47830) - [Release] Run RC verification source testing step in a subshell (#47831) +* [GH-47836](https://github.com/apache/arrow/issues/47836) - [C++] Fix Meson configuration after bpacking changes (#47837) +* [GH-47840](https://github.com/apache/arrow/issues/47840) - [CI][C++] Check whether the CSV module/thread sanitizer is enabled or not before building example (#47841) +* [GH-47844](https://github.com/apache/arrow/issues/47844) - [CI] Fix unconditionally running extra workflows reporting when there are jobs failing (#47917) +* [GH-47859](https://github.com/apache/arrow/issues/47859) - [C++] Fix creating union types without type_codes for fields.size() == 128 (#47815) +* [GH-47861](https://github.com/apache/arrow/issues/47861) - [Python] reduce memory usage when using to_pandas() with many extension arrays columns (#47860) +* [GH-47883](https://github.com/apache/arrow/issues/47883) - [CI] Add openssl gem explicitly to fix ceriticate validation error on test (#47884) +* [GH-47909](https://github.com/apache/arrow/issues/47909) - [C++] Fix MSVC ARM64 build (#47910) +* [GH-47914](https://github.com/apache/arrow/issues/47914) - [C++] Fix system Apache ORC/Google logging used detection (#47915) +* [GH-47918](https://github.com/apache/arrow/issues/47918) - [Format] Clarify that empty 
compressed buffers can omit the length header (#48541) +* [GH-47919](https://github.com/apache/arrow/issues/47919) - [C++] Update Meson config for C Data Interface changes (#47920) +* [GH-47921](https://github.com/apache/arrow/issues/47921) - [C++] Implement substrait option in Meson (#48016) +* [GH-47923](https://github.com/apache/arrow/issues/47923) - [CI] Use macos-15-intel instead of macos-13 for macOS x86 runner (#47690) +* [GH-47924](https://github.com/apache/arrow/issues/47924) - [C++] Fix issues in CSV reader with invalid inputs (#47925) +* [GH-47927](https://github.com/apache/arrow/issues/47927) - [Release] Fix APT repository metadata generation with new repository (#47928) +* [GH-47932](https://github.com/apache/arrow/issues/47932) - [Release][Python] PyPI rejects our source distribution due to missing LICENSE.txt +* [GH-47933](https://github.com/apache/arrow/issues/47933) - [Release][R] Don't upload *.sha512.{asc,sha512} (#47982) +* [GH-47941](https://github.com/apache/arrow/issues/47941) - [R] Fix codegen.R error from dplyr pipe to base pipe change (#47985) +* [GH-47942](https://github.com/apache/arrow/issues/47942) - [R] CRAN 22.0.0 R package release fails on Winbuilder due to "non-API call to R: 'Rf_lazy_duplicate'" (#47943) +* [GH-47945](https://github.com/apache/arrow/issues/47945) - [C++] Add support for Boost 1.89.0 and require Boost 1.69 or later (#47947) +* [GH-47948](https://github.com/apache/arrow/issues/47948) - [CI][Packaging][Deb] Add missing directory existent check (#47949) +* [GH-47953](https://github.com/apache/arrow/issues/47953) - [C++] Remove Windows inclusion from `int_util_overflow.h` (#47950) +* [GH-47955](https://github.com/apache/arrow/issues/47955) - [C++][Parquet] Support reading INT-encoded Decimal stats as Arrow scalar (#48001) +* [GH-47961](https://github.com/apache/arrow/issues/47961) - [C++] Fix Meson's Boost process version detection (#48017) +* [GH-47964](https://github.com/apache/arrow/issues/47964) - [Docs] Add dcleblanc/SafeInt to the LICENSE.txt file (#47965) +* [GH-47966](https://github.com/apache/arrow/issues/47966) - [Python] PyArrow v22.0 assumes Pandas DataFrame attrs are serializable (#47977) +* [GH-47967](https://github.com/apache/arrow/issues/47967) - [C++] Update Meson Configuration with SafeInt Changes (#47968) +* [GH-47970](https://github.com/apache/arrow/issues/47970) - [CI][C++] Fix a bug that JNI jobs runs nothing (#47972) +* [GH-47973](https://github.com/apache/arrow/issues/47973) - [C++][Parquet] Fix invalid Parquet files written when dictionary encoded pages are large (#47998) +* [GH-47981](https://github.com/apache/arrow/issues/47981) - [C++][Parquet] Add compatibility with non-compliant RLE stream (#47992) +* [GH-47983](https://github.com/apache/arrow/issues/47983) - [CI][R] R nightly upload workflow failing for a few weeks (#47984) +* [GH-48004](https://github.com/apache/arrow/issues/48004) - [C++][Parquet] Fix hang in ColumnReader benchmark (#48005) +* [GH-48010](https://github.com/apache/arrow/issues/48010) - [C++] Update bundled RE2 from 2022-06-01 to 2023-03-01 (#48011) +* [GH-48029](https://github.com/apache/arrow/issues/48029) - [R][CI] R nightly upload workflow failing in pruning step (#48030) +* [GH-48044](https://github.com/apache/arrow/issues/48044) - [Packaging][RPM][Parquet] Don't install `parquet-glib.pc` by `parquet-devel` (#48045) +* [GH-48046](https://github.com/apache/arrow/issues/48046) - [Docs][C++] Clarify "Exporting Tracing Information" section in OTel docs (#48047) +* 
[GH-48057](https://github.com/apache/arrow/issues/48057) - [R] Slow reading performance caused by apply_arrow_r_metadata() looping through all columns, including NULL ones (#48104) +* [GH-48062](https://github.com/apache/arrow/issues/48062) - [C++] Fix null pointer dereference in MakeExecBatch (#48063) +* [GH-48064](https://github.com/apache/arrow/issues/48064) - [C++] Set ARROW_BUILD_STATIC=ON when features-flight are enabled on CMake presets (#48065) +* [GH-48076](https://github.com/apache/arrow/issues/48076) - [C++][Flight] fix GeneratorStream for Tables (#48082) +* [GH-48079](https://github.com/apache/arrow/issues/48079) - [CI] Fix a typo in util_free_space.sh (#48088) +* [GH-48095](https://github.com/apache/arrow/issues/48095) - [Python][Docs] Add missing {pyarrow,compute} functions to API docs (#48117) +* [GH-48098](https://github.com/apache/arrow/issues/48098) - [R] Fix nightly libarrow binary uploads (#48100) +* [GH-48107](https://github.com/apache/arrow/issues/48107) - [CI] Update testing submodule (#48114) +* [GH-48115](https://github.com/apache/arrow/issues/48115) - [C++] Better align Meson configuration and config.h (#48116) +* [GH-48125](https://github.com/apache/arrow/issues/48125) - [C++] Remove gnu11 standard from the Meson configuration (#48126) +* [GH-48127](https://github.com/apache/arrow/issues/48127) - [R] stringr argument deprecation - add binding for stringr::str_ilike() and remove ignore_case argument for stringr::str_like() (#48262) +* [GH-48129](https://github.com/apache/arrow/issues/48129) - [CI] Stale issues bot only looks at 30 issues at a time (#48130) +* [GH-48134](https://github.com/apache/arrow/issues/48134) - [C++] Make StructArray::field() thread-safe (#48128) +* [GH-48142](https://github.com/apache/arrow/issues/48142) - [CI] Disallow scheduled GitHub Actions run on forked repos (#48143) +* [GH-48146](https://github.com/apache/arrow/issues/48146) - [C++][Parquet] Fix undefined behavior with invalid column/offset index (#48147) +* [GH-48162](https://github.com/apache/arrow/issues/48162) - [CI] Stale issues bot hit secondary rate limit and did not complete (#48165) +* [GH-48168](https://github.com/apache/arrow/issues/48168) - [C++][Parquet] Fix setting column-specific options when writing an encrypted Dataset (#48170) +* [GH-48234](https://github.com/apache/arrow/issues/48234) - [C++][Parquet] Fix overly strict check for BIT_PACKED levels byte size (#48235) +* [GH-48238](https://github.com/apache/arrow/issues/48238) - [C++] Actually write IPC schema endianness, not host endianness (#48239) +* [GH-48246](https://github.com/apache/arrow/issues/48246) - [C++][Parquet] Fix pre-1970 INT96 timestamps roundtrip (#48247) +* [GH-48263](https://github.com/apache/arrow/issues/48263) - [CI] Stale issues workflow doesn't go through enough issues (#48264) +* [GH-48268](https://github.com/apache/arrow/issues/48268) - [C++][Acero] Enhance the type checking for hash join residual filter (#48272) +* [GH-48280](https://github.com/apache/arrow/issues/48280) - [CI] PYTHON_PATCH_VERSION docker warnings (#48282) +* [GH-48283](https://github.com/apache/arrow/issues/48283) - [R][CI] Failures on R Lint on main (#48286) +* [GH-48308](https://github.com/apache/arrow/issues/48308) - [C++][Parquet] Fix potential crash when reading invalid Parquet data (#48309) +* [GH-48314](https://github.com/apache/arrow/issues/48314) - [Python] Compat with pandas 3.0 changed default datetime unit (#48319) +* [GH-48340](https://github.com/apache/arrow/issues/48340) - [R] respected `MAKEFLAGS` (#48341) 
+* [GH-48376](https://github.com/apache/arrow/issues/48376) - [C++] Update GoogleTest from 1.16.0 to 1.17.0 (#48377) +* [GH-48416](https://github.com/apache/arrow/issues/48416) - [Packaging][CI] Use custom orc_for_bundling when using FetchContent to avoid ar issues with + symbol on path (#48430) +* [GH-48417](https://github.com/apache/arrow/issues/48417) - [Packaging][CI] Skip downgrade testing for Debian testing (#48427) +* [GH-48432](https://github.com/apache/arrow/issues/48432) - [CI][Ruby] Don't run Red Arrow Format tests with Ruby 3.1 (#48434) +* [GH-48478](https://github.com/apache/arrow/issues/48478) - [Ruby] Fix Ruby list inference for nested non-negative integer arrays (#48584) +* [GH-48481](https://github.com/apache/arrow/issues/48481) - [Ruby] Correctly infer types for nested integer arrays (#48699) +* [GH-48540](https://github.com/apache/arrow/issues/48540) - [Python][C++][CI] test\_s3\_options crash on macOS +* [GH-48566](https://github.com/apache/arrow/issues/48566) - [C++][CI] Fix compilation on Valgrind job (#48567) +* [GH-48570](https://github.com/apache/arrow/issues/48570) - [C++] Add Missing Fuzz Sources to Meson configuration (#48571) +* [GH-48608](https://github.com/apache/arrow/issues/48608) - [Python] Fix interpolate actual values in Message.__repr__ f-string (#48656) +* [GH-48610](https://github.com/apache/arrow/issues/48610) - [Ruby] Add FixedSizeListArray glue (#48609) +* [GH-48625](https://github.com/apache/arrow/issues/48625) - [Python] Add temporal unit checking in NumPyDtypeUnifier (#48626) +* [GH-48641](https://github.com/apache/arrow/issues/48641) - [CI] Multiple nightly R builds failing due to ssache errors +* [GH-48725](https://github.com/apache/arrow/issues/48725) - [C++] Fix bundled Protobuf doesn't exist in libarrow_bundled_dependencies (#48726) +* [GH-48735](https://github.com/apache/arrow/issues/48735) - [CI][Python] Fix macOS wheel builds by forcing setuptools upgrade in venv (#48739) +* [GH-48736](https://github.com/apache/arrow/issues/48736) - [CI][Python] Restore AlmaLinux 8 support of `dev/release/setup-rhel-rebuilds.sh` for wheel verification (#48748) +* [GH-48741](https://github.com/apache/arrow/issues/48741) - [C++] Fix deadlock in CSV AsyncThreadedTableReader destructor (#48742) +* [GH-48750](https://github.com/apache/arrow/issues/48750) - [CI][Documentation] Disable Unity build for OpenTelemetry (#48751) +* [GH-48776](https://github.com/apache/arrow/issues/48776) - [CI][Ruby][Windows] Ensure removing temporary files (#48777) +* [GH-48780](https://github.com/apache/arrow/issues/48780) - [CI] Add missing permissions for reusable workflow calls (#48778) +* [GH-48782](https://github.com/apache/arrow/issues/48782) - [Docs][CI] Skip Markdown files with doxygen and trigger Docs job on PR when files are modified (#48786) +* [GH-48784](https://github.com/apache/arrow/issues/48784) - [GLib] Make (system) Parquet C++ is optional (#48785) +* [GH-48787](https://github.com/apache/arrow/issues/48787) - [C++] Disable `-Werror` for s2n-tls (#48791) +* [GH-48806](https://github.com/apache/arrow/issues/48806) - [CI][Packaging] ubuntu-noble-arm64 has failes for several days due to network failure (403 Forbidden [IP: 91.189.92.19 80]) +* [GH-48807](https://github.com/apache/arrow/issues/48807) - [CI] Clean up space on GitHub runner to fix manylinux wheel failure (#48790) +* [GH-48809](https://github.com/apache/arrow/issues/48809) - [CI] Fix homebrew-cpp with Mac by using formula-based dependency resolution (#48824) +* 
[GH-48811](https://github.com/apache/arrow/issues/48811) - [C++][FlightRPC] ODBC: Add missing `arrow::` to fix build (#48810) +* [GH-48827](https://github.com/apache/arrow/issues/48827) - [CI][Python] Add required xz dependency to emscripten dockerfile (#48828) +* [GH-48838](https://github.com/apache/arrow/issues/48838) - [Release] Use gh cli to download sources for Linux packages and publish draft release before verification (#48839) +* [GH-48841](https://github.com/apache/arrow/issues/48841) - [Release][Package] Add GH_TOKEN to rake build step on Linux Packaging jobs (#48842) + + +## New Features and Improvements + +* [GH-23970](https://github.com/apache/arrow/issues/23970) - [GLib] Add support for duration (#48564) +* [GH-24157](https://github.com/apache/arrow/issues/24157) - [C++] Add tests for DayTimeIntervalBuilder (#48709) +* [GH-31869](https://github.com/apache/arrow/issues/31869) - [Python][Parquet] Implement external key material features in Python (#48009) +* [GH-40735](https://github.com/apache/arrow/issues/40735) - [Packaging][CentOS] Drop support for CentOS 7 (#48550) +* [GH-41364](https://github.com/apache/arrow/issues/41364) - [GLib][Ruby] Allow passing thread pool to ExecutePlan (#48462) +* [GH-44810](https://github.com/apache/arrow/issues/44810) - [C++][Parquet] Add arrow::Result version of parquet::arrow::FileReader::Make() (#48285) +* [GH-45449](https://github.com/apache/arrow/issues/45449) - [R][CI] Remove OpenSSL 1.x builds (#48297) +* [GH-45484](https://github.com/apache/arrow/issues/45484) - [C++] Drop support for the gold linker (#47780) +* [GH-45885](https://github.com/apache/arrow/issues/45885) - [C++] Require C++20 (#48414) +* [GH-46004](https://github.com/apache/arrow/issues/46004) - [C++][FlightRPC] Enable ODBC Build In C++ Workflows (#47689) +* [GH-46096](https://github.com/apache/arrow/issues/46096) - [C++][FlightRPC] Environment and Connection Handle Allocation (#47759) +* [GH-46098](https://github.com/apache/arrow/issues/46098) - [C++][FlightRPC] ODBC Environment Attribute Implementation (#47760) +* [GH-46147](https://github.com/apache/arrow/issues/46147) - [C++] Implement GCS support in Meson (#47568) +* [GH-46411](https://github.com/apache/arrow/issues/46411) - [C++] Implemented dataset option in Meson (#47669) +* [GH-46465](https://github.com/apache/arrow/issues/46465) - [C++][FlightRPC] Refactor ODBC namespaces and file structure (#47703) +* [GH-46574](https://github.com/apache/arrow/issues/46574) - [C++][FlightRPC] ODBC Driver Connectivity support (#47971) +* [GH-46575](https://github.com/apache/arrow/issues/46575) - [C++][FlightRPC] Add Diagnostic tests (#47764) +* [GH-46575](https://github.com/apache/arrow/issues/46575) - [C++][FlightRPC] ODBC Diagnostics Report (#47763) +* [GH-46592](https://github.com/apache/arrow/issues/46592) - [CI][Dev][R] Add Air to pre-commit (#47423) +* [GH-46825](https://github.com/apache/arrow/issues/46825) - [R] Use smallest_decimal() from C++ instead of working out which decimal type to instantiate in R (#47906) +* [GH-46903](https://github.com/apache/arrow/issues/46903) - [CI] Automatically flag stale issues (#46904) +* [GH-47030](https://github.com/apache/arrow/issues/47030) - [C++][Parquet] Add setting to limit the number of rows written per page (#47090) +* [GH-47103](https://github.com/apache/arrow/issues/47103) - [Statistics][C++] Implement Statistics specification attribute ARROW:null_count:approximate (#47969) +* [GH-47105](https://github.com/apache/arrow/issues/47105) - [Statistics][C++] Implement Statistics 
specification attribute ARROW:row_count:approximate (#48266) +* [GH-47196](https://github.com/apache/arrow/issues/47196) - [CI][C++] Add Windows ARM64 build (#47811) +* [GH-47437](https://github.com/apache/arrow/issues/47437) - [CI][Python] Update win wheels and free-threaded build for Python 3.14 +* [GH-47441](https://github.com/apache/arrow/issues/47441) - [Python][Parquet] Allow passing write_time_adjusted_to_utc to Python's ParquetWriter (#47745) +* [GH-47572](https://github.com/apache/arrow/issues/47572) - [C++][Parquet] Uniform unpack interface (#47573) +* [GH-47635](https://github.com/apache/arrow/issues/47635) - [CI][Integration] Add new gold files (#47729) +* [GH-47640](https://github.com/apache/arrow/issues/47640) - [CI] Remove needless ci/docker/ubuntu-22.04-csharp.dockerfile (#48298) +* [GH-47643](https://github.com/apache/arrow/issues/47643) - [Python][Packaging] Enable CMAKE_INTERPROCEDURAL_OPTIMIZATION for wheels (#47733) +* [GH-47677](https://github.com/apache/arrow/issues/47677) - [C++][GPU] Allow building with CUDA 13 (#48259) +* [GH-47697](https://github.com/apache/arrow/issues/47697) - [C++][FlightRPC] Add ODBC API placeholders (#47725) +* [GH-47706](https://github.com/apache/arrow/issues/47706) - [C++][FlightRPC] ODBC SQLFreeStmt implementation (#48033) +* [GH-47707](https://github.com/apache/arrow/issues/47707) - [C++][FlightRPC] Add tests for descriptor handle allocation (#48053) +* [GH-47708](https://github.com/apache/arrow/issues/47708) - [C++][FlightRPC] Connection Attribute Support for ODBC (#47772) +* [GH-47710](https://github.com/apache/arrow/issues/47710) - [C++][FlightRPC] Statement attribute Support in ODBC (#47773) +* [GH-47711](https://github.com/apache/arrow/issues/47711) - [C++][FlightRPC] Enable ODBC query execution (#48032) +* [GH-47713](https://github.com/apache/arrow/issues/47713) - [C++][FlightRPC] ODBC SQLMoreResults implementation (#48035) +* [GH-47713](https://github.com/apache/arrow/issues/47713) - [C++][FlightRPC] ODBC return number of result columns (#48036) +* [GH-47713](https://github.com/apache/arrow/issues/47713) - [C++][FlightRPC] ODBC return number of affected rows (#48037) +* [GH-47713](https://github.com/apache/arrow/issues/47713) - [C++][FlightRPC] ODBC Basic Data Retrieval (#48034) +* [GH-47714](https://github.com/apache/arrow/issues/47714) - [C++][FlightRPC] ODBC extended fetch (#48040) +* [GH-47715](https://github.com/apache/arrow/issues/47715) - [C++][FlightRPC] ODBC scroll fetch implementation (#48041) +* [GH-47716](https://github.com/apache/arrow/issues/47716) - [C++][FlightRPC] ODBC bind column implementation (#48042) +* [GH-47717](https://github.com/apache/arrow/issues/47717) - [C++][FlightRPC] ODBC close cursor (#48043) +* [GH-47719](https://github.com/apache/arrow/issues/47719) - [C++][FlightRPC] Extract SQLTables Implementation (#48021) +* [GH-47720](https://github.com/apache/arrow/issues/47720) - [C++][FlightRPC] ODBC Columns Metadata (#48049) +* [GH-47721](https://github.com/apache/arrow/issues/47721) - [C++][FlightRPC] Followup to remove unncessary std::move to resolve compliation flakiness (#48687) +* [GH-47721](https://github.com/apache/arrow/issues/47721) - [C++][FlightRPC] Return ODBC Column Attribute from result set (#48050) +* [GH-47722](https://github.com/apache/arrow/issues/47722) - [C++][FlightRPC] ODBC Data Type Information (#48051) +* [GH-47723](https://github.com/apache/arrow/issues/47723) - [C++][FlightRPC] ODBC SQLNativeSQL implementation (#48020) +* [GH-47724](https://github.com/apache/arrow/issues/47724) 
- [C++][FlightRPC] ODBC: implement SQLDescribeCol (#48052) +* [GH-47726](https://github.com/apache/arrow/issues/47726) - [C++][FlightRPC] ODBC Unicode Support (#47771) +* [GH-47728](https://github.com/apache/arrow/issues/47728) - [Python] Check the source argument in parquet.read_table (#48008) +* [GH-47747](https://github.com/apache/arrow/issues/47747) - [C++] Bump Apache ORC to 2.2.1 (#47744) +* [GH-47753](https://github.com/apache/arrow/issues/47753) - [C++][Parquet] Build Thrift with OpenSSL disabled (#47754) +* [GH-47756](https://github.com/apache/arrow/issues/47756) - [C++][CI] Fuzz CSV reader (#47757) +* [GH-47767](https://github.com/apache/arrow/issues/47767) - [CI] Add date to extra CI report email subject (#47777) +* [GH-47784](https://github.com/apache/arrow/issues/47784) - [C++] Patch vendored pcg library to enable msvc arm64 intrinsics (#47779) +* [GH-47786](https://github.com/apache/arrow/issues/47786) - [C++][FlightRPC] Establish ODBC tests (#47788) +* [GH-47787](https://github.com/apache/arrow/issues/47787) - [C++][FlightRPC] ODBC `msi` Windows installer (#48054) +* [GH-47789](https://github.com/apache/arrow/issues/47789) - [C++][FlightRPC] SQLGetFunctions Tests (#48031) +* [GH-47797](https://github.com/apache/arrow/issues/47797) - [CI][Python] Update Python installs for free-threaded wheel tasks (#47993) +* [GH-47800](https://github.com/apache/arrow/issues/47800) - [C++][CI] Fuzz more CSV reader types (#48398) +* [GH-47806](https://github.com/apache/arrow/issues/47806) - [CI] Rename deprecated docker-compose.yml to preferred compose.yaml file (#47954) +* [GH-47833](https://github.com/apache/arrow/issues/47833) - [C++] Add utf8proc option to Meson configuration (#47834) +* [GH-47881](https://github.com/apache/arrow/issues/47881) - [C++] Update fast_float version to 8.1.0 (#47882) +* [GH-47887](https://github.com/apache/arrow/issues/47887) - [C++][Integration] Enable extension types with C Data Interface tests (#47888) +* [GH-47891](https://github.com/apache/arrow/issues/47891) - [C++][Parquet] Generate a separate fuzz seed file for each column (#47892) +* [GH-47895](https://github.com/apache/arrow/issues/47895) - [C++][Parquet] Add prolog and epilog in unpack (#47896) +* [GH-47905](https://github.com/apache/arrow/issues/47905) - [C++][Parquet] MakeColumnStats should use user-provided memory pool (#47894) +* [GH-47926](https://github.com/apache/arrow/issues/47926) - [C++] Adopt alternative safe arithmetic library (#47958) +* [GH-47936](https://github.com/apache/arrow/issues/47936) - [R] docgen.R requires installed package instead of current working code (#47940) +* [GH-47939](https://github.com/apache/arrow/issues/47939) - [R] Update CRAN packaging checklist to update checksums and have make build call make clean (#47944) +* [GH-47974](https://github.com/apache/arrow/issues/47974) - [Docs] Remove stray documentation from Java and JS (#48006) +* [GH-47975](https://github.com/apache/arrow/issues/47975) - [Docs][Python] Remove experimental warning on PyCapsule documentation (#47976) +* [GH-47978](https://github.com/apache/arrow/issues/47978) - [C++][Parquet][CI] Add more compression codecs to fuzzing seed corpus (#47979) +* [GH-48000](https://github.com/apache/arrow/issues/48000) - [CI][Release] Publish RC GitHub Release as draft to allow immutable releases (#48059) +* [GH-48013](https://github.com/apache/arrow/issues/48013) - [R] Add CI job for musl (Alpine Linux) to replicate CRAN checks (#48014) +* [GH-48025](https://github.com/apache/arrow/issues/48025) - [C++][GLib] 
Replace instances where build path is being added to built artifacts (#48026) +* [GH-48055](https://github.com/apache/arrow/issues/48055) - [C++][FlightRPC] Allow spaces while parsing Table Type in ODBC (#48056) +* [GH-48074](https://github.com/apache/arrow/issues/48074) - [C++] Use FetchContent for bundled Abseil (#48075) +* [GH-48084](https://github.com/apache/arrow/issues/48084) - [C++][FlightRPC] Replace boost::optional with std::optional (#48323) +* [GH-48089](https://github.com/apache/arrow/issues/48089) - [C++][Parquet] Read statistics and other metadata when fuzzing (#48090) +* [GH-48091](https://github.com/apache/arrow/issues/48091) - [C++] Use FetchContent for bundled c-ares (#48092) +* [GH-48096](https://github.com/apache/arrow/issues/48096) - [Python][Parquet] Expose new WriterProperties::max_rows_per_page to Python bindings (#48101) +* [GH-48102](https://github.com/apache/arrow/issues/48102) - [Python] Remove deprecated Array.format method (#48324) +* [GH-48105](https://github.com/apache/arrow/issues/48105) - [C++][Parquet][IPC] Cap allocated memory when fuzzing (#48108) +* [GH-48112](https://github.com/apache/arrow/issues/48112) - [C++][Parquet] Use more accurate data length estimate when decoding PLAIN BYTE_ARRAY data (#48113) +* [GH-48123](https://github.com/apache/arrow/issues/48123) - [C++][Float16] Reimplement arrow::WithinUlp and Enable it for float16 (#48224) +* [GH-48139](https://github.com/apache/arrow/issues/48139) - [C++] Allow compilation for QNX versions up to 8 (#48140) +* [GH-48152](https://github.com/apache/arrow/issues/48152) - [CI][MATLAB] Bump MATLAB release to R2025b in the MATLAB GitHub Actions Workflow (#48153) +* [GH-48154](https://github.com/apache/arrow/issues/48154) - [MATAB][Packaging] Update MATLAB crossbow workflow to build against MATLAB `R2025b` (#48155) +* [GH-48163](https://github.com/apache/arrow/issues/48163) - [CI][Docs] Update preview docs task S3 secret to use (#48164) +* [GH-48167](https://github.com/apache/arrow/issues/48167) - [Python][C++][Compute] Add python bindings for scatter, inverse_permutation (#48267) +* [GH-48174](https://github.com/apache/arrow/issues/48174) - [CI][Dev] Fix shellcheck errors in ci/scripts/util_download_apache.sh (#48175) +* [GH-48176](https://github.com/apache/arrow/issues/48176) - [C++][Parquet] Fix arrow-ipc-message-internal-test failure (#48166) +* [GH-48178](https://github.com/apache/arrow/issues/48178) - [C++] Use FetchContent for bundled RE2 (#48179) +* [GH-48181](https://github.com/apache/arrow/issues/48181) - [C++] Use FetchContent for bundled Protobuf (#48183) +* [GH-48186](https://github.com/apache/arrow/issues/48186) - [CI][Dev] Remove ci/scripts/util_wait_for_it.sh (#48189) +* [GH-48218](https://github.com/apache/arrow/issues/48218) - [C++][Parquet] Fix Util & Level Conversion logic on big-endian (#48219) +* [GH-48245](https://github.com/apache/arrow/issues/48245) - [C++][Parquet] Simplify GetVlqInt (#48237) +* [GH-48248](https://github.com/apache/arrow/issues/48248) - [C++] Use FetchContent for bundled gRPC (#48250) +* [GH-48251](https://github.com/apache/arrow/issues/48251) - [C++][CI] Add CSV fuzzing seed corpus generator (#48252) +* [GH-48256](https://github.com/apache/arrow/issues/48256) - [Packaging][Linux] Use `closer.lua?action=download` URL (#48257) +* [GH-48260](https://github.com/apache/arrow/issues/48260) - [C++][Python][R] Move S3 bucket references to new bucket as Voltron Data ones will be removed soon (#48261) +* [GH-48275](https://github.com/apache/arrow/issues/48275) - [C++][Dev] 
Allow choosing verbosity when fuzzing (#48276) +* [GH-48287](https://github.com/apache/arrow/issues/48287) - [Ruby] Add minimum pure Ruby Apache Arrow reader implementation (#48288) +* [GH-48292](https://github.com/apache/arrow/issues/48292) - [Ruby] Add `Arrow::Column#to_arrow{,_array,_chunked_array}` (#48293) +* [GH-48295](https://github.com/apache/arrow/issues/48295) - [Ruby] Add support for reading Int8 array (#48296) +* [GH-48303](https://github.com/apache/arrow/issues/48303) - [CI] Remove needless `setup-dotnet` from `.github/workflows/dev.yml` (#48304) +* [GH-48306](https://github.com/apache/arrow/issues/48306) - [Ruby] Add support for reading binary array (#48307) +* [GH-48312](https://github.com/apache/arrow/issues/48312) - [C++][FlightRPC] Standalone ODBC MSVC CI (#48313) +* [GH-48315](https://github.com/apache/arrow/issues/48315) - [C++] Use FetchContent for bundled CRC32C (#48318) +* [GH-48316](https://github.com/apache/arrow/issues/48316) - [C++] Use FetchContent for bundled nlohmann-json (#48320) +* [GH-48317](https://github.com/apache/arrow/issues/48317) - [C++] Use FetchContent for bundled google-cloud-cpp (#48333) +* [GH-48326](https://github.com/apache/arrow/issues/48326) - [CI] Stop specifying hash for `actions/*` GitHub Actions (#48327) +* [GH-48328](https://github.com/apache/arrow/issues/48328) - [Ruby] Add support for reading UTF-8 array (#48329) +* [GH-48330](https://github.com/apache/arrow/issues/48330) - [Ruby] Add support for reading null array (#48331) +* [GH-48335](https://github.com/apache/arrow/issues/48335) - [C++][Parquet] Fuzz encrypted files (#48336) +* [GH-48337](https://github.com/apache/arrow/issues/48337) - [C++][Parquet] Improve column encryption API (#48338) +* [GH-48339](https://github.com/apache/arrow/issues/48339) - [C++] Enhance functions in util/ubsan.h to support types without a default constructor (#48429) +* [GH-48342](https://github.com/apache/arrow/issues/48342) - [R] Turn off gcs by default, also if it is on, bundle. 
(#48343) +* [GH-48346](https://github.com/apache/arrow/issues/48346) - [Ruby] Add support for reading boolean array (#48348) +* [GH-48347](https://github.com/apache/arrow/issues/48347) - [Ruby] Add support for reading list array (#48351) +* [GH-48355](https://github.com/apache/arrow/issues/48355) - [Python] Remove obsolete snprintf workaround for Python 3.9 (#48354) +* [GH-48358](https://github.com/apache/arrow/issues/48358) - [Ruby] Add support for reading float32 array (#48359) +* [GH-48360](https://github.com/apache/arrow/issues/48360) - [Ruby] Add support for reading large binary array (#48361) +* [GH-48362](https://github.com/apache/arrow/issues/48362) - [GLib][Ruby] Add FixedSizeListArray (#48369) +* [GH-48363](https://github.com/apache/arrow/issues/48363) - [GLib][Ruby] Add AssumeTimezoneOptions (#48370) +* [GH-48364](https://github.com/apache/arrow/issues/48364) - [GLib][Ruby] Add CumulativeOptions (#48371) +* [GH-48365](https://github.com/apache/arrow/issues/48365) - [GLib][Ruby] Add DayOfWeekOptions (#48372) +* [GH-48366](https://github.com/apache/arrow/issues/48366) - [GLib][Ruby] Add DictionaryEncodeOptions (#48373) +* [GH-48367](https://github.com/apache/arrow/issues/48367) - [GLib][Ruby] Add ElementWiseAggregateOptions (#48374) +* [GH-48368](https://github.com/apache/arrow/issues/48368) - [GLib][Ruby] Add ExtractRegexOptions (#48375) +* [GH-48380](https://github.com/apache/arrow/issues/48380) - [Ruby] Add support for reading float64 array (#48381) +* [GH-48382](https://github.com/apache/arrow/issues/48382) - [Ruby] Add support for reading struct array (#48383) +* [GH-48384](https://github.com/apache/arrow/issues/48384) - [C++][Docs][Parquet] Fix broken link for parquet-format spec (#48385) +* [GH-48386](https://github.com/apache/arrow/issues/48386) - [Ruby][Dev] Enable Layout/TrailingEmptyLines: final_newline cop (#48392) +* [GH-48388](https://github.com/apache/arrow/issues/48388) - [Ruby] Add support for reading map array (#48389) +* [GH-48395](https://github.com/apache/arrow/issues/48395) - [C++][Dev] Update fuzzing CMake preset (#48396) +* [GH-48400](https://github.com/apache/arrow/issues/48400) - [Python] Convert an old todo to a proper ticket in `test_copy_files_directory` (#48401) +* [GH-48402](https://github.com/apache/arrow/issues/48402) - [Python] Enable the relative path in test_write_dataset (#48403) +* [GH-48404](https://github.com/apache/arrow/issues/48404) - [Python] Add tests to to_table(filter=...) 
to reject a boolean expr (#48405) +* [GH-48406](https://github.com/apache/arrow/issues/48406) - [Python] Negative test for struct_field no-argument (ARROW-14853) (#48407) +* [GH-48410](https://github.com/apache/arrow/issues/48410) - [Ruby] Add support for reading large list array (#48411) +* [GH-48412](https://github.com/apache/arrow/issues/48412) - [Ruby] Add support for reading date32 array (#48413) +* [GH-48419](https://github.com/apache/arrow/issues/48419) - [Python] Fix test_parquet_file_too_small to catch only ArrowInvalid (#48420) +* [GH-48421](https://github.com/apache/arrow/issues/48421) - [Python] Enable test_orc_scan_options with batch_size (#48422) +* [GH-48423](https://github.com/apache/arrow/issues/48423) - [Ruby] Add support for reading date64 array (#48424) +* [GH-48425](https://github.com/apache/arrow/issues/48425) - [Ruby] Add support for reading dense union array (#48426) +* [GH-48435](https://github.com/apache/arrow/issues/48435) - [Ruby] Add support for reading sparse union array (#48439) +* [GH-48437](https://github.com/apache/arrow/issues/48437) - [Ruby] Add tests for large list array (#48438) +* [GH-48440](https://github.com/apache/arrow/issues/48440) - [Ruby] Add support for reading time32 array (#48441) +* [GH-48442](https://github.com/apache/arrow/issues/48442) - [Python] Remove workaround that excluded struct types from `chunked_arrays` (#48443) +* [GH-48444](https://github.com/apache/arrow/issues/48444) - [Python] Remove todo of implementing requested_schema in test_roundtrip_reader_capsule (#48445) +* [GH-48446](https://github.com/apache/arrow/issues/48446) - [Python] Remove todo of schema=name mismatch in `record_batches` (#48447) +* [GH-48452](https://github.com/apache/arrow/issues/48452) - [Python] Add tests for Date32 and Date64 array creation with masks (#48453) +* [GH-48461](https://github.com/apache/arrow/issues/48461) - [R][CI] Migrate Azure pipelines to GitHub actions (#48585) +* [GH-48463](https://github.com/apache/arrow/issues/48463) - [Python] Improve error message in CheckTypeExact arrow_to_pandas.cc (#48464) +* [GH-48471](https://github.com/apache/arrow/issues/48471) - [Ruby] Add support for reading Int16 and UInt16 arrays (#48472) +* [GH-48475](https://github.com/apache/arrow/issues/48475) - [Ruby] Add support for reading Int32 and UInt32 arrays (#48476) +* [GH-48479](https://github.com/apache/arrow/issues/48479) - [Ruby] Add support for reading Int64 and UInt64 arrays (#48480) +* [GH-48482](https://github.com/apache/arrow/issues/48482) - [GLib][Ruby] Add GArrowExtractRegexSpanOptions (#48483) +* [GH-48484](https://github.com/apache/arrow/issues/48484) - [GLib][Ruby] Add GArrowJoinOptions (#48485) +* [GH-48486](https://github.com/apache/arrow/issues/48486) - [GLib][Ruby] Add GArrowListFlattenOptions (#48487) +* [GH-48488](https://github.com/apache/arrow/issues/48488) - [GLib][Ruby] Add GArrowListSliceOptions (#48489) +* [GH-48490](https://github.com/apache/arrow/issues/48490) - [GLib][Ruby] Add GArrowMakeStructOptions (#48491) +* [GH-48492](https://github.com/apache/arrow/issues/48492) - [GLib][Ruby] Add MapLookupOptions (#48513) +* [GH-48493](https://github.com/apache/arrow/issues/48493) - [GLib][Ruby] Add ModeOptions (#48514) +* [GH-48494](https://github.com/apache/arrow/issues/48494) - [GLib][Ruby] Add NullOptions (#48515) +* [GH-48495](https://github.com/apache/arrow/issues/48495) - [GLib][Ruby] Add PadOptions (#48516) +* [GH-48496](https://github.com/apache/arrow/issues/48496) - [GLib][Ruby] Add PairwiseOptions (#48517) +* 
[GH-48497](https://github.com/apache/arrow/issues/48497) - [GLib][Ruby] Add PartitionNthOptions (#48518) +* [GH-48498](https://github.com/apache/arrow/issues/48498) - [GLib][Ruby] Add PivotWiderOptions (#48519) +* [GH-48499](https://github.com/apache/arrow/issues/48499) - [GLib][Ruby] Add RankQuantileOptions (#48520) +* [GH-48500](https://github.com/apache/arrow/issues/48500) - [GLib][Ruby] Add ReplaceSliceOptions (#48521) +* [GH-48501](https://github.com/apache/arrow/issues/48501) - [GLib][Ruby] Add ReplaceSubstringOptions (#48522) +* [GH-48502](https://github.com/apache/arrow/issues/48502) - [GLib][Ruby] Add RoundBinaryOptions (#48523) +* [GH-48503](https://github.com/apache/arrow/issues/48503) - [GLib][Ruby] Add RoundTemporalOptions (#48524) +* [GH-48504](https://github.com/apache/arrow/issues/48504) - [GLib][Ruby] Add SelectKOptions (#48525) +* [GH-48505](https://github.com/apache/arrow/issues/48505) - [GLib][Ruby] Add SkewOptions (#48526) +* [GH-48506](https://github.com/apache/arrow/issues/48506) - [GLib][Ruby] Add SliceOptions (#48527) +* [GH-48507](https://github.com/apache/arrow/issues/48507) - [GLib][Ruby] Add SplitOptions (#48528) +* [GH-48508](https://github.com/apache/arrow/issues/48508) - [GLib][Ruby] Add TDigestOptions (#48529) +* [GH-48509](https://github.com/apache/arrow/issues/48509) - [GLib][Ruby] Add TrimOptions (#48530) +* [GH-48510](https://github.com/apache/arrow/issues/48510) - [GLib][Ruby] Add WeekOptions (#48531) +* [GH-48511](https://github.com/apache/arrow/issues/48511) - [GLib][Ruby] Add WinsorizeOptions (#48532) +* [GH-48512](https://github.com/apache/arrow/issues/48512) - [GLib][Ruby] Add ZeroFillOptions (#48533) +* [GH-48535](https://github.com/apache/arrow/issues/48535) - [Ruby] Add support for reading time64 array (#48536) +* [GH-48537](https://github.com/apache/arrow/issues/48537) - [Ruby] Add support for reading fixed size binary array (#48538) +* [GH-48545](https://github.com/apache/arrow/issues/48545) - [C++][Parquet][CI] Add more encodings to fuzzing seed corpus (#48546) +* [GH-48551](https://github.com/apache/arrow/issues/48551) - [Ruby] Add support for reading large UTF-8 array (#48552) +* [GH-48553](https://github.com/apache/arrow/issues/48553) - [Ruby] Add support for reading timestamp array (#48554) +* [GH-48555](https://github.com/apache/arrow/issues/48555) - [C++] Use FetchContent for bundled opentelemetry (#48556) +* [GH-48557](https://github.com/apache/arrow/issues/48557) - [C++][Parquet][CI] Also encrypt nested columns in fuzz seed corpus (#48558) +* [GH-48572](https://github.com/apache/arrow/issues/48572) - [CI] Remove centos-7-cpp dockerfile and reference from compose (#48573) +* [GH-48579](https://github.com/apache/arrow/issues/48579) - [Ruby] Add support for reading duration array (#48580) +* [GH-48582](https://github.com/apache/arrow/issues/48582) - [CI][GPU][C++][Python] Add new CUDA jobs using the new self-hosted runners (#48583) +* [GH-48592](https://github.com/apache/arrow/issues/48592) - [C++] Use starts_with/ends_with methods (#48614) +* [GH-48602](https://github.com/apache/arrow/issues/48602) - [Ruby] Add support for reading interval arrays (#48603) +* [GH-48606](https://github.com/apache/arrow/issues/48606) - [CI][GLib] Increase NuGet timeout for vcpkg cache (#48638) +* [GH-48612](https://github.com/apache/arrow/issues/48612) - [Ruby] Add support for reading streaming format (#48613) +* [GH-48616](https://github.com/apache/arrow/issues/48616) - [GLib] Use `Arrow-${MAJOR}.${MINOR}.typelib` not `Arrow-1.0.typelib` (#48617) +* 
[GH-48631](https://github.com/apache/arrow/issues/48631) - [R] Non-API calls: 'ATTRIB', 'SET_ATTRIB' (#48634) +* [GH-48632](https://github.com/apache/arrow/issues/48632) - [R] Add NEWS.md entry for 22.0.0.1 (#48633) +* [GH-48642](https://github.com/apache/arrow/issues/48642) - [Ruby] Add support for reading decimal128 array (#48643) +* [GH-48654](https://github.com/apache/arrow/issues/48654) - [Python] Test timestamp from int without pandas dependency (#48655) +* [GH-48667](https://github.com/apache/arrow/issues/48667) - [Python] Remove unused imports from `python/pyarrow/__init__.py` (#48640) +* [GH-48668](https://github.com/apache/arrow/issues/48668) - [Python][Docs] Add python examples for compute functions `min/max/min_max` (#48648) +* [GH-48675](https://github.com/apache/arrow/issues/48675) - [C++][FlightRPC] Document StatementAttributeId enum values in ODBC SPI (#48676) +* [GH-48680](https://github.com/apache/arrow/issues/48680) - [GLib][Ruby] Add CSVWriter (#48681) +* [GH-48684](https://github.com/apache/arrow/issues/48684) - [C++] Update MakeListArray to use ListArray::FromArrays instead of constructor (#48685) +* [GH-48690](https://github.com/apache/arrow/issues/48690) - [R] Make "Can read Parquet files from a URL" less flaky (#48693) +* [GH-48703](https://github.com/apache/arrow/issues/48703) - [Ruby] Add support for reading decimal256 array (#48704) +* [GH-48705](https://github.com/apache/arrow/issues/48705) - [Ruby] Add support for reading dictionary array (#48706) +* [GH-48707](https://github.com/apache/arrow/issues/48707) - [C++][FlightRPC] Use IRD precision/scale defaults with ARD override in SQLGetData (#48708) +* [GH-48752](https://github.com/apache/arrow/issues/48752) - [Ruby] Skip ChunkedArray test on Windows due to flakiness (#48779) +* [GH-48755](https://github.com/apache/arrow/issues/48755) - [MATLAB] Rename getArrayProxyIDs to getProxyIDs (#48756) +* [GH-48757](https://github.com/apache/arrow/issues/48757) - [CI] Update arrow/.github /CODEOWNERS (#48758) +* [GH-48770](https://github.com/apache/arrow/issues/48770) - [CI] Add missing permissions declaration to workflows (#48771) + + + # Apache Arrow 6.0.1 (2021-11-18) ## Bug Fixes From 45781e854cc9248013349decc4639644871d9f5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 13 Jan 2026 14:05:29 +0100 Subject: [PATCH 06/36] MINOR: [Release] Update .deb/.rpm changelogs for 23.0.0 --- .../linux-packages/apache-arrow-apt-source/debian/changelog | 6 ++++++ .../apache-arrow-release/yum/apache-arrow-release.spec.in | 3 +++ dev/tasks/linux-packages/apache-arrow/debian/changelog | 6 ++++++ dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in | 3 +++ 4 files changed, 18 insertions(+) diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index f65042f2875..6c99f51ee2d 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow-apt-source (23.0.0-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 13 Jan 2026 13:05:28 -0000 + apache-arrow-apt-source (22.0.0-1) unstable; urgency=low * New upstream release. 
diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index b5796afa5e4..0579df694f0 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -85,6 +85,9 @@ else fi %changelog +* Tue Jan 13 2026 Raúl Cumplido - 23.0.0-1 +- New upstream release. + * Mon Oct 20 2025 Raúl Cumplido - 22.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index 3239216a63e..0f18ddaefda 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (23.0.0-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 13 Jan 2026 13:05:28 -0000 + apache-arrow (22.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 8cc272c35ae..7bf8bd556a9 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -877,6 +877,9 @@ Documentation for Apache Parquet GLib. %endif %changelog +* Tue Jan 13 2026 Raúl Cumplido - 23.0.0-1 +- New upstream release. + * Mon Oct 20 2025 Raúl Cumplido - 22.0.0-1 - New upstream release. From eafe3a9e620cf94683dee2347f370c35156dc965 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 13 Jan 2026 14:05:35 +0100 Subject: [PATCH 07/36] MINOR: [Release] Update versions for 23.0.0 --- c_glib/meson.build | 2 +- c_glib/vcpkg.json | 2 +- ci/scripts/PKGBUILD | 2 +- cpp/CMakeLists.txt | 2 +- cpp/meson.build | 2 +- cpp/vcpkg.json | 2 +- dev/tasks/homebrew-formulae/apache-arrow-glib.rb | 2 +- dev/tasks/homebrew-formulae/apache-arrow.rb | 2 +- docs/source/_static/versions.json | 9 +++++++-- matlab/CMakeLists.txt | 2 +- python/CMakeLists.txt | 2 +- python/pyproject.toml | 2 +- r/DESCRIPTION | 2 +- r/NEWS.md | 2 +- r/pkgdown/assets/versions.html | 5 +++-- r/pkgdown/assets/versions.json | 8 ++++++-- ruby/red-arrow-cuda/lib/arrow-cuda/version.rb | 2 +- ruby/red-arrow-dataset/lib/arrow-dataset/version.rb | 2 +- .../red-arrow-flight-sql/lib/arrow-flight-sql/version.rb | 2 +- ruby/red-arrow-flight/lib/arrow-flight/version.rb | 2 +- ruby/red-arrow-format/lib/arrow-format/version.rb | 2 +- ruby/red-arrow/lib/arrow/version.rb | 2 +- ruby/red-gandiva/lib/gandiva/version.rb | 2 +- ruby/red-parquet/lib/parquet/version.rb | 2 +- 24 files changed, 37 insertions(+), 27 deletions(-) diff --git a/c_glib/meson.build b/c_glib/meson.build index fddd390063e..ef020350748 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -32,7 +32,7 @@ project( # * 22.04: 0.61.2 # * 24.04: 1.3.2 meson_version: '>=0.61.2', - version: '23.0.0-SNAPSHOT', + version: '23.0.0', ) version = meson.project_version() diff --git a/c_glib/vcpkg.json b/c_glib/vcpkg.json index 67c9958df4b..b7aa1ce8863 100644 --- a/c_glib/vcpkg.json +++ b/c_glib/vcpkg.json @@ -1,6 +1,6 @@ { "name": "arrow-glib", - "version-string": "23.0.0-SNAPSHOT", + "version-string": "23.0.0", "$comment:dependencies": "We can enable gobject-introspection again once it's updated", "dependencies": [ "glib", diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index b0add262e83..ff95e15c2f7 100644 --- 
a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=22.0.0.9000 +pkgver=23.0.0 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5b260c0eb68..f3e0105262e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -96,7 +96,7 @@ if(POLICY CMP0170) cmake_policy(SET CMP0170 NEW) endif() -set(ARROW_VERSION "23.0.0-SNAPSHOT") +set(ARROW_VERSION "23.0.0") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") diff --git a/cpp/meson.build b/cpp/meson.build index 16bb844d089..30623eb6541 100644 --- a/cpp/meson.build +++ b/cpp/meson.build @@ -19,7 +19,7 @@ project( 'arrow', 'cpp', 'c', - version: '23.0.0-SNAPSHOT', + version: '23.0.0', license: 'Apache-2.0', meson_version: '>=1.3.0', default_options: ['c_std=c11', 'warning_level=2', 'cpp_std=c++20'], diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json index 41c40fcc85f..07d7344e0bc 100644 --- a/cpp/vcpkg.json +++ b/cpp/vcpkg.json @@ -1,6 +1,6 @@ { "name": "arrow", - "version-string": "23.0.0-SNAPSHOT", + "version-string": "23.0.0", "dependencies": [ "abseil", { diff --git a/dev/tasks/homebrew-formulae/apache-arrow-glib.rb b/dev/tasks/homebrew-formulae/apache-arrow-glib.rb index 5993f696566..035fa7b1b84 100644 --- a/dev/tasks/homebrew-formulae/apache-arrow-glib.rb +++ b/dev/tasks/homebrew-formulae/apache-arrow-glib.rb @@ -29,7 +29,7 @@ class ApacheArrowGlib < Formula desc "GLib bindings for Apache Arrow" homepage "https://arrow.apache.org/" - url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.0-SNAPSHOT/apache-arrow-23.0.0-SNAPSHOT.tar.gz" + url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.0/apache-arrow-23.0.0.tar.gz" sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" license "Apache-2.0" head "https://github.com/apache/arrow.git", branch: "main" diff --git a/dev/tasks/homebrew-formulae/apache-arrow.rb b/dev/tasks/homebrew-formulae/apache-arrow.rb index f93a56f7f23..a6ee05289f2 100644 --- a/dev/tasks/homebrew-formulae/apache-arrow.rb +++ b/dev/tasks/homebrew-formulae/apache-arrow.rb @@ -29,7 +29,7 @@ class ApacheArrow < Formula desc "Columnar in-memory analytics layer designed to accelerate big data" homepage "https://arrow.apache.org/" - url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.0-SNAPSHOT/apache-arrow-23.0.0-SNAPSHOT.tar.gz" + url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.0/apache-arrow-23.0.0.tar.gz" sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" license "Apache-2.0" head "https://github.com/apache/arrow.git", branch: "main" diff --git a/docs/source/_static/versions.json b/docs/source/_static/versions.json index 6feaa86e1a7..4a4d2c948c0 100644 --- a/docs/source/_static/versions.json +++ b/docs/source/_static/versions.json @@ -1,15 +1,20 @@ [ { - "name": "23.0 (dev)", + "name": "24.0 (dev)", "version": "dev/", "url": "https://arrow.apache.org/docs/dev/" }, { - "name": "22.0 (stable)", + "name": "23.0 (stable)", "version": "", "url": "https://arrow.apache.org/docs/", "preferred": true }, + { + "name": "22.0", + "version": "22.0/", + "url": "https://arrow.apache.org/docs/22.0/" + }, { "name": "21.0", "version": "21.0/", diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index 9fa737f687a..dbcc4edf792 100644 --- a/matlab/CMakeLists.txt +++ 
b/matlab/CMakeLists.txt @@ -100,7 +100,7 @@ endfunction() set(CMAKE_CXX_STANDARD 20) -set(MLARROW_VERSION "23.0.0-SNAPSHOT") +set(MLARROW_VERSION "23.0.0") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" MLARROW_BASE_VERSION "${MLARROW_VERSION}") project(mlarrow VERSION "${MLARROW_BASE_VERSION}") diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b1c8e324942..d550796a7af 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -28,7 +28,7 @@ project(pyarrow) # which in turn meant that Py_GIL_DISABLED was not set. set(CMAKE_NO_SYSTEM_FROM_IMPORTED ON) -set(PYARROW_VERSION "23.0.0-SNAPSHOT") +set(PYARROW_VERSION "23.0.0") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" PYARROW_BASE_VERSION "${PYARROW_VERSION}") # Generate SO version and full SO version diff --git a/python/pyproject.toml b/python/pyproject.toml index 0a730fd4f78..f137a79c832 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -95,4 +95,4 @@ root = '..' version_file = 'pyarrow/_generated_version.py' version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' -fallback_version = '23.0.0a0' +fallback_version = '23.0.0' diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 86ca441263e..0ac5e36ea6d 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: Integration to 'Apache' 'Arrow' -Version: 22.0.0.9000 +Version: 23.0.0 Authors@R: c( person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")), person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")), diff --git a/r/NEWS.md b/r/NEWS.md index e9f7a591ced..3af9e1185e4 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -17,7 +17,7 @@ under the License. --> -# arrow 22.0.0.9000 +# arrow 23.0.0 # arrow 22.0.0.1 ## Minor improvements and fixes diff --git a/r/pkgdown/assets/versions.html b/r/pkgdown/assets/versions.html index c90d4ae2164..76c30f8f252 100644 --- a/r/pkgdown/assets/versions.html +++ b/r/pkgdown/assets/versions.html @@ -1,7 +1,8 @@ -

-22.0.0.9000 (dev)
-22.0.0 (release)
+23.0.0.9000 (dev)
+23.0.0 (release)
+22.0.0
 21.0.0
 20.0.0
 19.0.1

diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index 0d783995062..8b2f0471fe5 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -1,12 +1,16 @@ [ { - "name": "22.0.0.9000 (dev)", + "name": "23.0.0.9000 (dev)", "version": "dev/" }, { - "name": "22.0.0 (release)", + "name": "23.0.0 (release)", "version": "" }, + { + "name": "22.0.0", + "version": "22.0/" + }, { "name": "21.0.0", "version": "21.0/" diff --git a/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb index 6cd19389f46..4cef86c65fa 100644 --- a/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb +++ b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowCUDA - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.0" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb index 46d7339fb33..3b0c83b3c8d 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb +++ b/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowDataset - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.0" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb b/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb index 25ed5f2bb35..4337f4bc1c7 100644 --- a/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb +++ b/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowFlightSQL - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.0" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow-flight/lib/arrow-flight/version.rb b/ruby/red-arrow-flight/lib/arrow-flight/version.rb index 6961134c6c8..69fcc9e667b 100644 --- a/ruby/red-arrow-flight/lib/arrow-flight/version.rb +++ b/ruby/red-arrow-flight/lib/arrow-flight/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowFlight - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.0" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow-format/lib/arrow-format/version.rb b/ruby/red-arrow-format/lib/arrow-format/version.rb index 389bd4dc5ea..0d1bb36ce1f 100644 --- a/ruby/red-arrow-format/lib/arrow-format/version.rb +++ b/ruby/red-arrow-format/lib/arrow-format/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowFormat - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.0" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow/lib/arrow/version.rb b/ruby/red-arrow/lib/arrow/version.rb index 4e8bf057f52..1f74a5960af 100644 --- a/ruby/red-arrow/lib/arrow/version.rb +++ b/ruby/red-arrow/lib/arrow/version.rb @@ -16,7 +16,7 @@ # under the License. module Arrow - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.0" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-gandiva/lib/gandiva/version.rb b/ruby/red-gandiva/lib/gandiva/version.rb index 5b409db58fe..afef421030e 100644 --- a/ruby/red-gandiva/lib/gandiva/version.rb +++ b/ruby/red-gandiva/lib/gandiva/version.rb @@ -16,7 +16,7 @@ # under the License. 
module Gandiva - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.0" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-parquet/lib/parquet/version.rb b/ruby/red-parquet/lib/parquet/version.rb index 6e4c1cd95ab..ccce6defe4c 100644 --- a/ruby/red-parquet/lib/parquet/version.rb +++ b/ruby/red-parquet/lib/parquet/version.rb @@ -16,7 +16,7 @@ # under the License. module Parquet - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.0" module Version numbers, TAG = VERSION.split("-") From 6901d3652c70b131d53d5486a1f25074216fe630 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 13 Jan 2026 20:36:07 +0100 Subject: [PATCH 08/36] GH-48844: [C++] Check IPC Message body length consistency in IPC file (#48845) ### Rationale for this change The IPC file exposes [redundant information](https://github.com/apache/arrow/blob/d54a2051cf9020a0fdf50836420c38ad14787abb/format/File.fbs#L39-L50) about Message sizes so as to allow for random access from the file footer. We tried adding [consistency checks](https://github.com/apache/arrow/issues/19596) in the past but this hit a bug in the JavaScript IPC writer at the time, so the checks were left disabled. The JavaScript implementation was fixed soon after (7 years ago), so this PR re-enables those checks so as to more easily detect potentially invalid IPC files. ### Are these changes tested? By existing tests. ### Are there any user-facing changes? No, unless they try reading invalid IPC files. * GitHub Issue: #48844 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/ipc/message.cc | 2 ++ cpp/src/arrow/ipc/reader.cc | 33 +++++++++++++++++++-------------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 7919878f148..8be09956f10 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -375,6 +375,8 @@ Result> ReadMessage(int64_t offset, int32_t metadata_le decoder.next_required_size()); } + // TODO(GH-48846): we should take a body_length just like ReadMessageAsync + // and read metadata + body in one go. ARROW_ASSIGN_OR_RAISE(auto metadata, file->ReadAt(offset, metadata_length)); if (metadata->size() < metadata_length) { return Status::Invalid("Expected to read ", metadata_length, diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 4910b1596c3..6a20dbb8c85 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -1180,31 +1180,36 @@ Status CheckAligned(const FileBlock& block) { return Status::OK(); } +template +Result CheckBodyLength(MessagePtr message, const FileBlock& block) { + if (message->body_length() != block.body_length) { + return Status::Invalid( + "Mismatching body length for IPC message " + "(Block.bodyLength: ", + block.body_length, " vs. Message.bodyLength: ", message->body_length(), ")"); + } + // NOTE: we cannot check metadata length as easily as we would have to account + // for the additional IPC signalisation (such as optional continuation bytes). 
+ return message; +} + Result> ReadMessageFromBlock( const FileBlock& block, io::RandomAccessFile* file, const FieldsLoaderFunction& fields_loader) { RETURN_NOT_OK(CheckAligned(block)); - // TODO(wesm): this breaks integration tests, see ARROW-3256 - // DCHECK_EQ((*out)->body_length(), block.body_length); - ARROW_ASSIGN_OR_RAISE(auto message, ReadMessage(block.offset, block.metadata_length, file, fields_loader)); - return message; + return CheckBodyLength(std::move(message), block); } Future> ReadMessageFromBlockAsync( const FileBlock& block, io::RandomAccessFile* file, const io::IOContext& io_context) { - if (!bit_util::IsMultipleOf8(block.offset) || - !bit_util::IsMultipleOf8(block.metadata_length) || - !bit_util::IsMultipleOf8(block.body_length)) { - return Status::Invalid("Unaligned block in IPC file"); - } - - // TODO(wesm): this breaks integration tests, see ARROW-3256 - // DCHECK_EQ((*out)->body_length(), block.body_length); - + RETURN_NOT_OK(CheckAligned(block)); return ReadMessageAsync(block.offset, block.metadata_length, block.body_length, file, - io_context); + io_context) + .Then([block](std::shared_ptr message) { + return CheckBodyLength(std::move(message), block); + }); } class RecordBatchFileReaderImpl; From 90cce42803b95703887f658ace0f212a43732dfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 15 Jan 2026 01:21:15 +0100 Subject: [PATCH 09/36] GH-48856: [Release] Update copyright NOTICE year to 2026 (#48857) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Date is currently wrong ### What changes are included in this PR? Update Copyright Notice to cover year 2026 ### Are these changes tested? No ### Are there any user-facing changes? No breaking change but Yes in terms of copyright date updated. * GitHub Issue: #48856 Authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- NOTICE.txt | 2 +- ruby/red-arrow-format/NOTICE.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NOTICE.txt b/NOTICE.txt index 9b98364d2ab..8046f20a0b9 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -1,5 +1,5 @@ Apache Arrow -Copyright 2016-2024 The Apache Software Foundation +Copyright 2016-2026 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). diff --git a/ruby/red-arrow-format/NOTICE.txt b/ruby/red-arrow-format/NOTICE.txt index 1b218efe168..8916e188af1 100644 --- a/ruby/red-arrow-format/NOTICE.txt +++ b/ruby/red-arrow-format/NOTICE.txt @@ -1,2 +1,2 @@ Apache Arrow -Copyright 2025 The Apache Software Foundation +Copyright 2025-2026 The Apache Software Foundation From 34bf0e2e5c6d67ed2d76586987e91816fa04dab2 Mon Sep 17 00:00:00 2001 From: chegoryu Date: Thu, 15 Jan 2026 02:48:32 +0100 Subject: [PATCH 10/36] GH-48311: [C++] Fix OOB memory access in buffered IO (#48322) ### Rationale for this change Fixing: https://github.com/apache/arrow/issues/48311 ### What changes are included in this PR? Applied fix from https://github.com/apache/arrow/issues/48311 and added test ### Are these changes tested? Yes, added test, without my patch test fails with debug check: ```cpp Note: Google Test filter = TestBufferedInputStream.PeekAfterExhaustingBuffer [==========] Running 1 test from 1 test suite. [----------] Global test environment set-up. 
[----------] 1 test from TestBufferedInputStream [ RUN ] TestBufferedInputStream.PeekAfterExhaustingBuffer /Users/chegoryu/Junk/git/arrow/cpp/src/arrow/io/buffered.cc:337: Check failed: buffer_->size() - buffer_pos_ >= nbytes ``` ### Are there any user-facing changes? No, this PR fixes a bug * GitHub Issue: #48311 Lead-authored-by: Egor Chunaev Co-authored-by: mwish Co-authored-by: chegoryu Signed-off-by: mwish --- cpp/src/arrow/io/buffered.cc | 15 ++++++++++---- cpp/src/arrow/io/buffered_test.cc | 33 +++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/io/buffered.cc b/cpp/src/arrow/io/buffered.cc index 0dae888ca0e..14a0fe4215e 100644 --- a/cpp/src/arrow/io/buffered.cc +++ b/cpp/src/arrow/io/buffered.cc @@ -285,8 +285,9 @@ class BufferedInputStream::Impl : public BufferedBase { // Resize internal read buffer. Note that the internal buffer-size // should not be larger than the raw_read_bound_. - // It might change the buffer_size_, but will not change buffer states - // buffer_pos_ and bytes_buffered_. + // It might change the buffer_size_, and may reset buffer_pos_ to 0 + // when bytes_buffered_ == 0 to reuse the beginning of the buffer. + // bytes_buffered_ will not be changed. Status SetBufferSize(int64_t new_buffer_size) { if (new_buffer_size <= 0) { return Status::Invalid("Buffer size should be positive"); @@ -297,12 +298,14 @@ class BufferedInputStream::Impl : public BufferedBase { new_buffer_size, ", buffer_pos: ", buffer_pos_, ", bytes_buffered: ", bytes_buffered_, ", buffer_size: ", buffer_size_); } + bool need_reset_buffer_pos = false; if (raw_read_bound_ >= 0) { // No need to reserve space for more than the total remaining number of bytes. if (bytes_buffered_ == 0) { - // Special case: we can not keep the current buffer because it does not + // Special case: we can override data in the current buffer because it does not // contain any required data. new_buffer_size = std::min(new_buffer_size, raw_read_bound_ - raw_read_total_); + need_reset_buffer_pos = true; } else { // We should keep the current buffer because it contains data that // can be read. 
@@ -311,7 +314,11 @@ class BufferedInputStream::Impl : public BufferedBase { buffer_pos_ + bytes_buffered_ + (raw_read_bound_ - raw_read_total_)); } } - return ResizeBuffer(new_buffer_size); + auto status = ResizeBuffer(new_buffer_size); + if (status.ok() && need_reset_buffer_pos) { + buffer_pos_ = 0; + } + return status; } Result Peek(int64_t nbytes) { diff --git a/cpp/src/arrow/io/buffered_test.cc b/cpp/src/arrow/io/buffered_test.cc index 1d4805f580c..efaec09dc7c 100644 --- a/cpp/src/arrow/io/buffered_test.cc +++ b/cpp/src/arrow/io/buffered_test.cc @@ -514,6 +514,39 @@ TEST_F(TestBufferedInputStream, PeekPastBufferedBytes) { ASSERT_EQ(0, buffered_->bytes_buffered()); } +TEST_F(TestBufferedInputStream, PeekAfterExhaustingBuffer) { + // GH-48311: When bytes_buffered_ == 0 and raw_read_bound_ >= 0, + // SetBufferSize should reset buffer_pos_ to 0 and reuse the beginning of the buffer + MakeExample1(/*buffer_size=*/10, default_memory_pool(), /*raw_read_bound=*/25); + + // Fill the buffer + ASSERT_OK_AND_ASSIGN(auto view, buffered_->Peek(10)); + EXPECT_EQ(view, kExample1.substr(0, 10)); + ASSERT_EQ(10, buffered_->bytes_buffered()); + ASSERT_EQ(10, buffered_->buffer_size()); + + // Read all buffered bytes to exhaust the buffer (bytes_buffered_ == 0), + // at this point buffer_pos_ is non-zero + ASSERT_OK_AND_ASSIGN(auto bytes, buffered_->Read(10)); + EXPECT_EQ(std::string_view(*bytes), kExample1.substr(0, 10)); + ASSERT_EQ(0, buffered_->bytes_buffered()); + ASSERT_EQ(10, buffered_->buffer_size()); + + // Peek should trigger SetBufferSize with bytes_buffered_ == 0, + // which should reset buffer_pos_ to 0 and reuse the beginning of the buffer, + // so resulting size of the buffer should be 15 instead of 25 + ASSERT_OK_AND_ASSIGN(view, buffered_->Peek(15)); + EXPECT_EQ(view, kExample1.substr(10, 15)); + ASSERT_EQ(15, buffered_->bytes_buffered()); + ASSERT_EQ(15, buffered_->buffer_size()); + + // Do read just in case + ASSERT_OK_AND_ASSIGN(bytes, buffered_->Read(15)); + EXPECT_EQ(std::string_view(*bytes), kExample1.substr(10, 15)); + ASSERT_EQ(0, buffered_->bytes_buffered()); + ASSERT_EQ(15, buffered_->buffer_size()); +} + class TestBufferedInputStreamBound : public ::testing::Test { public: void SetUp() { CreateExample(/*bounded=*/true); } From 9ecfdf911b8261fde39d06cee2b8ea9c066e6eb6 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 13 Jan 2026 12:08:03 +0900 Subject: [PATCH 11/36] GH-48623: [CI][Archery][Dev] Add missing headers to email reports (#48624) ### Rationale for this change Our email reports miss the following headers: * `MIME-Version: 1.0` * `Content-Type: text/plain; charset="utf-8"` * `Message-Id: ${AUTO_GENERATED_MESSAGE_ID}` * `Date: ${DATE_IN_RFC_2822}` ### What changes are included in this PR? Add these headers. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
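For reference, a minimal sketch (not part of this patch; the addresses, subject, and body are placeholder values) of how Python's standard-library `email.message.EmailMessage` produces the headers listed above:

```python
import email.message
import email.utils

msg = email.message.EmailMessage()
msg["Message-Id"] = email.utils.make_msgid()          # auto-generated unique message id
msg["Date"] = email.utils.formatdate(localtime=True)  # RFC 2822 formatted date
msg["From"] = "sender@example.com"
msg["To"] = "recipient@example.com"
msg["Subject"] = "Arrow Build Report"
# set_content() fills in MIME-Version: 1.0 and
# Content-Type: text/plain; charset="utf-8" automatically.
msg.set_content("report body")
print(str(msg))  # serialized message, including all of the headers above
```

Serializing the message (or passing it to `smtplib.SMTP.send_message`) emits all of these headers without templating them by hand.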
* GitHub Issue: #48623 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/cpp_extra.yml | 2 + .github/workflows/package_linux.yml | 2 + .github/workflows/r_extra.yml | 38 +++++++------ dev/archery/archery/ci/cli.py | 28 ++++++++-- dev/archery/archery/crossbow/cli.py | 54 ++++++++++++------ dev/archery/archery/crossbow/reports.py | 43 ++++++++++++--- ...il-report.txt => nightly-email-report.txt} | 5 ++ .../token-expiration-email-report.txt | 14 +++++ .../archery/crossbow/tests/test_reports.py | 55 ++++++++++++++++--- .../templates/email_nightly_report.txt.j2 | 10 +--- .../templates/email_token_expiration.txt.j2 | 11 ++-- .../templates/email_workflow_report.txt.j2 | 10 +--- 12 files changed, 193 insertions(+), 79 deletions(-) rename dev/archery/archery/crossbow/tests/fixtures/{email-report.txt => nightly-email-report.txt} (83%) create mode 100644 dev/archery/archery/crossbow/tests/fixtures/token-expiration-email-report.txt diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index 7ab4c73270d..7844b0b0112 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -39,6 +39,7 @@ on: - 'ci/scripts/util_*' - 'cpp/**' - 'compose.yaml' + - 'dev/archery/archery/**' - 'format/Flight.proto' - 'testing' tags: @@ -61,6 +62,7 @@ on: - 'ci/scripts/util_*' - 'cpp/**' - 'compose.yaml' + - 'dev/archery/archery/**' - 'format/Flight.proto' - 'testing' types: diff --git a/.github/workflows/package_linux.yml b/.github/workflows/package_linux.yml index c59784d7f58..4dc9a70e879 100644 --- a/.github/workflows/package_linux.yml +++ b/.github/workflows/package_linux.yml @@ -29,6 +29,7 @@ on: - '.github/workflows/report_ci.yml' - 'cpp/**' - 'c_glib/**' + - 'dev/archery/archery/**' - 'dev/release/binary-task.rb' - 'dev/release/verify-apt.sh' - 'dev/release/verify-yum.sh' @@ -43,6 +44,7 @@ on: - '.github/workflows/report_ci.yml' - 'cpp/**' - 'c_glib/**' + - 'dev/archery/archery/**' - 'dev/release/binary-task.rb' - 'dev/release/verify-apt.sh' - 'dev/release/verify-yum.sh' diff --git a/.github/workflows/r_extra.yml b/.github/workflows/r_extra.yml index 687a4e0aa05..443d2354d7f 100644 --- a/.github/workflows/r_extra.yml +++ b/.github/workflows/r_extra.yml @@ -27,15 +27,16 @@ on: - '.github/workflows/check_labels.yml' - '.github/workflows/r_extra.yml' - '.github/workflows/report_ci.yml' - - "ci/docker/**" - - "ci/etc/rprofile" - - "ci/scripts/PKGBUILD" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/install_minio.sh" - - "ci/scripts/r_*.sh" - - "cpp/**" - - "compose.yaml" - - "r/**" + - 'ci/docker/**' + - 'ci/etc/rprofile' + - 'ci/scripts/PKGBUILD' + - 'ci/scripts/cpp_*.sh' + - 'ci/scripts/install_minio.sh' + - 'ci/scripts/r_*.sh' + - 'cpp/**' + - 'compose.yaml' + - 'dev/archery/archery/**' + - 'r/**' tags: - '**' pull_request: @@ -44,15 +45,16 @@ on: - '.github/workflows/check_labels.yml' - '.github/workflows/r_extra.yml' - '.github/workflows/report_ci.yml' - - "ci/docker/**" - - "ci/etc/rprofile" - - "ci/scripts/PKGBUILD" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/install_minio.sh" - - "ci/scripts/r_*.sh" - - "cpp/**" - - "compose.yaml" - - "r/**" + - 'ci/docker/**' + - 'ci/etc/rprofile' + - 'ci/scripts/PKGBUILD' + - 'ci/scripts/cpp_*.sh' + - 'ci/scripts/install_minio.sh' + - 'ci/scripts/r_*.sh' + - 'cpp/**' + - 'compose.yaml' + - 'dev/archery/archery/**' + - 'r/**' types: - labeled - opened diff --git a/dev/archery/archery/ci/cli.py b/dev/archery/archery/ci/cli.py index bf7b68d5327..5597dff733e 100644 --- a/dev/archery/archery/ci/cli.py +++ 
b/dev/archery/archery/ci/cli.py @@ -73,6 +73,22 @@ def report_chat(obj, workflow_id, send, repository, ignore, webhook, output.write(report_chat.render("workflow_report")) +class WorkflowEmailReport(EmailReport): + def __init__(self, **kwargs): + super().__init__('workflow_report', **kwargs) + + def date(self): + return self.report.datetime + + def subject(self): + workflow = self.report + date = self.date().strftime('%Y-%m-%d') + return ( + f'[{date}] Arrow Build Report for Job {workflow.name}: ' + f'{len(workflow.failed_jobs())} failed' + ) + + @ci.command() @click.argument('workflow_id', required=True) @click.option('--sender-name', '-n', @@ -105,9 +121,10 @@ def report_email(obj, workflow_id, sender_name, sender_email, recipient_email, """ output = obj['output'] - email_report = EmailReport( - report=Workflow(workflow_id, repository, - ignore_job=ignore, gh_token=obj['github_token']), + workflow = Workflow(workflow_id, repository, + ignore_job=ignore, gh_token=obj['github_token']) + email_report = WorkflowEmailReport( + report=workflow, sender_name=sender_name, sender_email=sender_email, recipient_email=recipient_email @@ -119,8 +136,7 @@ def report_email(obj, workflow_id, sender_name, sender_email, recipient_email, smtp_password=smtp_password, smtp_server=smtp_server, smtp_port=smtp_port, - recipient_email=recipient_email, - message=email_report.render("workflow_report") + report=email_report ) else: - output.write(email_report.render("workflow_report")) + output.write(str(email_report.render())) diff --git a/dev/archery/archery/crossbow/cli.py b/dev/archery/archery/crossbow/cli.py index c73c4d1ff7e..10aa3dedf44 100644 --- a/dev/archery/archery/crossbow/cli.py +++ b/dev/archery/archery/crossbow/cli.py @@ -343,6 +343,22 @@ def latest_prefix(obj, prefix, fetch): click.echo(latest.branch) +class NightlyEmailReport(EmailReport): + def __init__(self, **kwargs): + super().__init__('nightly_report', **kwargs) + + def subject(self): + report = self.report + n_errors = len(report.tasks_by_state['error']) + n_failures = len(report.tasks_by_state['failure']) + n_pendings = len(report.tasks_by_state['pending']) + return ( + f'[NIGHTLY] Arrow Build Report for Job {report.job.branch}: ' + f'{n_errors + n_failures} failed, ' + f'{n_pendings} pending' + ) + + @crossbow.command() @click.argument('job-name', required=True) @click.option('--sender-name', '-n', @@ -382,8 +398,9 @@ def report(obj, job_name, sender_name, sender_email, recipient_email, queue.fetch() job = queue.get(job_name) - email_report = EmailReport( - report=Report(job), + report = Report(job) + email_report = NightlyEmailReport( + report=report, sender_name=sender_name, sender_email=sender_email, recipient_email=recipient_email @@ -401,11 +418,10 @@ def report(obj, job_name, sender_name, sender_email, recipient_email, smtp_password=smtp_password, smtp_server=smtp_server, smtp_port=smtp_port, - recipient_email=recipient_email, - message=email_report.render("nightly_report") + report=email_report ) else: - output.write(email_report.render("nightly_report")) + output.write(str(email_report.render())) @crossbow.command() @@ -601,6 +617,17 @@ def batch_gen(iterable, step): print(batch) +class TokenExpirationEmailReport(EmailReport): + def __init__(self, **kwargs): + super().__init__('token_expiration', **kwargs) + + def subject(self): + token_expiration_date = self.report.token_expiration_date + return ( + f'[CI] Arrow Crossbow Token Expiration in {token_expiration_date}' + ) + + @crossbow.command() @click.option('--days', default=30, 
help='Notification will be sent if expiration date is ' @@ -645,23 +672,18 @@ def __init__(self, token_expiration_date, days_left): self.token_expiration_date = token_expiration_date self.days_left = days_left - email_report = EmailReport( - report=TokenExpirationReport( - token_expiration_date or "ALREADY_EXPIRED", days_left), - sender_name=sender_name, - sender_email=sender_email, - recipient_email=recipient_email - ) + if not token_expiration_date: + token_expiration_date = 'ALREADY_EXPIRED' + report = TokenExpirationReport(token_expiration_date, days_left) + email_report = TokenExpirationEmailReport(report) - message = email_report.render("token_expiration").strip() if send: ReportUtils.send_email( smtp_user=smtp_user, smtp_password=smtp_password, smtp_server=smtp_server, smtp_port=smtp_port, - recipient_email=recipient_email, - message=message + report=email_report ) else: - output.write(message) + output.write(str(email_report.render())) diff --git a/dev/archery/archery/crossbow/reports.py b/dev/archery/archery/crossbow/reports.py index 32962410d6e..a2c0487a2b1 100644 --- a/dev/archery/archery/crossbow/reports.py +++ b/dev/archery/archery/crossbow/reports.py @@ -17,6 +17,10 @@ import collections import csv +import datetime +import email.headerregistry +import email.message +import email.utils import operator import fnmatch import functools @@ -246,7 +250,7 @@ def send_message(cls, webhook, message): @classmethod def send_email(cls, smtp_user, smtp_password, smtp_server, smtp_port, - recipient_email, message): + report): from smtplib import SMTP, SMTP_SSL if smtp_port == 465: @@ -259,7 +263,8 @@ def send_email(cls, smtp_user, smtp_password, smtp_server, smtp_port, else: smtp.starttls() smtp.login(smtp_user, smtp_password) - smtp.sendmail(smtp_user, recipient_email, message) + message = report.render() + smtp.send_message(smtp_user, report.recipient_email, message) @classmethod def write_csv(cls, report, add_headers=True): @@ -271,11 +276,6 @@ def write_csv(cls, report, add_headers=True): class EmailReport(JinjaReport): - templates = { - 'nightly_report': 'email_nightly_report.txt.j2', - 'token_expiration': 'email_token_expiration.txt.j2', - 'workflow_report': 'email_workflow_report.txt.j2', - } fields = [ 'report', 'sender_name', @@ -283,6 +283,35 @@ class EmailReport(JinjaReport): 'recipient_email', ] + def __init__(self, template_name, **kwargs): + self._template_name = template_name + super().__init__(**kwargs) + + @property + def templates(self): + return { + self._template_name: f'email_{self._template_name}.txt.j2', + } + + def date(self): + return None + + def render(self): + message = email.message.EmailMessage() + message.set_charset('utf-8') + message['Message-Id'] = email.utils.make_msgid() + date = self.date() + if isinstance(date, datetime.datetime): + message['Date'] = date + else: + message['Date'] = email.utils.formatdate(date) + message['From'] = email.headerregistry.Address( + self.sender_name, addr_spec=self.sender_email) + message['To'] = email.headerregistry.Address(addr_spec=self.recipient_email) + message['Subject'] = self.subject() + message.set_content(super().render(self._template_name)) + return message + class CommentReport(Report): diff --git a/dev/archery/archery/crossbow/tests/fixtures/email-report.txt b/dev/archery/archery/crossbow/tests/fixtures/nightly-email-report.txt similarity index 83% rename from dev/archery/archery/crossbow/tests/fixtures/email-report.txt rename to dev/archery/archery/crossbow/tests/fixtures/nightly-email-report.txt index 
c29cafd3938..5e7b8e9c67d 100644 --- a/dev/archery/archery/crossbow/tests/fixtures/email-report.txt +++ b/dev/archery/archery/crossbow/tests/fixtures/nightly-email-report.txt @@ -1,6 +1,11 @@ +MIME-Version: 1.0 +Message-Id: +Date: date From: Sender Reporter To: recipient@arrow.com Subject: [NIGHTLY] Arrow Build Report for Job ursabot-1: 2 failed, 1 pending +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 7bit Arrow Build Report for Job ursabot-1 diff --git a/dev/archery/archery/crossbow/tests/fixtures/token-expiration-email-report.txt b/dev/archery/archery/crossbow/tests/fixtures/token-expiration-email-report.txt new file mode 100644 index 00000000000..1f8ccbf30c6 --- /dev/null +++ b/dev/archery/archery/crossbow/tests/fixtures/token-expiration-email-report.txt @@ -0,0 +1,14 @@ +MIME-Version: 1.0 +Message-Id: +Date: date +From: Sender Reporter +To: recipient@arrow.com +Subject: [CI] Arrow Crossbow Token Expiration in 2026-01-17 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 7bit + +The Arrow Crossbow Token will expire in 7 days. + +Please generate a new Token. Send it to Apache INFRA to update the +CROSSBOW_GITHUB_TOKEN. Update it on the crossbow repository and in +the Azure pipelines. diff --git a/dev/archery/archery/crossbow/tests/test_reports.py b/dev/archery/archery/crossbow/tests/test_reports.py index 620b4c78bbc..02012d2f1be 100644 --- a/dev/archery/archery/crossbow/tests/test_reports.py +++ b/dev/archery/archery/crossbow/tests/test_reports.py @@ -15,11 +15,12 @@ # specific language governing permissions and limitations # under the License. +import re import textwrap +from archery.crossbow.cli import (NightlyEmailReport, TokenExpirationEmailReport) from archery.crossbow.core import yaml -from archery.crossbow.reports import (ChatReport, CommentReport, EmailReport, - Report) +from archery.crossbow.reports import (ChatReport, CommentReport, Report) def test_crossbow_comment_formatter(load_fixture): @@ -71,19 +72,55 @@ def test_crossbow_chat_report_extra_message_success(load_fixture): assert report_chat.render("text") == textwrap.dedent(expected_msg) -def test_crossbow_email_report(load_fixture): - expected_msg = load_fixture('email-report.txt') +def test_crossbow_nightly_email_report(load_fixture): + expected_msg = load_fixture('nightly-email-report.txt') job = load_fixture('crossbow-job.yaml', decoder=yaml.load) report = Report(job) assert report.tasks_by_state is not None - email_report = EmailReport(report=report, sender_name="Sender Reporter", - sender_email="sender@arrow.com", - recipient_email="recipient@arrow.com") + email_report = NightlyEmailReport( + report=report, + sender_name='Sender Reporter', + sender_email='sender@arrow.com', + recipient_email='recipient@arrow.com' + ) - assert ( - email_report.render("nightly_report") == textwrap.dedent(expected_msg) + actual = str(email_report.render()) + # Normalize dynamic headers + actual = re.sub(r'(?m)^Message-Id: <.+?>', + 'Message-Id: ', + actual) + actual = re.sub(r'(?m)^Date: [^\n]+ -0000$', + 'Date: date', + actual) + assert actual == textwrap.dedent(expected_msg) + + +def test_crossbow_token_expiration_email_report(load_fixture): + expected_msg = load_fixture('token-expiration-email-report.txt') + + class TokenExpirationReport: + def __init__(self, token_expiration_date, days_left): + self.token_expiration_date = token_expiration_date + self.days_left = days_left + + report = TokenExpirationReport('2026-01-17', 7) + email_report = TokenExpirationEmailReport( + report=report, + 
sender_name='Sender Reporter', + sender_email='sender@arrow.com', + recipient_email='recipient@arrow.com' ) + actual = str(email_report.render()) + # Normalize dynamic headers + actual = re.sub(r'(?m)^Message-Id: <.+?>', + 'Message-Id: ', + actual) + actual = re.sub(r'(?m)^Date: [^\n]+ -0000$', + 'Date: date', + actual) + assert actual == textwrap.dedent(expected_msg) + def test_crossbow_export_report(load_fixture): job = load_fixture('crossbow-job.yaml', decoder=yaml.load) diff --git a/dev/archery/archery/templates/email_nightly_report.txt.j2 b/dev/archery/archery/templates/email_nightly_report.txt.j2 index bc040734b03..7b43d7c867e 100644 --- a/dev/archery/archery/templates/email_nightly_report.txt.j2 +++ b/dev/archery/archery/templates/email_nightly_report.txt.j2 @@ -15,13 +15,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#} -{%- if True -%} -{%- endif -%} -From: {{ sender_name }} <{{ sender_email }}> -To: {{ recipient_email }} -Subject: [NIGHTLY] Arrow Build Report for Job {{report.job.branch}}: {{ (report.tasks_by_state["error"] | length) + (report.tasks_by_state["failure"] | length) }} failed, {{ report.tasks_by_state["pending"] | length }} pending - +-#} Arrow Build Report for Job {{ report.job.branch }} See https://s3.amazonaws.com/arrow-data/index.html for more information. @@ -58,4 +52,4 @@ Succeeded Tasks: - {{ task_name }} {{ report.task_url(task) }} {% endfor %} -{%- endif -%} \ No newline at end of file +{%- endif -%} diff --git a/dev/archery/archery/templates/email_token_expiration.txt.j2 b/dev/archery/archery/templates/email_token_expiration.txt.j2 index 54c2005e57e..340cb4a5353 100644 --- a/dev/archery/archery/templates/email_token_expiration.txt.j2 +++ b/dev/archery/archery/templates/email_token_expiration.txt.j2 @@ -15,12 +15,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#} -From: {{ sender_name }} <{{ sender_email }}> -To: {{ recipient_email }} -Subject: [CI] Arrow Crossbow Token Expiration in {{ report.token_expiration_date }} - +-#} The Arrow Crossbow Token will expire in {{ report.days_left }} days. -Please generate a new Token. Send it to Apache INFRA to update the CROSSBOW_GITHUB_TOKEN. -Update it on the crossbow repository and in the Azure pipelines. +Please generate a new Token. Send it to Apache INFRA to update the +CROSSBOW_GITHUB_TOKEN. Update it on the crossbow repository and in +the Azure pipelines. diff --git a/dev/archery/archery/templates/email_workflow_report.txt.j2 b/dev/archery/archery/templates/email_workflow_report.txt.j2 index 193856c1806..6668d6c67ee 100644 --- a/dev/archery/archery/templates/email_workflow_report.txt.j2 +++ b/dev/archery/archery/templates/email_workflow_report.txt.j2 @@ -15,13 +15,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-#} -{%- if True -%} -{%- endif -%} -From: {{ sender_name }} <{{ sender_email }}> -To: {{ recipient_email }} -Subject: [{{ report.datetime.strftime('%Y-%m-%d') }}] Arrow Build Report for {{ report.name }}: {{ report.failed_jobs() | length }} failed - +-#} Arrow Build Report for {{ report.name }} Workflow URL: {{ report.url }} @@ -42,4 +36,4 @@ Succeeded Jobs: - {{ job.name }} {{ job.url }} {% endfor %} -{%- endif -%} \ No newline at end of file +{%- endif -%} From 4fe8f42944c413cdc322469dd76d29bfa51e19a4 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 16 Jan 2026 17:44:54 +0900 Subject: [PATCH 12/36] GH-48861: [CI] Fix wrong `smtplib.SMTP.send_message` usage (#48876) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change https://docs.python.org/3/library/smtplib.html#smtplib.SMTP.sendmail uses the third argument as e-mail content but https://docs.python.org/3/library/smtplib.html#smtplib.SMTP.send_message uses the first argument as e-mail. ### What changes are included in this PR? * Pass e-mail as the first argument * Remove redundant from and to addresses * They are extracted from the given e-mail automatically ### Are these changes tested? Yes. I sent a test e-mail manually. ### Are there any user-facing changes? No. * GitHub Issue: #48861 Authored-by: Sutou Kouhei Signed-off-by: Raúl Cumplido --- dev/archery/archery/crossbow/reports.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/archery/archery/crossbow/reports.py b/dev/archery/archery/crossbow/reports.py index a2c0487a2b1..1c6510ea4f3 100644 --- a/dev/archery/archery/crossbow/reports.py +++ b/dev/archery/archery/crossbow/reports.py @@ -264,7 +264,7 @@ def send_email(cls, smtp_user, smtp_password, smtp_server, smtp_port, smtp.starttls() smtp.login(smtp_user, smtp_password) message = report.render() - smtp.send_message(smtp_user, report.recipient_email, message) + smtp.send_message(message) @classmethod def write_csv(cls, report, add_headers=True): From 8abac092ca4270a391093177a84a3729b5a3d92a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 20 Jan 2026 15:32:01 +0100 Subject: [PATCH 13/36] GH-48894: [Python][C++] Use base Azure::Core::RequestFailedException instead of final Azure::Storage::StorageException and set minimum nodejs on conda env to 16 for Azurite to work (#48895) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change nodejs 12 is currently being installed on conda. CI jobs are failing and or segfaulting due to azurite failing with old versions. ``` 2026-01-13T18:32:39.6961900Z #15 [ 9/11] RUN /arrow/ci/scripts/install_azurite.sh 2026-01-13T18:32:39.9624124Z #15 0.417 Node.js version = v12.4.0 2026-01-13T18:32:42.2087322Z #15 2.663 npm WARN deprecated rimraf@ 3.0.2: Rimraf versions prior to v4 are no longer supported 2026-01-13T18:32:42.3917601Z #15 2.846 npm WARN deprecated uuid@ 3.4.0: Please upgrade to version 7 or higher. Older versions may use Math.random() in certain circumstances, which is known to be problematic. See https://v8.dev/blog/math-random for details. 2026-01-13T18:32:51.4870197Z #15 11.94 npm WARN deprecated glob@ 7.2.3: Glob versions prior to v9 are no longer supported 2026-01-13T18:32:51.7035681Z #15 12.01 npm WARN deprecated inflight@ 1.0.6: This module is not supported, and leaks memory. Do not use it. 
Check out lru-cache if you want a good and tested way to coalesce async requests by a key value, which is much more comprehensive and powerful.
2026-01-13T18:33:02.1406491Z #15 22.59 /opt/conda/envs/arrow/bin/azurite -> /opt/conda/envs/arrow/lib/node_modules/azurite/dist/src/azurite.js
2026-01-13T18:33:02.3841290Z #15 22.60 /opt/conda/envs/arrow/bin/azurite-queue -> /opt/conda/envs/arrow/lib/node_modules/azurite/dist/src/queue/main.js
2026-01-13T18:33:02.3842792Z #15 22.60 /opt/conda/envs/arrow/bin/azurite-blob -> /opt/conda/envs/arrow/lib/node_modules/azurite/dist/src/blob/main.js
2026-01-13T18:33:02.3844216Z #15 22.60 /opt/conda/envs/arrow/bin/azurite-table -> /opt/conda/envs/arrow/lib/node_modules/azurite/dist/src/table/main.js
2026-01-13T18:33:02.3846002Z #15 22.66 npm WARN applicationinsights@2.9.8 requires a peer of applicationinsights-native-metrics@* but none is installed. You must install peer dependencies yourself.
2026-01-13T18:33:02.3847278Z #15 22.66
2026-01-13T18:33:02.3847564Z #15 22.66 + azurite@3.35.0
2026-01-13T18:33:02.3848038Z #15 22.66 added 376 packages from 296 contributors in 20.644s
2026-01-13T18:33:02.3848830Z #15 22.69 /opt/conda/envs/arrow/bin/azurite
2026-01-13T18:33:02.8929329Z #15 23.35 /opt/conda/envs/arrow/lib/node_modules/azurite/node_modules/fs-extra/lib/util/async.js:14
2026-01-13T18:33:02.8930231Z #15 23.35 (err) => err ?? new Error('unknown error')
2026-01-13T18:33:02.8930740Z #15 23.35 ^
```

The job on PyArrow was segfaulting due to an exception being thrown but not caught. In general we were using `Azure::Storage::StorageException`, but `Azure::Core::Http::TransportException` could also be thrown in some cases. Both are final but inherit from `Azure::Core::RequestFailedException`.

### What changes are included in this PR?

- Pin the minimum nodejs version to 16 so the failure doesn't happen again.
- Update catching `Azure::Storage::StorageException` to `Azure::Core::RequestFailedException` so `Azure::Core::Http::TransportException` is also caught.

### Are these changes tested?

Yes, on CI.

### Are there any user-facing changes?

No

* GitHub Issue: #48894

Authored-by: Raúl Cumplido
Signed-off-by: Raúl Cumplido
---
 ci/conda_env_cpp.txt                |  2 +-
 cpp/src/arrow/filesystem/azurefs.cc | 83 +++++++++++++++--------------
 2 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt
index 18d58f7bb2d..fec8488f954 100644
--- a/ci/conda_env_cpp.txt
+++ b/ci/conda_env_cpp.txt
@@ -39,7 +39,7 @@ lz4-c
 make
 meson
 ninja
-nodejs
+nodejs>=16
 orc<2.1.0
 pkg-config
 python
diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc
index a3a162616ec..6580476d38c 100644
--- a/cpp/src/arrow/filesystem/azurefs.cc
+++ b/cpp/src/arrow/filesystem/azurefs.cc
@@ -558,7 +558,7 @@ Status CrossContainerMoveNotImplemented(const AzureLocation& src,
                "' requires moving data between containers, which is not implemented.");
 }

-bool IsContainerNotFound(const Storage::StorageException& e) {
+bool IsContainerNotFound(const Core::RequestFailedException& e) {
   // In some situations, only the ReasonPhrase is set and the
   // ErrorCode is empty, so we check both.
if (e.ErrorCode == "ContainerNotFound" || @@ -782,7 +782,7 @@ class ObjectInputFile final : public io::RandomAccessFile { content_length_ = properties.Value.BlobSize; metadata_ = PropertiesToMetadata(properties.Value); return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { return PathNotFound(location_); } @@ -864,7 +864,7 @@ class ObjectInputFile final : public io::RandomAccessFile { return blob_client_ ->DownloadTo(reinterpret_cast(out), nbytes, download_options) .Value.ContentRange.Length.Value(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "DownloadTo from '", blob_client_->GetUrl(), "' at position ", position, " for ", nbytes, @@ -916,7 +916,7 @@ class ObjectInputFile final : public io::RandomAccessFile { Status CreateEmptyBlockBlob(const Blobs::BlockBlobClient& block_blob_client) { try { block_blob_client.UploadFrom(nullptr, 0); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "UploadFrom failed for '", block_blob_client.GetUrl(), "'. There is no existing blob at this location or the existing blob must be " @@ -929,7 +929,7 @@ Result GetBlockList( std::shared_ptr block_blob_client) { try { return block_blob_client->GetBlockList().Value; - } catch (Storage::StorageException& exception) { + } catch (Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "GetBlockList failed for '", block_blob_client->GetUrl(), "'. Cannot write to a file without first fetching the existing block list."); @@ -945,7 +945,7 @@ Status CommitBlockList(std::shared_ptr block_bl // previously committed blocks. // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block-list?tabs=microsoft-entra-id#request-body block_blob_client->CommitBlockList(block_ids, options); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "CommitBlockList failed for '", block_blob_client->GetUrl(), "'. Committing is required to flush an output/append stream."); @@ -957,7 +957,7 @@ Status StageBlock(Blobs::BlockBlobClient* block_blob_client, const std::string& Core::IO::MemoryBodyStream& content) { try { block_blob_client->StageBlock(id, content); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "StageBlock failed for '", block_blob_client->GetUrl(), "' new_block_id: '", id, @@ -1023,7 +1023,7 @@ class ObjectAppendStream final : public io::OutputStream { } content_length_ = properties.Value.BlobSize; pos_ = content_length_; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { // No file exists but on flat namespace its possible there is a directory // marker or an implied directory. Ensure there is no directory before starting @@ -1366,7 +1366,7 @@ Result CheckIfHierarchicalNamespaceIsEnabled( // Azurite issue detected. 
DCHECK(IsDfsEmulator(options)); return HNSSupport::kDisabled; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { // Flat namespace storage accounts with "soft delete" enabled return // // "Conflict - This endpoint does not support BlobStorageEvents @@ -1400,9 +1400,6 @@ Result CheckIfHierarchicalNamespaceIsEnabled( "Check for Hierarchical Namespace support on '", adlfs_client.GetUrl(), "' failed."); } - } catch (const Azure::Core::Http::TransportException& exception) { - return ExceptionToStatus(exception, "Check for Hierarchical Namespace support on '", - adlfs_client.GetUrl(), "' failed."); } catch (const std::exception& exception) { return Status::UnknownError( "Check for Hierarchical Namespace support on '", adlfs_client.GetUrl(), @@ -1436,7 +1433,7 @@ Result GetContainerPropsAsFileInfo(const AzureLocation& location, info.set_type(FileType::Directory); info.set_mtime(std::chrono::system_clock::time_point{properties.Value.LastModified}); return info; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { info.set_type(FileType::NotFound); return info; @@ -1452,7 +1449,7 @@ Status CreateContainerIfNotExists(const std::string& container_name, try { container_client.CreateIfNotExists(); return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to create a container: ", container_name, ": ", container_client.GetUrl()); } @@ -1545,7 +1542,7 @@ class LeaseGuard { DCHECK(release_attempt_pending_); try { lease_client_->Release(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to release the ", lease_client_->GetLeaseId(), " lease"); } @@ -1588,7 +1585,7 @@ class LeaseGuard { break_or_expires_at_ = std::min(break_or_expires_at_, SteadyClock::now() + break_period.ValueOr(std::chrono::seconds{0})); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to break the ", lease_client_->GetLeaseId(), " lease expiring in ", remaining_time_ms().count(), "ms"); @@ -1783,7 +1780,7 @@ class AzureFileSystem::Impl { info.set_mtime( std::chrono::system_clock::time_point{properties.Value.LastModified}); return info; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { return FileInfo{location.all, FileType::NotFound}; } @@ -1858,7 +1855,7 @@ class AzureFileSystem::Impl { } info.set_type(FileType::NotFound); return info; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { return FileInfo{location.all, FileType::NotFound}; } @@ -1918,7 +1915,7 @@ class AzureFileSystem::Impl { RETURN_NOT_OK(on_container(container)); } } - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to list account containers."); } return Status::OK(); @@ -1973,7 +1970,7 @@ class AzureFileSystem::Impl { } } } - } catch (const Storage::StorageException& exception) { + } catch (const 
Core::RequestFailedException& exception) { if (IsContainerNotFound(exception) || exception.ErrorCode == "PathNotFound") { found = false; } else { @@ -2086,7 +2083,7 @@ class AzureFileSystem::Impl { RETURN_NOT_OK(process_prefix(list_response.BlobPrefixes[blob_prefix_index])); } } - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { found = false; } else { @@ -2225,7 +2222,7 @@ class AzureFileSystem::Impl { if (container_info.type() == FileType::NotFound) { try { container_client.CreateIfNotExists(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to create directory '", location.all, "': ", container_client.GetUrl()); } @@ -2252,7 +2249,7 @@ class AzureFileSystem::Impl { const auto& nonexistent_location = nonexistent_locations[i - 1]; try { create_if_not_exists(container_client, nonexistent_location); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to create directory '", location.all, "': ", container_client.GetUrl()); } @@ -2270,7 +2267,7 @@ class AzureFileSystem::Impl { try { create_if_not_exists(container_client, location); return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { auto parent = location.parent(); return PathNotFound(parent); @@ -2378,7 +2375,7 @@ class AzureFileSystem::Impl { try { EnsureEmptyDirExistsImplThatThrows(container_client, location.path); return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, operation_name, " failed to ensure empty directory marker '", location.path, "' exists in container: ", container_client.GetUrl()); @@ -2396,7 +2393,7 @@ class AzureFileSystem::Impl { // Only the "*IfExists" functions ever set Deleted to false. // All the others either succeed or throw an exception. DCHECK(response.Value.Deleted); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { return PathNotFound(location); } @@ -2492,7 +2489,7 @@ class AzureFileSystem::Impl { if (!deferred_responses.empty()) { container_client.SubmitBatch(batch); } - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to delete blobs in a directory: ", location.path, ": ", container_client.GetUrl()); } @@ -2502,7 +2499,7 @@ class AzureFileSystem::Impl { try { auto delete_result = deferred_response.GetResponse(); success = delete_result.Value.Deleted; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { success = false; } if (!success) { @@ -2521,7 +2518,7 @@ class AzureFileSystem::Impl { } } return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to list blobs in a directory: ", location.path, ": ", container_client.GetUrl()); @@ -2557,7 +2554,7 @@ class AzureFileSystem::Impl { // Only the "*IfExists" functions ever set Deleted to false. 
// All the others either succeed or throw an exception. DCHECK(response.Value.Deleted); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.ErrorCode == "FilesystemNotFound" || exception.ErrorCode == "PathNotFound") { if (require_dir_to_exist) { @@ -2584,7 +2581,7 @@ class AzureFileSystem::Impl { auto sub_directory_client = adlfs_client.GetDirectoryClient(path.Name); try { sub_directory_client.DeleteRecursive(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "Failed to delete a sub directory: ", location.container, kDelimiter, path.Name, ": ", sub_directory_client.GetUrl()); @@ -2596,7 +2593,7 @@ class AzureFileSystem::Impl { auto sub_file_client = adlfs_client.GetFileClient(path.Name); try { sub_file_client.Delete(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "Failed to delete a sub file: ", location.container, kDelimiter, path.Name, ": ", sub_file_client.GetUrl()); @@ -2605,7 +2602,7 @@ class AzureFileSystem::Impl { } } return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (missing_dir_ok && exception.StatusCode == Http::HttpStatusCode::NotFound) { return Status::OK(); } @@ -2634,7 +2631,7 @@ class AzureFileSystem::Impl { try { [[maybe_unused]] auto result = lease_client->Acquire(lease_duration); DCHECK_EQ(result.Value.LeaseId, lease_client->GetLeaseId()); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { if (allow_missing_container) { return nullptr; @@ -2674,7 +2671,7 @@ class AzureFileSystem::Impl { try { [[maybe_unused]] auto result = lease_client->Acquire(lease_duration); DCHECK_EQ(result.Value.LeaseId, lease_client->GetLeaseId()); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { if (allow_missing) { return nullptr; @@ -2749,7 +2746,7 @@ class AzureFileSystem::Impl { // Only the "*IfExists" functions ever set Deleted to false. // All the others either succeed or throw an exception. DCHECK(response.Value.Deleted); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { // ErrorCode can be "FilesystemNotFound", "PathNotFound"... if (require_file_to_exist) { @@ -2841,7 +2838,7 @@ class AzureFileSystem::Impl { // Only the "*IfExists" functions ever set Deleted to false. // All the others either succeed or throw an exception. 
DCHECK(response.Value.Deleted);
-    } catch (const Storage::StorageException& exception) {
+    } catch (const Core::RequestFailedException& exception) {
       if (exception.StatusCode == Http::HttpStatusCode::NotFound) {
         return check_if_location_exists_as_dir();
       }
@@ -2906,7 +2903,7 @@ class AzureFileSystem::Impl {
       if (!dest_is_empty) {
         return NotEmpty(dest);
       }
-    } catch (const Storage::StorageException& exception) {
+    } catch (const Core::RequestFailedException& exception) {
       return ExceptionToStatus(exception, "Failed to check that '", dest.container,
                                "' is empty: ", dest_container_client.GetUrl());
     }
@@ -2936,6 +2933,10 @@ class AzureFileSystem::Impl {
         return ExceptionToStatus(exception, "Failed to rename container '", src.container,
                                  "' to '", dest.container,
                                  "': ", blob_service_client_->GetUrl());
+      } catch (const Core::RequestFailedException& exception) {
+        return ExceptionToStatus(exception, "Failed to rename container '", src.container,
+                                 "' to '", dest.container,
+                                 "': ", blob_service_client_->GetUrl());
       }
     } else if (dest_is_empty) {
       // Even if we deleted the empty dest.container, RenameBlobContainer() would still
@@ -2972,11 +2973,11 @@ class AzureFileSystem::Impl {
         src_lease_guard.BreakBeforeDeletion(kTimeNeededForContainerDeletion);
         src_container_client.Delete(options);
         src_lease_guard.Forget();
-      } catch (const Storage::StorageException& exception) {
+      } catch (const Core::RequestFailedException& exception) {
        return ExceptionToStatus(exception, "Failed to delete empty container: '",
                                 src.container, "': ", src_container_client.GetUrl());
       }
-    } catch (const Storage::StorageException& exception) {
+    } catch (const Core::RequestFailedException& exception) {
      return ExceptionToStatus(exception, "Unable to replace empty container: '",
                               dest.all, "': ", dest_container_client.GetUrl());
     }
@@ -3117,7 +3118,7 @@ class AzureFileSystem::Impl {
       src_lease_guard.BreakBeforeDeletion(kTimeNeededForFileOrDirectoryRename);
       src_adlfs_client.RenameFile(src_path, dest_path, options);
       src_lease_guard.Forget();
-    } catch (const Storage::StorageException& exception) {
+    } catch (const Core::RequestFailedException& exception) {
      // https://learn.microsoft.com/en-gb/rest/api/storageservices/datalakestoragegen2/path/create
       if (exception.StatusCode == Http::HttpStatusCode::NotFound) {
         if (exception.ErrorCode == "PathNotFound") {

From 288de2f8a4a53708a94439910a516a7f4098960d Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Wed, 21 Jan 2026 09:21:31 +0100
Subject: [PATCH 14/36] GH-48858: [C++][Parquet] Avoid re-serializing footer
 for signature verification (#48859)

### Rationale for this change

When reading an encrypted Parquet file with a plaintext footer, the Parquet reader is able to verify footer integrity by comparing the signature in the file with the one computed by encrypting the footer.

However, the way it does this is to first re-serialize the deserialized footer using Thrift. This has several issues:
1. it's inefficient
2. it's not obvious that it will always produce the same Thrift encoding as the original, leading to spurious signature verification failures
3. if the original footer deserializes to invalid enum values, attempting to serialize it again will lead to undefined behavior

Reason 3 is what allowed this to be uncovered by OSS-Fuzz (see https://oss-fuzz.com/testcase-detail/4740205688193024).

This PR switches to reusing the original serialized metadata.

### Are these changes tested?

Yes, by existing tests and a new fuzz regression file.

### Are there any user-facing changes?

No.
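For illustration, here is the layout the new verification relies on: in plaintext-footer mode the serialized footer bytes are followed by a 28-byte "signature" that is simply the AES-GCM nonce concatenated with the authentication tag. A minimal, compilable sketch of that decomposition (the names below are local to this sketch, not the Arrow API):

```
#include <cstdint>
#include <span>

constexpr size_t kNonceLength = 12;   // AES-GCM nonce size in Parquet modular encryption
constexpr size_t kGcmTagLength = 16;  // AES-GCM authentication tag size

struct FooterSignature {
  std::span<const uint8_t> nonce;  // reused to re-encrypt the original footer bytes
  std::span<const uint8_t> tag;    // compared against the tag of that re-encryption
};

// Split the trailing signature bytes; a size mismatch indicates a corrupted file.
inline bool SplitFooterSignature(std::span<const uint8_t> signature, FooterSignature* out) {
  if (signature.size() != kNonceLength + kGcmTagLength) return false;
  out->nonce = signature.subspan(0, kNonceLength);
  out->tag = signature.subspan(kNonceLength);
  return true;
}
```

Verification then re-encrypts the original serialized bytes with that nonce and compares the resulting tag, which avoids the Thrift round-trip entirely.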
* GitHub Issue: #48858 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/parquet/file_reader.cc | 124 +++++++++++++++------------------ cpp/src/parquet/metadata.cc | 36 ++++++++++ cpp/src/parquet/metadata.h | 10 ++- 3 files changed, 99 insertions(+), 71 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index b246feaf732..af7ccfd7ad7 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -55,6 +55,10 @@ using arrow::internal::AddWithOverflow; namespace parquet { +using ::arrow::Future; +using ::arrow::Result; +using ::arrow::Status; + namespace { bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) { // Check the encoding_stats to see if all data pages are dictionary encoded. @@ -398,7 +402,7 @@ class SerializedFile : public ParquetFileReader::Contents { PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges)); } - ::arrow::Result> GetReadRanges( + Result> GetReadRanges( const std::vector& row_groups, const std::vector& column_indices, int64_t hole_size_limit, int64_t range_size_limit) { std::vector<::arrow::io::ReadRange> ranges; @@ -413,10 +417,10 @@ class SerializedFile : public ParquetFileReader::Contents { range_size_limit); } - ::arrow::Future<> WhenBuffered(const std::vector& row_groups, - const std::vector& column_indices) const { + Future<> WhenBuffered(const std::vector& row_groups, + const std::vector& column_indices) const { if (!cached_source_) { - return ::arrow::Status::Invalid("Must call PreBuffer before WhenBuffered"); + return Status::Invalid("Must call PreBuffer before WhenBuffered"); } std::vector<::arrow::io::ReadRange> ranges; for (int row : row_groups) { @@ -465,23 +469,8 @@ class SerializedFile : public ParquetFileReader::Contents { // Fall through } - const uint32_t read_metadata_len = ParseUnencryptedFileMetadata( - metadata_buffer, metadata_len, std::move(file_decryptor)); - auto file_decryption_properties = properties_.file_decryption_properties(); - if (is_encrypted_footer) { - // Nothing else to do here. - return; - } else if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file. - if (file_decryption_properties != nullptr) { - if (!file_decryption_properties->plaintext_files_allowed()) { - throw ParquetException("Applying decryption properties on plaintext file"); - } - } - } else { - // Encrypted file with plaintext footer mode. - ParseMetaDataOfEncryptedFileWithPlaintextFooter( - file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len); - } + ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, is_encrypted_footer, + std::move(file_decryptor)); } // Validate the source size and get the initial read size. @@ -522,16 +511,15 @@ class SerializedFile : public ParquetFileReader::Contents { } // Does not throw. 
- ::arrow::Future<> ParseMetaDataAsync() { + Future<> ParseMetaDataAsync() { int64_t footer_read_size; BEGIN_PARQUET_CATCH_EXCEPTIONS footer_read_size = GetFooterReadSize(); END_PARQUET_CATCH_EXCEPTIONS // Assumes this is kept alive externally return source_->ReadAsync(source_size_ - footer_read_size, footer_read_size) - .Then([this, - footer_read_size](const std::shared_ptr<::arrow::Buffer>& footer_buffer) - -> ::arrow::Future<> { + .Then([this, footer_read_size]( + const std::shared_ptr<::arrow::Buffer>& footer_buffer) -> Future<> { uint32_t metadata_len; BEGIN_PARQUET_CATCH_EXCEPTIONS metadata_len = ParseFooterLength(footer_buffer, footer_read_size); @@ -557,7 +545,7 @@ class SerializedFile : public ParquetFileReader::Contents { } // Continuation - ::arrow::Future<> ParseMaybeEncryptedMetaDataAsync( + Future<> ParseMaybeEncryptedMetaDataAsync( std::shared_ptr<::arrow::Buffer> footer_buffer, std::shared_ptr<::arrow::Buffer> metadata_buffer, int64_t footer_read_size, uint32_t metadata_len) { @@ -580,26 +568,30 @@ class SerializedFile : public ParquetFileReader::Contents { file_decryptor = std::move(file_decryptor)]( const std::shared_ptr<::arrow::Buffer>& metadata_buffer) { // Continue and read the file footer - return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer, - file_decryptor); + BEGIN_PARQUET_CATCH_EXCEPTIONS + ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer, + file_decryptor); + END_PARQUET_CATCH_EXCEPTIONS + return Status::OK(); }); } - return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, - is_encrypted_footer, std::move(file_decryptor)); + BEGIN_PARQUET_CATCH_EXCEPTIONS + ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, is_encrypted_footer, + std::move(file_decryptor)); + END_PARQUET_CATCH_EXCEPTIONS + return Status::OK(); } // Continuation - ::arrow::Status ParseMetaDataFinal( - std::shared_ptr<::arrow::Buffer> metadata_buffer, uint32_t metadata_len, - const bool is_encrypted_footer, - std::shared_ptr file_decryptor) { - BEGIN_PARQUET_CATCH_EXCEPTIONS + void ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer, + uint32_t metadata_len, const bool is_encrypted_footer, + std::shared_ptr file_decryptor) { const uint32_t read_metadata_len = ParseUnencryptedFileMetadata( metadata_buffer, metadata_len, std::move(file_decryptor)); auto file_decryption_properties = properties_.file_decryption_properties(); if (is_encrypted_footer) { // Nothing else to do here. - return ::arrow::Status::OK(); + return; } else if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file. if (file_decryption_properties != nullptr) { if (!file_decryption_properties->plaintext_files_allowed()) { @@ -611,8 +603,6 @@ class SerializedFile : public ParquetFileReader::Contents { ParseMetaDataOfEncryptedFileWithPlaintextFooter( file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len); } - END_PARQUET_CATCH_EXCEPTIONS - return ::arrow::Status::OK(); } private: @@ -707,20 +697,16 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( auto file_decryptor = std::make_shared( file_decryption_properties, file_aad, algo.algorithm, file_metadata_->footer_signing_key_metadata(), properties_.memory_pool()); - // set the InternalFileDecryptor in the metadata as well, as it's used - // for signature verification and for ColumnChunkMetaData creation. 
- file_metadata_->set_file_decryptor(std::move(file_decryptor)); + // Set the InternalFileDecryptor in the metadata as well, as it's used + // for ColumnChunkMetaData creation. + file_metadata_->set_file_decryptor(file_decryptor); if (file_decryption_properties->check_plaintext_footer_integrity()) { - if (metadata_len - read_metadata_len != - (parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) { - throw ParquetInvalidOrCorruptedFileException( - "Failed reading metadata for encryption signature (requested ", - parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength, - " bytes but have ", metadata_len - read_metadata_len, " bytes)"); - } - - if (!file_metadata_->VerifySignature(metadata_buffer->data() + read_metadata_len)) { + auto serialized_metadata = + metadata_buffer->span_as().subspan(0, read_metadata_len); + auto signature = metadata_buffer->span_as().subspan(read_metadata_len); + if (!FileMetaData::VerifySignature(serialized_metadata, signature, + file_decryptor.get())) { throw ParquetInvalidOrCorruptedFileException( "Parquet crypto signature verification failed"); } @@ -804,7 +790,7 @@ std::unique_ptr ParquetFileReader::Contents::Open( return result; } -::arrow::Future> +Future> ParquetFileReader::Contents::OpenAsync(std::shared_ptr source, const ReaderProperties& props, std::shared_ptr metadata) { @@ -815,7 +801,7 @@ ParquetFileReader::Contents::OpenAsync(std::shared_ptr source, if (metadata == nullptr) { // TODO(ARROW-12259): workaround since we have Future<(move-only type)> struct { - ::arrow::Result> operator()() { + Result> operator()() { return std::move(result); } @@ -825,7 +811,7 @@ ParquetFileReader::Contents::OpenAsync(std::shared_ptr source, return file->ParseMetaDataAsync().Then(std::move(Continuation)); } else { file->set_metadata(std::move(metadata)); - return ::arrow::Future>::MakeFinished( + return Future>::MakeFinished( std::move(result)); } END_PARQUET_CATCH_EXCEPTIONS @@ -855,24 +841,24 @@ std::unique_ptr ParquetFileReader::OpenFile( return Open(std::move(source), props, std::move(metadata)); } -::arrow::Future> ParquetFileReader::OpenAsync( +Future> ParquetFileReader::OpenAsync( std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props, std::shared_ptr metadata) { BEGIN_PARQUET_CATCH_EXCEPTIONS auto fut = SerializedFile::OpenAsync(std::move(source), props, std::move(metadata)); // TODO(ARROW-12259): workaround since we have Future<(move-only type)> - auto completed = ::arrow::Future>::Make(); - fut.AddCallback([fut, completed]( - const ::arrow::Result>& - contents) mutable { - if (!contents.ok()) { - completed.MarkFinished(contents.status()); - return; - } - std::unique_ptr result = std::make_unique(); - result->Open(fut.MoveResult().MoveValueUnsafe()); - completed.MarkFinished(std::move(result)); - }); + auto completed = Future>::Make(); + fut.AddCallback( + [fut, completed]( + const Result>& contents) mutable { + if (!contents.ok()) { + completed.MarkFinished(contents.status()); + return; + } + std::unique_ptr result = std::make_unique(); + result->Open(fut.MoveResult().MoveValueUnsafe()); + completed.MarkFinished(std::move(result)); + }); return completed; END_PARQUET_CATCH_EXCEPTIONS } @@ -919,7 +905,7 @@ void ParquetFileReader::PreBuffer(const std::vector& row_groups, file->PreBuffer(row_groups, column_indices, ctx, options); } -::arrow::Result> ParquetFileReader::GetReadRanges( +Result> ParquetFileReader::GetReadRanges( const std::vector& row_groups, const std::vector& column_indices, int64_t 
hole_size_limit, int64_t range_size_limit) { // Access private methods here @@ -929,8 +915,8 @@ ::arrow::Result> ParquetFileReader::GetReadR range_size_limit); } -::arrow::Future<> ParquetFileReader::WhenBuffered( - const std::vector& row_groups, const std::vector& column_indices) const { +Future<> ParquetFileReader::WhenBuffered(const std::vector& row_groups, + const std::vector& column_indices) const { // Access private methods here SerializedFile* file = ::arrow::internal::checked_cast(contents_.get()); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 42dd8e52ee9..03a8a4c4604 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1169,6 +1169,42 @@ void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, return impl_->WriteTo(dst, encryptor); } +bool FileMetaData::VerifySignature(std::span serialized_metadata, + std::span signature, + InternalFileDecryptor* file_decryptor) { + DCHECK_NE(file_decryptor, nullptr); + + // In plaintext footer, the "signature" is the concatenation of the nonce used + // for GCM encryption, and the authentication tag obtained after GCM encryption. + if (signature.size() != encryption::kGcmTagLength + encryption::kNonceLength) { + throw ParquetInvalidOrCorruptedFileException( + "Invalid footer encryption signature (expected ", + encryption::kGcmTagLength + encryption::kNonceLength, " bytes, got ", + signature.size(), ")"); + } + + // Encrypt plaintext serialized metadata so as to compute its signature + auto nonce = signature.subspan(0, encryption::kNonceLength); + auto tag = signature.subspan(encryption::kNonceLength); + const SecureString& key = file_decryptor->GetFooterKey(); + const std::string& aad = encryption::CreateFooterAad(file_decryptor->file_aad()); + + auto aes_encryptor = encryption::AesEncryptor::Make( + file_decryptor->algorithm(), static_cast(key.size()), /*metadata=*/true, + /*write_length=*/false); + + std::shared_ptr encrypted_buffer = + AllocateBuffer(file_decryptor->pool(), + aes_encryptor->CiphertextLength(serialized_metadata.size())); + int32_t encrypted_len = aes_encryptor->SignedFooterEncrypt( + serialized_metadata, key.as_span(), str2span(aad), nonce, + encrypted_buffer->mutable_span_as()); + DCHECK_EQ(encrypted_len, encrypted_buffer->size()); + // Check computed signature against expected + return 0 == memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength, + tag.data(), encryption::kGcmTagLength); +} + class FileCryptoMetaData::FileCryptoMetaDataImpl { public: FileCryptoMetaDataImpl() = default; diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 3380adbf56a..1235aae9ad7 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -331,8 +332,8 @@ class PARQUET_EXPORT FileMetaData { EncryptionAlgorithm encryption_algorithm() const; const std::string& footer_signing_key_metadata() const; - /// \brief Verify signature of FileMetaData when file is encrypted but footer - /// is not encrypted (plaintext footer). + PARQUET_DEPRECATED( + "Deprecated in 24.0.0. If you need this functionality, please report an issue.") bool VerifySignature(const void* signature); void WriteTo(::arrow::io::OutputStream* dst, @@ -392,6 +393,11 @@ class PARQUET_EXPORT FileMetaData { void set_file_decryptor(std::shared_ptr file_decryptor); const std::shared_ptr& file_decryptor() const; + // Verify the signature of a plaintext footer. 
+  static bool VerifySignature(std::span<const uint8_t> serialized_metadata,
+                              std::span<const uint8_t> signature,
+                              InternalFileDecryptor* file_decryptor);
+
   // PIMPL Idiom
   FileMetaData();
   class FileMetaDataImpl;

From 9b8dc05fa68d3bd53e8ceb9ef14a2382216cbedf Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Wed, 21 Jan 2026 10:16:51 +0100
Subject: [PATCH 15/36] GH-48900: [C++] Avoid memory blowup with excessive
 variadic buffer count in IPC (#48901)

### Rationale for this change

An incorrect variadic buffer count could easily blow up memory when reserving a vector of Buffers, even though the RecordBatch has far fewer buffers available.

Reported by OSS-Fuzz at https://issues.oss-fuzz.com/issues/476180608, and separately by Silas Boch.

### What changes are included in this PR?

Pre-validate the variadic buffer count read from the IPC RecordBatch table.

Initial patch by Silas Boch.

### Are these changes tested?

Yes, by an additional fuzz regression file.

### Are there any user-facing changes?

No.

**This PR contains a "Critical Fix".**

* GitHub Issue: #48900

Authored-by: Antoine Pitrou
Signed-off-by: Antoine Pitrou
---
 cpp/src/arrow/ipc/reader.cc | 18 ++++++++++++------
 testing                     |  2 +-
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index 6a20dbb8c85..8e125fc5ede 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -250,18 +250,24 @@ class ArrayLoader {
     }
   }

-  Result<size_t> GetVariadicCount(int i) {
+  Result<int64_t> GetVariadicCount(int i) {
     auto* variadic_counts = metadata_->variadicBufferCounts();
     CHECK_FLATBUFFERS_NOT_NULL(variadic_counts, "RecordBatch.variadicBufferCounts");
     if (i >= static_cast<int>(variadic_counts->size())) {
       return Status::IOError("variadic_count_index out of range.");
     }
     int64_t count = variadic_counts->Get(i);
-    if (count < 0 || count > std::numeric_limits<int32_t>::max()) {
-      return Status::IOError(
-          "variadic_count must be representable as a positive int32_t, got ", count, ".");
+    if (count < 0) {
+      return Status::IOError("variadic buffer count must be positive");
+    }
+    // Detect an excessive variadic buffer count to avoid potential memory blowup
+    // (GH-48900).
+    const auto max_buffer_count =
+        static_cast<int64_t>(metadata_->buffers()->size()) - buffer_index_;
+    if (count > max_buffer_count) {
+      return Status::IOError("variadic buffer count exceeds available number of buffers");
+    }
-    return static_cast<size_t>(count);
+    return count;
   }

   Status GetFieldMetadata(int field_index, ArrayData* out) {
@@ -398,7 +404,7 @@ class ArrayLoader {
     ARROW_ASSIGN_OR_RAISE(auto data_buffer_count,
                           GetVariadicCount(variadic_count_index_++));
     out_->buffers.resize(data_buffer_count + 2);
-    for (size_t i = 0; i < data_buffer_count; ++i) {
+    for (int64_t i = 0; i < data_buffer_count; ++i) {
       RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i + 2]));
     }
     return Status::OK();
diff --git a/testing b/testing
index 19dda67f485..7b641152dcb 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit 19dda67f485ffb3ffa92f4c6fa083576ef052d58
+Subproject commit 7b641152dcb0f9e197ebe24a1986151849250959

From 4e2752c6b000e62f3e03c6d150614bd0e8017ee4 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Fri, 23 Jan 2026 11:51:35 +0900
Subject: [PATCH 16/36] GH-48885: [C++] Add missing curl dependency of
 `Arrow::arrow_static` CMake target (#48891)

### Rationale for this change

We changed `macro(build_google_cloud_cpp_storage)` to `function(...)` in GH-48333. So `find_curl()` doesn't change `ARROW_SYSTEM_DEPENDENCIES` in the parent scope. (`function()` creates a new scope.)

### What changes are included in this PR?

Move `find_curl()` from inside `function(build_google_cloud_cpp_storage)` to the top level.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.

* GitHub Issue: #48885

Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 cpp/cmake_modules/ThirdpartyToolchain.cmake |  7 ++--
 dev/release/verify-apt.sh                   | 18 +++++++---
 dev/release/verify-yum.sh                   | 38 +++++++++++++++------
 3 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index b95d6491457..e011375f37a 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -3366,10 +3366,6 @@ function(build_google_cloud_cpp_storage)
   # List of dependencies taken from https://github.com/googleapis/google-cloud-cpp/blob/main/doc/packaging.md
   build_crc32c_once()

-  # Curl is required on all platforms, but building it internally might also trip over S3's copy.
-  # For now, force its inclusion from the underlying system or fail.
-  find_curl()
-
   fetchcontent_declare(google_cloud_cpp
                        ${FC_DECLARE_COMMON_OPTIONS}
                        URL ${google_cloud_cpp_storage_SOURCE_URL}
@@ -3453,6 +3449,9 @@ if(ARROW_WITH_GOOGLE_CLOUD_CPP)
     )
   endif()

+  # curl is required on all platforms. We always use system curl to
+  # avoid conflict.
+  find_curl()
   resolve_dependency(google_cloud_cpp_storage PC_PACKAGE_NAMES google_cloud_cpp_storage)
   get_target_property(google_cloud_cpp_storage_INCLUDE_DIR google-cloud-cpp::storage
                       INTERFACE_INCLUDE_DIRECTORIES)
diff --git a/dev/release/verify-apt.sh b/dev/release/verify-apt.sh
index 73a0b156075..ea8be3da15e 100755
--- a/dev/release/verify-apt.sh
+++ b/dev/release/verify-apt.sh
@@ -162,11 +162,19 @@ if [ "${cmake_version_major}" -gt "3" ] || \
    [ "${cmake_version_major}" -eq "3" -a "${cmake_version_minor}" -ge "25" ]; then
   cp -a "${TOP_SOURCE_DIR}/cpp/examples/minimal_build" build/
   pushd build/minimal_build
-  cmake .
-  make -j$(nproc)
-  ./arrow-example
-  c++ -o arrow-example example.cc $(pkg-config --cflags --libs arrow) -std=c++20
-  ./arrow-example
+  cmake -S . -B build_shared
+  make -C build_shared -j$(nproc)
+  build_shared/arrow-example
+  cmake -S . -B build_static -DARROW_LINK_SHARED=OFF
+  make -C build_static -j$(nproc)
+  build_static/arrow-example
+  mkdir -p build_pkg_config
+  c++ \
+    example.cc \
+    -o build_pkg_config/arrow-example \
+    $(pkg-config --cflags --libs arrow) \
+    -std=c++20
+  build_pkg_config/arrow-example
   popd
 fi
 echo "::endgroup::"
diff --git a/dev/release/verify-yum.sh b/dev/release/verify-yum.sh
index d642f806295..684b2166934 100755
--- a/dev/release/verify-yum.sh
+++ b/dev/release/verify-yum.sh
@@ -44,7 +44,7 @@ repository_version="${distribution_version}"

 cmake_package=cmake
 cmake_command=cmake
-devtoolset=
+gcc_toolset=
 scl_package=
 have_arrow_libs=no
 have_flight=yes
@@ -65,11 +65,17 @@ echo "::group::Prepare repository"
 case "${distribution}-${distribution_version}" in
   almalinux-8)
     distribution_prefix="almalinux"
+    gcc_toolset=14
     have_arrow_libs=yes
     ruby_devel_packages+=(redhat-rpm-config)
     install_command="dnf install -y --enablerepo=powertools"
     info_command="dnf info --enablerepo=powertools"
     ;;
+  almalinux-9)
+    distribution_prefix="almalinux"
+    gcc_toolset=12
+    ruby_devel_packages+=(redhat-rpm-config)
+    ;;
   almalinux-*)
     distribution_prefix="almalinux"
     ruby_devel_packages+=(redhat-rpm-config)
@@ -169,11 +175,11 @@ ${install_command} \
   git \
   libarchive \
   pkg-config
-if [ -n "${devtoolset}" ]; then
+if [ -n "${gcc_toolset}" ]; then
   ${install_command} \
-    devtoolset-${devtoolset}-gcc-c++ \
-    devtoolset-${devtoolset}-make
-  . /opt/rh/devtoolset-${devtoolset}/enable
+    gcc-toolset-${gcc_toolset} \
+    make
+  . /opt/rh/gcc-toolset-${gcc_toolset}/enable
 else
   ${install_command} \
     gcc-c++ \
@@ -191,13 +197,25 @@ if [ "${cmake_version_major}" -gt "3" ] || \
    [ "${cmake_version_major}" -eq "3" -a "${cmake_version_minor}" -ge "25" ]; then
   cp -a "${TOP_SOURCE_DIR}/cpp/examples/minimal_build" build/
   pushd build/minimal_build
-  ${cmake_command} .
-  make -j$(nproc)
-  ./arrow-example
-  c++ -o arrow-example example.cc $(pkg-config --cflags --libs arrow) -std=c++2a
-  ./arrow-example
+  cmake -S . -B build_shared
+  make -C build_shared -j$(nproc)
+  build_shared/arrow-example
+  cmake -S . -B build_static -DARROW_LINK_SHARED=OFF
+  make -C build_static -j$(nproc)
+  build_static/arrow-example
+  mkdir -p build_pkg_config
+  c++ \
+    example.cc \
+    -o build_pkg_config/arrow-example \
+    $(pkg-config --cflags --libs arrow) \
+    -std=c++2a
+  build_pkg_config/arrow-example
   popd
 fi
+if [ -n "${gcc_toolset}" ]; then
+  dnf remove -y "gcc-toolset-${gcc_toolset}-*"
+  ${install_command} gcc-c++
+fi
 echo "::endgroup::"

 if [ "${have_glib}" = "yes" ]; then

From d9d72a90a985d8765784fee03a2984ecfa430674 Mon Sep 17 00:00:00 2001
From: Logan Riggs
Date: Sat, 24 Jan 2026 23:45:30 -0800
Subject: [PATCH 17/36] GH-48160: [C++][Gandiva] Pass CPU attributes to LLVM
 (#48161)

### Rationale for this change

The CPU attributes are not passed to the LLVM layer, which means potential optimizations could be missed, leading to inefficient code.

This feature was lost as part of the refactoring in https://github.com/apache/arrow/commit/83cba25017a5c3a03e47f1851f242fa284f93533

I also discovered a bug with decimal alignments that was exposed by this change and was only reproducible in our test environment.

### What changes are included in this PR?

Pass the CPU attributes to the LLVM code generation, and a unit test.

Fix the 16-byte vs 8-byte decimal alignment problem. This sometimes caused a crash on certain architectures with certain queries. Added a unit test.
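For context on why the misalignment crashes, a small standalone illustration (not Gandiva code): a 16-byte integer normally carries a 16-byte alignment requirement, while Arrow only guarantees 8-byte alignment for decimal128 buffers, so an aligned 16-byte load emitted by LLVM can fault:

```
#include <cstdint>
#include <cstdio>

int main() {
  // On typical 64-bit targets a 16-byte integer requires 16-byte alignment.
  printf("alignof(__int128) = %zu\n", alignof(__int128));
  // A decimal128 slot starting at an odd multiple of 8 is valid Arrow data,
  // but reading it with a load that assumes 16-byte alignment is undefined
  // behavior and can crash on strict architectures.
  alignas(16) uint8_t buffer[32];
  auto slot = reinterpret_cast<uintptr_t>(buffer + 8);
  printf("slot %% 16 = %zu\n", static_cast<size_t>(slot % 16));  // prints 8
  return 0;
}
```

This is why the generated code now uses `CreateAlignedLoad`/`CreateAlignedStore` with an explicit 8-byte alignment instead of the natural 16-byte alignment of `i128`.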
### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #48160 Lead-authored-by: Logan Riggs Co-authored-by: lriggs Signed-off-by: Sutou Kouhei --- cpp/src/gandiva/CMakeLists.txt | 1 + cpp/src/gandiva/engine.cc | 1 + cpp/src/gandiva/llvm_generator.cc | 16 +- cpp/src/gandiva/target_datalayout_test.cc | 39 +++ cpp/src/gandiva/tests/CMakeLists.txt | 1 + .../gandiva/tests/decimal_alignment_test.cc | 252 ++++++++++++++++++ 6 files changed, 307 insertions(+), 3 deletions(-) create mode 100644 cpp/src/gandiva/target_datalayout_test.cc create mode 100644 cpp/src/gandiva/tests/decimal_alignment_test.cc diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index e5760243b39..31a86d5da9d 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -274,6 +274,7 @@ add_gandiva_test(internals-test hash_utils_test.cc gdv_function_stubs_test.cc interval_holder_test.cc + target_datalayout_test.cc tests/test_util.cc EXTRA_LINK_LIBS re2::re2 diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index 64ed433a686..a718a800605 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -330,6 +330,7 @@ Engine::Engine(const std::shared_ptr& conf, // LLVM 10 doesn't like the expr function name to be the same as the module name auto module_id = "gdv_module_" + std::to_string(reinterpret_cast(this)); module_ = std::make_unique(module_id, *context_); + module_->setDataLayout(target_machine_->createDataLayout()); } Engine::~Engine() {} diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 4e6480fa167..0f0918b3a1c 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -399,8 +399,13 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, if (output_type_id == arrow::Type::BOOL) { SetPackedBitValue(output_ref, loop_var, output_value->data()); - } else if (arrow::is_primitive(output_type_id) || - output_type_id == arrow::Type::DECIMAL) { + } else if (output_type_id == arrow::Type::DECIMAL) { + // Arrow decimal128 data is only 8-byte aligned, not 16-byte aligned. + // Use CreateAlignedStore with 8-byte alignment to match Arrow's actual alignment. + auto slot_offset = + builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var); + builder->CreateAlignedStore(output_value->data(), slot_offset, llvm::MaybeAlign(8)); + } else if (arrow::is_primitive(output_type_id)) { auto slot_offset = builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var); builder->CreateStore(output_value->data(), slot_offset); @@ -602,7 +607,12 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) { case arrow::Type::DECIMAL: { auto slot_offset = builder->CreateGEP(types->i128_type(), slot_ref, slot_index); - slot_value = builder->CreateLoad(types->i128_type(), slot_offset, dex.FieldName()); + // Arrow decimal128 data is only 8-byte aligned, not 16-byte aligned. + // Using CreateLoad with default alignment (16 for i128) causes crashes on + // misaligned data. Use CreateAlignedLoad with 8-byte alignment to match Arrow's + // actual alignment. 
+ slot_value = builder->CreateAlignedLoad( + types->i128_type(), slot_offset, llvm::MaybeAlign(8), false, dex.FieldName()); lvalue = generator_->BuildDecimalLValue(slot_value, dex.FieldType()); break; } diff --git a/cpp/src/gandiva/target_datalayout_test.cc b/cpp/src/gandiva/target_datalayout_test.cc new file mode 100644 index 00000000000..0b32c6caf96 --- /dev/null +++ b/cpp/src/gandiva/target_datalayout_test.cc @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "gandiva/llvm_generator.h" +#include "gandiva/tests/test_util.h" + +namespace gandiva { + +// Test that verifies the target data layout string representation +// is populated. +TEST(TestTargetDataLayout, VerifyDataLayoutForArchitecture) { + ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(TestConfiguration(), false)); + + llvm::Module* module = generator->module(); + ASSERT_NE(module, nullptr); + + const llvm::DataLayout& data_layout = module->getDataLayout(); + std::string data_layout_str = data_layout.getStringRepresentation(); + + ASSERT_FALSE(data_layout_str.empty()); +} +} // namespace gandiva diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt index 68138f50d81..356b976e005 100644 --- a/cpp/src/gandiva/tests/CMakeLists.txt +++ b/cpp/src/gandiva/tests/CMakeLists.txt @@ -20,6 +20,7 @@ add_gandiva_test(projector-test binary_test.cc boolean_expr_test.cc date_time_test.cc + decimal_alignment_test.cc decimal_single_test.cc decimal_test.cc filter_project_test.cc diff --git a/cpp/src/gandiva/tests/decimal_alignment_test.cc b/cpp/src/gandiva/tests/decimal_alignment_test.cc new file mode 100644 index 00000000000..3028ec81f31 --- /dev/null +++ b/cpp/src/gandiva/tests/decimal_alignment_test.cc @@ -0,0 +1,252 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Test for decimal128 alignment issue fix. +// Arrow decimal128 data may be 8-byte aligned but not 16-byte aligned. +// This test verifies that Gandiva handles such data correctly. 
+ +#include + +#include "arrow/array/array_decimal.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/util/decimal.h" + +#include "gandiva/decimal_type_util.h" +#include "gandiva/projector.h" +#include "gandiva/tests/test_util.h" +#include "gandiva/tree_expr_builder.h" + +using arrow::Decimal128; + +namespace gandiva { + +class TestDecimalAlignment : public ::testing::Test { + public: + void SetUp() { pool_ = arrow::default_memory_pool(); } + + protected: + arrow::MemoryPool* pool_; +}; + +// Create a decimal128 array with data at a specific alignment offset +// This simulates the real-world scenario where Arrow data from external sources +// (like JNI/Java) may not be 16-byte aligned. +std::shared_ptr MakeMisalignedDecimalArray( + const std::shared_ptr& type, + const std::vector& values, int alignment_offset) { + // Allocate buffer with extra space for misalignment + int64_t data_size = values.size() * 16; // 16 bytes per Decimal128 + int64_t buffer_size = data_size + 16; // Extra space for offset + + std::shared_ptr buffer; + ARROW_EXPECT_OK(arrow::AllocateBuffer(buffer_size).Value(&buffer)); + + // Calculate the starting offset to achieve desired alignment + // We want the data to be 8-byte aligned but NOT 16-byte aligned + uint8_t* raw_data = buffer->mutable_data(); + uintptr_t addr = reinterpret_cast(raw_data); + + // Find offset to get to 8-byte aligned but not 16-byte aligned address + int offset_to_8 = (8 - (addr % 8)) % 8; + int current_16_alignment = (addr + offset_to_8) % 16; + + int final_offset; + if (alignment_offset == 8) { + // Want 8-byte aligned but NOT 16-byte aligned + if (current_16_alignment == 0) { + final_offset = offset_to_8 + 8; // Add 8 to break 16-byte alignment + } else { + final_offset = offset_to_8; + } + } else { + // Want 16-byte aligned + final_offset = (16 - (addr % 16)) % 16; + } + + // Copy decimal values to the offset location + uint8_t* data_start = raw_data + final_offset; + for (size_t i = 0; i < values.size(); i++) { + memcpy(data_start + i * 16, values[i].ToBytes().data(), 16); + } + + // Verify alignment + uintptr_t data_addr = reinterpret_cast(data_start); + EXPECT_EQ(data_addr % 8, 0) << "Data should be 8-byte aligned"; + if (alignment_offset == 8) { + EXPECT_NE(data_addr % 16, 0) << "Data should NOT be 16-byte aligned"; + } + + // Create a sliced buffer starting at our offset + auto sliced_buffer = arrow::SliceBuffer(buffer, final_offset, data_size); + + // Create validity buffer (all valid) + std::shared_ptr validity_buffer; + ARROW_EXPECT_OK(arrow::AllocateBuffer((values.size() + 7) / 8).Value(&validity_buffer)); + memset(validity_buffer->mutable_data(), 0xFF, validity_buffer->size()); + + // Create the array with our misaligned data buffer + auto array_data = arrow::ArrayData::Make(type, static_cast(values.size()), + {validity_buffer, sliced_buffer}); + + return std::make_shared(array_data); +} + +// Test that decimal operations work correctly with 8-byte aligned (but not 16-byte +// aligned) data +TEST_F(TestDecimalAlignment, TestMisalignedDecimalSubtract) { + constexpr int32_t precision = 38; + constexpr int32_t scale = 17; + auto decimal_type = std::make_shared(precision, scale); + auto field_a = arrow::field("a", decimal_type); + auto field_b = arrow::field("b", decimal_type); + auto schema = arrow::schema({field_a, field_b}); + + Decimal128TypePtr output_type; + auto status = DecimalTypeUtil::GetResultType( + DecimalTypeUtil::kOpSubtract, 
{decimal_type, decimal_type}, &output_type); + ASSERT_OK(status); + + auto res = arrow::field("res", output_type); + auto node_a = TreeExprBuilder::MakeField(field_a); + auto node_b = TreeExprBuilder::MakeField(field_b); + auto subtract = + TreeExprBuilder::MakeFunction("subtract", {node_a, node_b}, output_type); + auto expr = TreeExprBuilder::MakeExpression(subtract, res); + + std::shared_ptr projector; + status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + ASSERT_OK(status); + + // Create test data + std::vector values_a = {Decimal128(100), Decimal128(200), Decimal128(300)}; + std::vector values_b = {Decimal128(10), Decimal128(20), Decimal128(30)}; + + // Create arrays with 8-byte alignment (but NOT 16-byte aligned) + auto array_a = MakeMisalignedDecimalArray(decimal_type, values_a, 8); + auto array_b = MakeMisalignedDecimalArray(decimal_type, values_b, 8); + + auto in_batch = arrow::RecordBatch::Make(schema, 3, {array_a, array_b}); + + // This should NOT crash even with misaligned data + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + ASSERT_OK(status); + + // Verify results: 100-10=90, 200-20=180, 300-30=270 + auto result = std::dynamic_pointer_cast(outputs[0]); + ASSERT_NE(result, nullptr); + EXPECT_EQ(result->length(), 3); +} + +// Create a misaligned output buffer for decimal128 +std::shared_ptr MakeMisalignedDecimalOutput( + const std::shared_ptr& type, int64_t num_records, + int alignment_offset) { + // Allocate data buffer with extra space for misalignment + int64_t data_size = num_records * 16; // 16 bytes per Decimal128 + int64_t buffer_size = data_size + 16; // Extra space for offset + + std::shared_ptr buffer; + ARROW_EXPECT_OK(arrow::AllocateBuffer(buffer_size).Value(&buffer)); + + uint8_t* raw_data = const_cast(buffer->data()); + uintptr_t addr = reinterpret_cast(raw_data); + + // Find offset to get to 8-byte aligned but not 16-byte aligned address + int offset_to_8 = (8 - (addr % 8)) % 8; + int current_16_alignment = (addr + offset_to_8) % 16; + + int final_offset; + if (alignment_offset == 8) { + if (current_16_alignment == 0) { + final_offset = offset_to_8 + 8; + } else { + final_offset = offset_to_8; + } + } else { + final_offset = (16 - (addr % 16)) % 16; + } + + // Verify alignment + uintptr_t data_addr = reinterpret_cast(raw_data + final_offset); + EXPECT_EQ(data_addr % 8, 0) << "Data should be 8-byte aligned"; + if (alignment_offset == 8) { + EXPECT_NE(data_addr % 16, 0) << "Data should NOT be 16-byte aligned"; + } + + auto sliced_buffer = arrow::SliceBuffer(buffer, final_offset, data_size); + + // Create validity buffer + int64_t bitmap_size = (num_records + 7) / 8; + std::shared_ptr validity_buffer; + ARROW_EXPECT_OK(arrow::AllocateBuffer(bitmap_size).Value(&validity_buffer)); + memset(const_cast(validity_buffer->data()), 0xFF, validity_buffer->size()); + + return arrow::ArrayData::Make(type, num_records, {validity_buffer, sliced_buffer}); +} + +// Test that decimal STORES work correctly with 8-byte aligned (but not 16-byte aligned) +// output +TEST_F(TestDecimalAlignment, TestMisalignedDecimalStore) { + constexpr int32_t precision = 38; + constexpr int32_t scale = 17; + auto decimal_type = std::make_shared(precision, scale); + auto field_a = arrow::field("a", decimal_type); + auto field_b = arrow::field("b", decimal_type); + auto schema = arrow::schema({field_a, field_b}); + + Decimal128TypePtr output_type; + auto status = DecimalTypeUtil::GetResultType( + DecimalTypeUtil::kOpSubtract, 
{decimal_type, decimal_type}, &output_type); + ASSERT_OK(status); + + auto res = arrow::field("res", output_type); + auto node_a = TreeExprBuilder::MakeField(field_a); + auto node_b = TreeExprBuilder::MakeField(field_b); + auto subtract = + TreeExprBuilder::MakeFunction("subtract", {node_a, node_b}, output_type); + auto expr = TreeExprBuilder::MakeExpression(subtract, res); + + std::shared_ptr projector; + status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + ASSERT_OK(status); + + // Create ALIGNED input arrays (using standard Arrow allocation) + auto array_a = MakeArrowArrayDecimal( + decimal_type, {Decimal128(100), Decimal128(200), Decimal128(300)}, + {true, true, true}); + auto array_b = MakeArrowArrayDecimal( + decimal_type, {Decimal128(10), Decimal128(20), Decimal128(30)}, {true, true, true}); + + auto in_batch = arrow::RecordBatch::Make(schema, 3, {array_a, array_b}); + + // Create MISALIGNED output buffer (8-byte aligned but NOT 16-byte aligned) + auto output_data = MakeMisalignedDecimalOutput(output_type, 3, 8); + + // This should NOT crash even with misaligned output buffer + status = projector->Evaluate(*in_batch, {output_data}); + ASSERT_OK(status); + + // Verify the output was written correctly + auto result = std::make_shared(output_data); + EXPECT_EQ(result->length(), 3); +} + +} // namespace gandiva From 8dd6c8ef60e665194d6e21139007de7637fa6958 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sun, 25 Jan 2026 09:42:16 -0600 Subject: [PATCH 18/36] GH-48973: [R][C++] Fix RE2 compilation errors under C++20 (#48976) ### Rationale for this change Fix a building RE2 with C++20 ### What changes are included in this PR? The fix, a test ### Are these changes tested? Yes ### Are there any user-facing changes? No * GitHub Issue: #48973 Authored-by: Jonathan Keane Signed-off-by: Jonathan Keane --- ci/docker/fedora-42-r-clang.dockerfile | 224 ++++++++++++++++++++ compose.yaml | 32 +++ cpp/cmake_modules/ThirdpartyToolchain.cmake | 7 + dev/tasks/tasks.yml | 9 + 4 files changed, 272 insertions(+) create mode 100644 ci/docker/fedora-42-r-clang.dockerfile diff --git a/ci/docker/fedora-42-r-clang.dockerfile b/ci/docker/fedora-42-r-clang.dockerfile new file mode 100644 index 00000000000..9bc970e0609 --- /dev/null +++ b/ci/docker/fedora-42-r-clang.dockerfile @@ -0,0 +1,224 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Fedora 42 container with Clang and R-devel for testing Arrow R package +# Replicates CRAN's r-devel-linux-x86_64-fedora-clang environment +# See: https://www.stats.ox.ac.uk/pub/bdr/Rconfig/r-devel-linux-x86_64-fedora-clang + +ARG arch=amd64 +FROM ${arch}/fedora:42 + +# Install build dependencies +RUN dnf update -y && \ + dnf install -y \ + # Build tools + autoconf \ + automake \ + bzip2 \ + bzip2-devel \ + cmake \ + curl \ + curl-devel \ + diffutils \ + gcc \ + gcc-c++ \ + gcc-gfortran \ + git \ + java-latest-openjdk-devel \ + libicu-devel \ + libtool \ + libuuid-devel \ + libxcrypt-devel \ + lld \ + make \ + ninja-build \ + openssl-devel \ + patch \ + pcre2-devel \ + perl \ + pkgconfig \ + python3 \ + python3-pip \ + readline-devel \ + rsync \ + subversion \ + tar \ + texinfo \ + texlive-collection-basic \ + texlive-collection-latex \ + texlive-collection-latexrecommended \ + texlive-collection-fontsrecommended \ + texlive-inconsolata \ + texlive-parskip \ + texlive-natbib \ + texlive-fancyvrb \ + texlive-framed \ + unzip \ + wget \ + which \ + xz \ + xz-devel \ + zlib-devel \ + # X11 libraries for R + cairo-devel \ + libX11-devel \ + libXmu-devel \ + libXt-devel \ + libcurl-devel \ + libjpeg-turbo-devel \ + libpng-devel \ + libtiff-devel \ + pango-devel \ + tk-devel \ + # Additional R dependencies + libxml2-devel \ + fontconfig-devel \ + freetype-devel \ + fribidi-devel \ + harfbuzz-devel && \ + dnf clean all + +# Install LLVM/Clang from Fedora repos (will be the latest available in Fedora 42) +# Note: CRAN uses Clang 21, but we use whatever is available in Fedora repos +# This should be close enough for testing purposes +RUN dnf install -y \ + clang \ + clang-devel \ + clang-tools-extra \ + compiler-rt \ + flang \ + lld \ + llvm \ + llvm-devel \ + libcxx \ + libcxx-devel \ + libcxxabi \ + libcxxabi-devel \ + libomp \ + libomp-devel && \ + dnf clean all + +# Install locale support +RUN dnf install -y glibc-langpack-en && dnf clean all + +# Set up compiler environment to match CRAN's Fedora Clang configuration +# CRAN uses: -O3 -Wall -pedantic -Wp,-D_FORTIFY_SOURCE=3 +# CRAN's clang is built to use libc++ by default; Fedora's defaults to libstdc++, +# so we must add -stdlib=libc++ explicitly +ENV CC=clang \ + CXX="clang++ -stdlib=libc++" \ + FC=flang-new \ + CFLAGS="-O3 -Wall -pedantic -Wp,-D_FORTIFY_SOURCE=3" \ + CXXFLAGS="-O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" \ + FFLAGS="-O2 -pedantic" \ + LDFLAGS="-fuse-ld=lld" + +# Set locale (glibc-langpack-en must be installed first) +ENV LANG=en_US.UTF-8 \ + LC_ALL=en_US.UTF-8 \ + LC_COLLATE=C \ + TZ=UTC + +# Build R-devel from source to match CRAN's R-devel +ARG r_version=devel +RUN cd /tmp && \ + if [ "$r_version" = "devel" ]; then \ + svn checkout https://svn.r-project.org/R/trunk R-devel && \ + cd R-devel/tools && \ + ./rsync-recommended; \ + else \ + wget -q https://cran.r-project.org/src/base/R-4/R-${r_version}.tar.gz && \ + tar xf R-${r_version}.tar.gz && \ + mv R-${r_version} R-devel; \ + fi && \ + cd /tmp/R-devel && \ + ./configure \ + --prefix=/usr/local \ + --enable-R-shlib \ + --enable-memory-profiling \ + --with-blas \ + --with-lapack \ + --with-x \ + --with-tcltk \ + CC="clang" \ + CXX="clang++ -stdlib=libc++" \ + FC="flang-new" \ + CFLAGS="-O3 -Wall -pedantic -Wp,-D_FORTIFY_SOURCE=3" \ + CXXFLAGS="-O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" \ + FFLAGS="-O2 -pedantic" \ + LDFLAGS="-fuse-ld=lld" && \ + make -j$(nproc) && \ + make install && \ + cd / && \ + rm -rf /tmp/R-devel + 
+# Verify R installation and clang +RUN R --version && clang --version + +# Set CRAN repo +RUN echo 'options(repos = c(CRAN = "https://cran.rstudio.com"))' >> $(R RHOME)/etc/Rprofile.site + +# Install pak for package management +RUN R -q -e 'install.packages("pak", repos = sprintf("https://r-lib.github.io/p/pak/%s/%s/%s/%s", "devel", .Platform$pkgType, R.Version()$os, R.Version()$arch))' + +# Enable automatic system requirements installation +ENV PKG_SYSREQS=true \ + R_PKG_SYSREQS2=true + +# Set up parallel compilation +RUN echo "MAKEFLAGS=-j$(R -s -e 'cat(parallel::detectCores())')" >> $(R RHOME)/etc/Renviron.site + +# Configure R to use clang for package compilation (matching CRAN's Makevars) +# Fedora's clang defaults to libstdc++, so we must specify -stdlib=libc++ +RUN mkdir -p /root/.R && \ + echo "CC = clang" >> /root/.R/Makevars && \ + echo "CXX = clang++ -stdlib=libc++" >> /root/.R/Makevars && \ + echo "CXX11 = clang++ -stdlib=libc++" >> /root/.R/Makevars && \ + echo "CXX14 = clang++ -stdlib=libc++" >> /root/.R/Makevars && \ + echo "CXX17 = clang++ -stdlib=libc++" >> /root/.R/Makevars && \ + echo "CXX20 = clang++ -stdlib=libc++" >> /root/.R/Makevars && \ + echo "FC = flang-new" >> /root/.R/Makevars && \ + echo "CFLAGS = -O3 -Wall -pedantic -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "CXXFLAGS = -O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "CXX11FLAGS = -O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "CXX14FLAGS = -O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "CXX17FLAGS = -O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "CXX20FLAGS = -O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "FFLAGS = -O2 -pedantic" >> /root/.R/Makevars && \ + echo "LDFLAGS = -fuse-ld=lld" >> /root/.R/Makevars + +# Configure image and install Arrow-specific tooling +COPY ci/scripts/r_docker_configure.sh /arrow/ci/scripts/ +COPY ci/etc/rprofile /arrow/ci/etc/ +COPY ci/scripts/r_install_system_dependencies.sh /arrow/ci/scripts/ +COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/r_docker_configure.sh + +# Install sccache +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + +# Install R package dependencies +COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ +COPY r/DESCRIPTION /arrow/r/ +RUN /arrow/ci/scripts/r_deps.sh /arrow + +# Verify setup +RUN R --version && \ + clang --version && \ + R -e "sessionInfo()" diff --git a/compose.yaml b/compose.yaml index 84481e1af76..13f446ff030 100644 --- a/compose.yaml +++ b/compose.yaml @@ -138,6 +138,7 @@ x-hierarchy: - debian-docs - fedora-cpp: - fedora-python + - fedora-r-clang - python-sdist - ubuntu-cpp: - ubuntu-cpp-static @@ -1790,6 +1791,37 @@ services: - .:/arrow:delegated command: /arrow/ci/scripts/r_test.sh /arrow + fedora-r-clang: + # Usage: + # docker compose build fedora-r-clang + # docker compose run fedora-r-clang + # Tests R package on Fedora with Clang, simulating CRAN's + # r-devel-linux-x86_64-fedora-clang environment. + # R-devel is built from source with Clang and uses CRAN's compiler flags. 
+ # See: https://www.stats.ox.ac.uk/pub/bdr/Rconfig/r-devel-linux-x86_64-fedora-clang + # Parameters: + # FEDORA: 42 + # ARCH: amd64 + image: ${REPO}:${ARCH}-fedora-${FEDORA}-r-clang + build: + context: . + dockerfile: ci/docker/fedora-${FEDORA}-r-clang.dockerfile + cache_from: + - ${REPO}:${ARCH}-fedora-${FEDORA}-r-clang + args: + arch: ${ARCH} + shm_size: *shm-size + environment: + <<: [*common, *sccache] + LIBARROW_BINARY: "false" + ARROW_SOURCE_HOME: "/arrow" + ARROW_R_DEV: ${ARROW_R_DEV} + ARROW_USE_PKG_CONFIG: "false" + SKIP_VIGNETTES: "true" + NOT_CRAN: "false" + volumes: *fedora-volumes + command: /arrow/ci/scripts/r_test.sh /arrow + ############################## Integration ################################## conda-integration: diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index e011375f37a..df937cc14cb 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2867,6 +2867,13 @@ function(build_re2) fetchcontent_makeavailable(re2) + # Suppress -Wnested-anon-types warnings from RE2's use of anonymous types + # in anonymous unions (a compiler extension). + # See: https://github.com/apache/arrow/issues/48973 + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + target_compile_options(re2 PRIVATE -Wno-nested-anon-types) + endif() + if(CMAKE_VERSION VERSION_LESS 3.28) set_property(DIRECTORY ${re2_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL TRUE) endif() diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 2667aa1fb5e..931b6da784d 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -658,6 +658,15 @@ tasks: params: image: alpine-linux-r + test-r-fedora-clang: + ci: github + template: docker-tests/github.linux.yml + params: + image: fedora-r-clang + # R-devel built from source with Clang, simulating CRAN's + # r-devel-linux-x86_64-fedora-clang environment + timeout: 180 # 3 hours - R-devel build from source takes time + test-r-macos-as-cran: ci: github template: r/github.macos.cran.yml From 51df780a6e3ad3606a916332c0567a23f841b82d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 26 Jan 2026 11:02:42 +0900 Subject: [PATCH 19/36] GH-48880: [Ruby] Fix a bug that Arrow::ExecutePlan nodes may be GC-ed (#48919) ### Rationale for this change We must mark all nodes in `Arrow::ExecutePlan` but only the first node is marked. ### What changes are included in this PR? Fix typos in variable name. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
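For reference, the GLib list-walking idiom the fix restores — the same cursor variable must be tested and advanced (a sketch, not the actual extension code):

```
#include <glib.h>

// Visit every element once. The bug tested and advanced `nodes` while
// dereferencing `node`, so `node` never moved and only the first element
// was marked on each iteration.
static void mark_all(GList* nodes, void (*mark)(gpointer data)) {
  for (GList* node = nodes; node; node = g_list_next(node)) {
    mark(node->data);
  }
}
```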
* GitHub Issue: #48880

Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 ruby/red-arrow/ext/arrow/arrow.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ruby/red-arrow/ext/arrow/arrow.cpp b/ruby/red-arrow/ext/arrow/arrow.cpp
index 404ec8996f2..0c582d07077 100644
--- a/ruby/red-arrow/ext/arrow/arrow.cpp
+++ b/ruby/red-arrow/ext/arrow/arrow.cpp
@@ -59,7 +59,7 @@ namespace red_arrow {
   {
     auto plan = GARROW_EXECUTE_PLAN(object);
     auto nodes = garrow_execute_plan_get_nodes(plan);
-    for (auto node = nodes; nodes; nodes = g_list_next(nodes)) {
+    for (auto node = nodes; node; node = g_list_next(node)) {
       rbgobj_gc_mark_instance(node->data);
     }
   }

From 6e0387d5be6e082fda5d2a5a3f56223b4765cf44 Mon Sep 17 00:00:00 2001
From: "Alina (Xi) Li" <96995091+alinaliBQ@users.noreply.github.com>
Date: Sun, 25 Jan 2026 18:11:52 -0800
Subject: [PATCH 20/36] GH-48637: [C++][FlightRPC] ODBC: Disable `absl`
 deadlock detection (#48747)

### Rationale for this change

https://github.com/apache/arrow/issues/48637

Arrow Flight SQL ODBC gets a `potential deadlock` error during tests because the error originates inside Arrow Flight SQL (see https://github.com/apache/arrow/issues/48714). Arrow doesn't use `absl::Mutex` directly; `absl::Mutex` is used by the upstream projects gRPC and Protobuf, so Arrow itself likely did not cause the potential deadlock.

We can disable the deadlock detection for now.

### What changes are included in this PR?

- Disable `absl` deadlock detection inside ODBC, so potential deadlock detection from upstream projects doesn't get picked up in the tests.

### Are these changes tested?

- Tested locally on MSVC Windows

### Are there any user-facing changes?

N/A

* GitHub Issue: #48637

Authored-by: Alina (Xi) Li
Signed-off-by: David Li
---
 cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_driver.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_driver.cc b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_driver.cc
index 8b24762bfc3..c6a813cfd48 100644
--- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_driver.cc
+++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_driver.cc
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
+#include <absl/synchronization/mutex.h>
+
 #include "arrow/flight/sql/odbc/odbc_impl/flight_sql_driver.h"

 #include "arrow/compute/api.h"
@@ -37,6 +39,8 @@ FlightSqlDriver::FlightSqlDriver()
   RegisterComputeKernels();
   // Register log after compute kernels check to avoid segfaults
   RegisterLog();
+  // GH-48637: Disable Absl Deadlock detection from upstream projects
+  absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore);
 }

 FlightSqlDriver::~FlightSqlDriver() {

From daf5e960a699c5796bf59403f565ef3328a80172 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Mon, 26 Jan 2026 11:08:45 +0100
Subject: [PATCH 21/36] GH-48965: [Python][C++] Compare unique_ptr for
 CFlightResult or CFlightInfo to nullptr instead of NULL (#48968)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

Cython-built code is currently failing to compile on free-threaded wheels due to:

```
/arrow/python/build/temp.linux-x86_64-cpython-313t/_flight.cpp: In function ‘PyObject* __pyx_gb_7pyarrow_7_flight_12FlightClient_9do_action_2generator2(__pyx_CoroutineObject*, PyThreadState*, PyObject*)’:
/arrow/python/build/temp.linux-x86_64-cpython-313t/_flight.cpp:43068:110: error: call of overloaded ‘unique_ptr(NULL)’ is ambiguous
43068 |   __pyx_t_3 = (__pyx_cur_scope->__pyx_v_result->result == ((std::unique_ptr< arrow::flight::Result> )NULL));
      |
```

### What changes are included in this PR?

Update the comparisons of `unique_ptr[CFlightResult]` and `unique_ptr[CFlightInfo]` to use `nullptr` instead of `NULL`.

### Are these changes tested?

Yes, via archery.

### Are there any user-facing changes?

No

* GitHub Issue: #48965

Authored-by: Raúl Cumplido
Signed-off-by: Raúl Cumplido
---
 python/pyarrow/_flight.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx
index b7e7af260c2..f447129cf40 100644
--- a/python/pyarrow/_flight.pyx
+++ b/python/pyarrow/_flight.pyx
@@ -1666,7 +1666,7 @@ cdef class FlightClient(_Weakrefable):
             result = Result.__new__(Result)
             with nogil:
                 check_flight_status(results.get().Next().Value(&result.result))
-            if result.result == NULL:
+            if result.result == nullptr:
                 break
             yield result
         return _do_action_response()
@@ -1695,7 +1695,7 @@ cdef class FlightClient(_Weakrefable):
             result = FlightInfo.__new__(FlightInfo)
             with nogil:
                 check_flight_status(listing.get().Next().Value(&result.info))
-            if result.info == NULL:
+            if result.info == nullptr:
                 break
             yield result

From 1c6aae62c71574b95650cc4215fcc566198deaca Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Mon, 26 Jan 2026 14:29:00 +0100
Subject: [PATCH 22/36] GH-48924: [C++][CI] Fix pre-buffering issues in IPC
 file reader (#48925)

### What changes are included in this PR?

Bug fixes and robustness improvements in the IPC file reader:

* Fix a bug reading variadic buffers with pre-buffering enabled
* Fix a bug reading dictionaries with pre-buffering enabled
* Validate IPC buffer offsets and lengths

Testing improvements:

* Exercise pre-buffering in IPC tests
* Actually exercise variadic buffers in IPC tests, by ensuring non-inline binary views are generated
* Run fuzz targets on golden IPC integration files in the ASAN/UBSAN CI job
* Exercise pre-buffering in the IPC file fuzz target

Miscellaneous:

* Add convenience functions for integer overflow checking (see the sketch after this section)

### Are these changes tested?

Yes, by existing and improved tests.

### Are there any user-facing changes?

Bug fixes.
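As a sketch of how the new overflow-checking convenience functions are meant to be used (illustrative only; `BufferInBounds` is a made-up helper, while `arrow::internal::AddWithOverflow` is the function added by this PR in `cpp/src/arrow/util/int_util_overflow.h`):

```cpp
#include <cstdint>
#include <optional>

#include "arrow/util/int_util_overflow.h"

// Check that a buffer described by (offset, length) lies entirely within an
// IPC body of body_length bytes, without risking signed-overflow UB.
bool BufferInBounds(int64_t offset, int64_t length, int64_t body_length) {
  // AddWithOverflow({...}) folds the sum left to right and returns
  // std::nullopt as soon as an intermediate addition would overflow.
  std::optional<int64_t> end = arrow::internal::AddWithOverflow({offset, length});
  return end.has_value() && *end <= body_length;
}
```

This is the same pattern the reader now applies in `ArrayLoader::ReadBuffer` and `FileBlockFromFlatbuffer` in the diff below.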
**This PR contains a "Critical Fix".** Fixes a potential crash reading variadic buffers with pre-buffering enabled. * GitHub Issue: #48924 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- ci/scripts/cpp_test.sh | 9 + cpp/src/arrow/ipc/read_write_test.cc | 75 +++++---- cpp/src/arrow/ipc/reader.cc | 222 ++++++++++++++++--------- cpp/src/arrow/ipc/test_common.cc | 47 +++--- cpp/src/arrow/type.h | 10 ++ cpp/src/arrow/util/int_util_overflow.h | 33 ++++ cpp/src/arrow/util/int_util_test.cc | 18 ++ 7 files changed, 286 insertions(+), 128 deletions(-) diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 0ad59bc308f..5d6d5e099ab 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -182,6 +182,15 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then # Some fuzz regression files may trigger huge memory allocations, # let the allocator return null instead of aborting. export ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1" + export ARROW_FUZZING_VERBOSITY=1 + # Run golden IPC integration files: these should ideally load without errors, + # though some very old ones carry invalid data (such as decimal values + # larger than their advertised precision). + # shellcheck disable=SC2046 + "${binary_output_dir}/arrow-ipc-stream-fuzz" $(find "${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.stream") + # shellcheck disable=SC2046 + "${binary_output_dir}/arrow-ipc-file-fuzz" $(find "${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.arrow_file") + # Run known crash files "${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/crash-* "${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/*-testcase-* "${binary_output_dir}/arrow-ipc-file-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-file/*-testcase-* diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 315d8bd07d9..9f7df541bd7 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -1252,40 +1252,55 @@ struct FileGeneratorWriterHelper : public FileWriterHelper { Status ReadBatches(const IpcReadOptions& options, RecordBatchVector* out_batches, ReadStats* out_stats = nullptr, MetadataVector* out_metadata_list = nullptr) override { - std::shared_ptr buf_reader; - if (kCoalesce) { - // Use a non-zero-copy enabled BufferReader so we can test paths properly - buf_reader = std::make_shared(buffer_); - } else { - buf_reader = std::make_shared(buffer_); - } - AsyncGenerator> generator; + // The generator doesn't track stats. 
+ EXPECT_EQ(nullptr, out_stats); - { - auto fut = RecordBatchFileReader::OpenAsync(buf_reader, footer_offset_, options); - // Do NOT assert OK since some tests check whether this fails properly - EXPECT_FINISHES(fut); - ARROW_ASSIGN_OR_RAISE(auto reader, fut.result()); - EXPECT_EQ(num_batches_written_, reader->num_record_batches()); - // Generator will keep reader alive internally - ARROW_ASSIGN_OR_RAISE(generator, reader->GetRecordBatchGenerator(kCoalesce)); - } + auto read_batches = [&](bool pre_buffer) -> Result { + std::shared_ptr buf_reader; + if (kCoalesce) { + // Use a non-zero-copy enabled BufferReader so we can test paths properly + buf_reader = std::make_shared(buffer_); + } else { + buf_reader = std::make_shared(buffer_); + } + AsyncGenerator> generator; + + { + auto fut = RecordBatchFileReader::OpenAsync(buf_reader, footer_offset_, options); + ARROW_ASSIGN_OR_RAISE(auto reader, fut.result()); + EXPECT_EQ(num_batches_written_, reader->num_record_batches()); + if (pre_buffer) { + RETURN_NOT_OK(reader->PreBufferMetadata(/*indices=*/{})); + } + // Generator will keep reader alive internally + ARROW_ASSIGN_OR_RAISE(generator, reader->GetRecordBatchGenerator(kCoalesce)); + } - // Generator is async-reentrant - std::vector>> futures; + // Generator is async-reentrant + std::vector>> futures; + for (int i = 0; i < num_batches_written_; ++i) { + futures.push_back(generator()); + } + auto fut = generator(); + ARROW_ASSIGN_OR_RAISE(auto final_batch, fut.result()); + EXPECT_EQ(nullptr, final_batch); + + RecordBatchVector batches; + for (auto& future : futures) { + ARROW_ASSIGN_OR_RAISE(auto batch, future.result()); + EXPECT_NE(nullptr, batch); + batches.push_back(batch); + } + return batches; + }; + + ARROW_ASSIGN_OR_RAISE(*out_batches, read_batches(/*pre_buffer=*/false)); + // Also read with pre-buffered metadata, and check the results are equal + ARROW_ASSIGN_OR_RAISE(auto batches_pre_buffered, read_batches(/*pre_buffer=*/true)); for (int i = 0; i < num_batches_written_; ++i) { - futures.push_back(generator()); - } - auto fut = generator(); - EXPECT_FINISHES_OK_AND_EQ(nullptr, fut); - for (auto& future : futures) { - EXPECT_FINISHES_OK_AND_ASSIGN(auto batch, future); - out_batches->push_back(batch); + AssertBatchesEqual(*batches_pre_buffered[i], *(*out_batches)[i], + /*check_metadata=*/true); } - - // The generator doesn't track stats. 
- EXPECT_EQ(nullptr, out_stats); - return Status::OK(); } }; diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 8e125fc5ede..f1571f76c24 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -54,6 +54,7 @@ #include "arrow/util/compression.h" #include "arrow/util/endian.h" #include "arrow/util/fuzz_internal.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging_internal.h" #include "arrow/util/parallel.h" @@ -72,6 +73,7 @@ namespace arrow { namespace flatbuf = org::apache::arrow::flatbuf; +using internal::AddWithOverflow; using internal::checked_cast; using internal::checked_pointer_cast; @@ -177,14 +179,16 @@ class ArrayLoader { explicit ArrayLoader(const flatbuf::RecordBatch* metadata, MetadataVersion metadata_version, const IpcReadOptions& options, - int64_t file_offset) + int64_t file_offset, int64_t file_length) : metadata_(metadata), metadata_version_(metadata_version), file_(nullptr), file_offset_(file_offset), + file_length_(file_length), max_recursion_depth_(options.max_recursion_depth) {} Status ReadBuffer(int64_t offset, int64_t length, std::shared_ptr* out) { + // This construct permits overriding GetBuffer at compile time if (skip_io_) { return Status::OK(); } @@ -194,7 +198,10 @@ class ArrayLoader { if (length < 0) { return Status::Invalid("Negative length for reading buffer ", buffer_index_); } - // This construct permits overriding GetBuffer at compile time + auto read_end = AddWithOverflow({offset, length}); + if (!read_end.has_value() || (file_length_.has_value() && read_end > file_length_)) { + return Status::Invalid("Buffer ", buffer_index_, " exceeds IPC file area"); + } if (!bit_util::IsMultipleOf8(offset)) { return Status::Invalid("Buffer ", buffer_index_, " did not start on 8-byte aligned offset: ", offset); @@ -202,6 +209,9 @@ class ArrayLoader { if (file_) { return file_->ReadAt(offset, length).Value(out); } else { + if (!AddWithOverflow({read_end.value(), file_offset_}).has_value()) { + return Status::Invalid("Buffer ", buffer_index_, " exceeds IPC file area"); + } read_request_.RequestRange(offset + file_offset_, length, out); return Status::OK(); } @@ -292,6 +302,16 @@ class ArrayLoader { // we can skip that buffer without reading from shared memory RETURN_NOT_OK(GetFieldMetadata(field_index_++, out_)); + if (::arrow::internal::has_variadic_buffers(type_id)) { + ARROW_ASSIGN_OR_RAISE(auto data_buffer_count, + GetVariadicCount(variadic_count_index_++)); + const int64_t start = static_cast(out_->buffers.size()); + // NOTE: this must be done before any other call to `GetBuffer` because + // BatchDataReadRequest will keep pointers to `std::shared_ptr` + // objects. + out_->buffers.resize(start + data_buffer_count); + } + if (internal::HasValidityBitmap(type_id, metadata_version_)) { // Extract null_bitmap which is common to all arrays except for unions // and nulls. 
@@ -300,6 +320,7 @@ class ArrayLoader { } buffer_index_++; } + return Status::OK(); } @@ -398,14 +419,9 @@ class ArrayLoader { Status Visit(const BinaryViewType& type) { out_->buffers.resize(2); - RETURN_NOT_OK(LoadCommon(type.id())); - RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); - - ARROW_ASSIGN_OR_RAISE(auto data_buffer_count, - GetVariadicCount(variadic_count_index_++)); - out_->buffers.resize(data_buffer_count + 2); - for (int64_t i = 0; i < data_buffer_count; ++i) { - RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i + 2])); + RETURN_NOT_OK(LoadCommon(type.id())); // also initializes variadic buffers + for (int64_t i = 1; i < static_cast(out_->buffers.size()); ++i) { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i])); } return Status::OK(); } @@ -503,6 +519,7 @@ class ArrayLoader { const MetadataVersion metadata_version_; io::RandomAccessFile* file_; int64_t file_offset_; + std::optional file_length_; int max_recursion_depth_; int buffer_index_ = 0; int field_index_ = 0; @@ -1173,8 +1190,19 @@ namespace { // Common functions used in both the random-access file reader and the // asynchronous generator -inline FileBlock FileBlockFromFlatbuffer(const flatbuf::Block* block) { - return FileBlock{block->offset(), block->metaDataLength(), block->bodyLength()}; +Result FileBlockFromFlatbuffer(const flatbuf::Block* fb_block, + int64_t max_offset) { + auto block = + FileBlock{fb_block->offset(), fb_block->metaDataLength(), fb_block->bodyLength()}; + if (block.metadata_length < 0 || block.body_length < 0 || block.offset < 0) { + return Status::IOError("Invalid Block in IPC file footer"); + } + auto block_end = + AddWithOverflow({block.offset, block.metadata_length, block.body_length}); + if (!block_end.has_value() || block_end > max_offset) { + return Status::IOError("Invalid Block in IPC file footer"); + } + return block; } Status CheckAligned(const FileBlock& block) { @@ -1362,8 +1390,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { read_options, file, schema, &inclusion_mask); }; } - ARROW_ASSIGN_OR_RAISE(auto message, - ReadMessageFromBlock(GetRecordBatchBlock(i), fields_loader)); + ARROW_ASSIGN_OR_RAISE(auto block, GetRecordBatchBlock(i)); + ARROW_ASSIGN_OR_RAISE(auto message, ReadMessageFromBlock(block, fields_loader)); CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); @@ -1379,8 +1407,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { Result CountRows() override { int64_t total = 0; for (int i = 0; i < num_record_batches(); i++) { - ARROW_ASSIGN_OR_RAISE(auto outer_message, - ReadMessageFromBlock(GetRecordBatchBlock(i))); + ARROW_ASSIGN_OR_RAISE(auto block, GetRecordBatchBlock(i)); + ARROW_ASSIGN_OR_RAISE(auto outer_message, ReadMessageFromBlock(block)); auto metadata = outer_message->metadata(); const flatbuf::Message* message = nullptr; RETURN_NOT_OK( @@ -1494,13 +1522,13 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { Status DoPreBufferMetadata(const std::vector& indices) { RETURN_NOT_OK(CacheMetadata(indices)); - EnsureDictionaryReadStarted(); + RETURN_NOT_OK(EnsureDictionaryReadStarted()); Future<> all_metadata_ready = WaitForMetadatas(indices); for (int index : indices) { Future> metadata_loaded = all_metadata_ready.Then([this, index]() -> Result> { stats_.num_messages.fetch_add(1, std::memory_order_relaxed); - FileBlock block = GetRecordBatchBlock(index); + ARROW_ASSIGN_OR_RAISE(FileBlock block, GetRecordBatchBlock(index)); 
ARROW_ASSIGN_OR_RAISE( std::shared_ptr metadata, metadata_cache_->Read({block.offset, block.metadata_length})); @@ -1549,12 +1577,12 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { } }; - FileBlock GetRecordBatchBlock(int i) const { - return FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i)); + Result GetRecordBatchBlock(int i) const { + return FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i), footer_offset_); } - FileBlock GetDictionaryBlock(int i) const { - return FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i)); + Result GetDictionaryBlock(int i) const { + return FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i), footer_offset_); } Result> ReadMessageFromBlock( @@ -1567,16 +1595,26 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { Status ReadDictionaries() { // Read all the dictionaries + std::vector> messages(num_dictionaries()); + for (int i = 0; i < num_dictionaries(); ++i) { + ARROW_ASSIGN_OR_RAISE(FileBlock block, GetDictionaryBlock(i)); + ARROW_ASSIGN_OR_RAISE(messages[i], ReadMessageFromBlock(block)); + } + return ReadDictionaries(messages); + } + + Status ReadDictionaries( + const std::vector>& dictionary_messages) { + DCHECK_EQ(dictionary_messages.size(), static_cast(num_dictionaries())); IpcReadContext context(&dictionary_memo_, options_, swap_endian_); for (int i = 0; i < num_dictionaries(); ++i) { - ARROW_ASSIGN_OR_RAISE(auto message, ReadMessageFromBlock(GetDictionaryBlock(i))); - RETURN_NOT_OK(ReadOneDictionary(message.get(), context)); - stats_.num_dictionary_batches.fetch_add(1, std::memory_order_relaxed); + RETURN_NOT_OK(ReadOneDictionary(i, dictionary_messages[i].get(), context)); } return Status::OK(); } - Status ReadOneDictionary(Message* message, const IpcReadContext& context) { + Status ReadOneDictionary(int dict_index, Message* message, + const IpcReadContext& context) { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); DictionaryKind kind; @@ -1586,44 +1624,48 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { } else if (kind == DictionaryKind::Delta) { stats_.num_dictionary_deltas.fetch_add(1, std::memory_order_relaxed); } + stats_.num_dictionary_batches.fetch_add(1, std::memory_order_relaxed); return Status::OK(); } - void AddDictionaryRanges(std::vector* ranges) const { + Status AddDictionaryRanges(std::vector* ranges) const { // Adds all dictionaries to the range cache for (int i = 0; i < num_dictionaries(); ++i) { - FileBlock block = GetDictionaryBlock(i); + ARROW_ASSIGN_OR_RAISE(FileBlock block, GetDictionaryBlock(i)); ranges->push_back({block.offset, block.metadata_length + block.body_length}); } + return Status::OK(); } - void AddMetadataRanges(const std::vector& indices, - std::vector* ranges) { + Status AddMetadataRanges(const std::vector& indices, + std::vector* ranges) { for (int index : indices) { - FileBlock block = GetRecordBatchBlock(static_cast(index)); + ARROW_ASSIGN_OR_RAISE(FileBlock block, GetRecordBatchBlock(index)); ranges->push_back({block.offset, block.metadata_length}); } + return Status::OK(); } Status CacheMetadata(const std::vector& indices) { std::vector ranges; if (!read_dictionaries_) { - AddDictionaryRanges(&ranges); + RETURN_NOT_OK(AddDictionaryRanges(&ranges)); } - AddMetadataRanges(indices, &ranges); + RETURN_NOT_OK(AddMetadataRanges(indices, &ranges)); return metadata_cache_->Cache(std::move(ranges)); } - void EnsureDictionaryReadStarted() { + Status EnsureDictionaryReadStarted() { if 
(!dictionary_load_finished_.is_valid()) { read_dictionaries_ = true; std::vector ranges; - AddDictionaryRanges(&ranges); + RETURN_NOT_OK(AddDictionaryRanges(&ranges)); dictionary_load_finished_ = metadata_cache_->WaitFor(std::move(ranges)).Then([this] { return ReadDictionaries(); }); } + return Status::OK(); } Status WaitForDictionaryReadFinished() { @@ -1641,7 +1683,7 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { Future<> WaitForMetadatas(const std::vector& indices) { std::vector ranges; - AddMetadataRanges(indices, &ranges); + RETURN_NOT_OK(AddMetadataRanges(indices, &ranges)); return metadata_cache_->WaitFor(std::move(ranges)); } @@ -1685,12 +1727,13 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { const flatbuf::RecordBatch* batch, IpcReadContext context, io::RandomAccessFile* file, std::shared_ptr owned_file, - int64_t block_data_offset) + int64_t block_data_offset, int64_t block_data_length) : schema(std::move(sch)), context(std::move(context)), file(file), owned_file(std::move(owned_file)), - loader(batch, context.metadata_version, context.options, block_data_offset), + loader(batch, context.metadata_version, context.options, block_data_offset, + block_data_length), columns(schema->num_fields()), cache(file, file->io_context(), io::CacheOptions::LazyDefaults()), length(batch->length()) {} @@ -1789,14 +1832,15 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { return dictionary_load_finished_.Then([message_fut] { return message_fut; }) .Then([this, index](const std::shared_ptr& message_obj) -> Future> { - FileBlock block = GetRecordBatchBlock(index); + ARROW_ASSIGN_OR_RAISE(auto block, GetRecordBatchBlock(index)); ARROW_ASSIGN_OR_RAISE(auto message, GetFlatbufMessage(message_obj)); ARROW_ASSIGN_OR_RAISE(auto batch, GetBatchFromMessage(message)); ARROW_ASSIGN_OR_RAISE(auto context, GetIpcReadContext(message, batch)); auto read_context = std::make_shared( schema_, batch, std::move(context), file_, owned_file_, - block.offset + static_cast(block.metadata_length)); + block.offset + static_cast(block.metadata_length), + block.body_length); RETURN_NOT_OK(read_context->CalculateLoadRequest()); return read_context->ReadAsync().Then( [read_context] { return read_context->CreateRecordBatch(); }); @@ -1915,25 +1959,31 @@ Future WholeIpcFileRecordBatchGenerator::operator()() { auto state = state_; if (!read_dictionaries_.is_valid()) { - std::vector>> messages(state->num_dictionaries()); - for (int i = 0; i < state->num_dictionaries(); i++) { - auto block = FileBlockFromFlatbuffer(state->footer_->dictionaries()->Get(i)); - messages[i] = ReadBlock(block); - } - auto read_messages = All(std::move(messages)); - if (executor_) read_messages = executor_->Transfer(read_messages); - read_dictionaries_ = read_messages.Then( - [=](const std::vector>>& maybe_messages) - -> Status { - ARROW_ASSIGN_OR_RAISE(auto messages, - arrow::internal::UnwrapOrRaise(maybe_messages)); - return ReadDictionaries(state.get(), std::move(messages)); - }); + if (state->dictionary_load_finished_.is_valid()) { + // PreBufferMetadata has started reading dictionaries in the background + read_dictionaries_ = state->dictionary_load_finished_; + } else { + // Start reading dictionaries + std::vector>> messages(state->num_dictionaries()); + for (int i = 0; i < state->num_dictionaries(); i++) { + ARROW_ASSIGN_OR_RAISE(auto block, state->GetDictionaryBlock(i)); + messages[i] = ReadBlock(block); + } + auto read_messages = All(std::move(messages)); + if (executor_) read_messages = 
executor_->Transfer(read_messages); + read_dictionaries_ = read_messages.Then( + [=](const std::vector>>& maybe_messages) + -> Status { + ARROW_ASSIGN_OR_RAISE(auto messages, + arrow::internal::UnwrapOrRaise(maybe_messages)); + return state->ReadDictionaries(messages); + }); + } } if (index_ >= state_->num_record_batches()) { return Future::MakeFinished(IterationTraits::End()); } - auto block = FileBlockFromFlatbuffer(state->footer_->recordBatches()->Get(index_++)); + ARROW_ASSIGN_OR_RAISE(auto block, state->GetRecordBatchBlock(index_++)); auto read_message = ReadBlock(block); auto read_messages = read_dictionaries_.Then([read_message]() { return read_message; }); // Force transfer. This may be wasteful in some cases, but ensures we get off the @@ -1969,16 +2019,6 @@ Future> WholeIpcFileRecordBatchGenerator::ReadBlock( } } -Status WholeIpcFileRecordBatchGenerator::ReadDictionaries( - RecordBatchFileReaderImpl* state, - std::vector> dictionary_messages) { - IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_); - for (const auto& message : dictionary_messages) { - RETURN_NOT_OK(state->ReadOneDictionary(message.get(), context)); - } - return Status::OK(); -} - Result> WholeIpcFileRecordBatchGenerator::ReadRecordBatch( RecordBatchFileReaderImpl* state, Message* message) { CHECK_HAS_BODY(*message); @@ -2630,6 +2670,14 @@ Status ValidateFuzzBatch(const RecordBatch& batch) { return st; } +Status ValidateFuzzBatch(const RecordBatchWithMetadata& batch) { + if (batch.batch) { + RETURN_NOT_OK(ValidateFuzzBatch(*batch.batch)); + } + // XXX do something with custom metadata? + return Status::OK(); +} + IpcReadOptions FuzzingOptions() { IpcReadOptions options; options.memory_pool = ::arrow::internal::fuzzing_memory_pool(); @@ -2648,12 +2696,12 @@ Status FuzzIpcStream(const uint8_t* data, int64_t size) { Status st; while (true) { - std::shared_ptr batch; - RETURN_NOT_OK(batch_reader->ReadNext(&batch)); - if (batch == nullptr) { + ARROW_ASSIGN_OR_RAISE(auto batch, batch_reader->ReadNext()); + if (!batch.batch && !batch.custom_metadata) { + // EOS break; } - st &= ValidateFuzzBatch(*batch); + st &= ValidateFuzzBatch(batch); } return st; @@ -2661,20 +2709,36 @@ Status FuzzIpcStream(const uint8_t* data, int64_t size) { Status FuzzIpcFile(const uint8_t* data, int64_t size) { auto buffer = std::make_shared(data, size); - io::BufferReader buffer_reader(buffer); - std::shared_ptr batch_reader; - ARROW_ASSIGN_OR_RAISE(batch_reader, - RecordBatchFileReader::Open(&buffer_reader, FuzzingOptions())); - Status st; + Status final_status; - const int n_batches = batch_reader->num_record_batches(); - for (int i = 0; i < n_batches; ++i) { - ARROW_ASSIGN_OR_RAISE(auto batch, batch_reader->ReadRecordBatch(i)); - st &= ValidateFuzzBatch(*batch); + auto do_read = [&](bool pre_buffer) { + io::BufferReader buffer_reader(buffer); + ARROW_ASSIGN_OR_RAISE(auto batch_reader, + RecordBatchFileReader::Open(&buffer_reader, FuzzingOptions())); + if (pre_buffer) { + // Pre-buffer all record batches + RETURN_NOT_OK(batch_reader->PreBufferMetadata(/*indices=*/{})); + } + + const int n_batches = batch_reader->num_record_batches(); + for (int i = 0; i < n_batches; ++i) { + RecordBatchWithMetadata batch; + auto st = batch_reader->ReadRecordBatchWithCustomMetadata(i).Value(&batch); + final_status &= st; + if (!st.ok()) { + continue; + } + final_status &= ValidateFuzzBatch(batch); + } + return Status::OK(); + }; + + for (const bool pre_buffer : {false, true}) { + final_status &= do_read(pre_buffer); } - return 
st; + return final_status; } Status FuzzIpcTensorStream(const uint8_t* data, int64_t size) { diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 02e6b816c0b..ceca6d9e434 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -16,6 +16,7 @@ // under the License. #include +#include #include #include #include @@ -368,19 +369,27 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* poo return builder.Finish(out); } -template -static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls, - MemoryPool* pool, - std::shared_ptr* out) { - BuilderType builder(pool); +template BuilderType> +static Result> MakeBinaryArrayWithUniqueValues( + BuilderType builder, int64_t length, bool include_nulls) { + if constexpr (std::is_base_of_v) { + // Try to emit several variadic buffers by choosing a small block size. + builder.SetBlockSize(512); + } for (int64_t i = 0; i < length; ++i) { if (include_nulls && (i % 7 == 0)) { RETURN_NOT_OK(builder.AppendNull()); } else { - RETURN_NOT_OK(builder.Append(std::to_string(i))); + // Make sure that some strings are long enough to have non-inline binary views + const auto base = std::to_string(i); + std::string value; + for (int64_t j = 0; j < 3 * (i % 10); ++j) { + value += base; + } + RETURN_NOT_OK(builder.Append(value)); } } - return builder.Finish(out); + return builder.Finish(); } Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls, @@ -390,22 +399,22 @@ Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_n ArrayVector arrays; FieldVector fields; - auto AppendColumn = [&](auto& MakeArray) { - arrays.emplace_back(); - RETURN_NOT_OK(MakeArray(length, with_nulls, default_memory_pool(), &arrays.back())); - - const auto& type = arrays.back()->type(); - fields.push_back(field(type->ToString(), type)); + auto AppendColumn = [&](auto builder) { + ARROW_ASSIGN_OR_RAISE(auto array, MakeBinaryArrayWithUniqueValues( + std::move(builder), length, with_nulls)); + arrays.push_back(array); + fields.push_back(field(array->type()->ToString(), array->type())); return Status::OK(); }; - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + auto pool = default_memory_pool(); + RETURN_NOT_OK(AppendColumn(StringBuilder(pool))); + RETURN_NOT_OK(AppendColumn(BinaryBuilder(pool))); + RETURN_NOT_OK(AppendColumn(LargeStringBuilder(pool))); + RETURN_NOT_OK(AppendColumn(LargeBinaryBuilder(pool))); if (with_view_types) { - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(StringViewBuilder(pool))); + RETURN_NOT_OK(AppendColumn(BinaryViewBuilder(pool))); } *out = RecordBatch::Make(schema(std::move(fields)), length, std::move(arrays)); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index f68d2dcb619..e3582056ead 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -2575,6 +2575,16 @@ constexpr bool may_have_validity_bitmap(Type::type id) { } } +constexpr bool has_variadic_buffers(Type::type id) { + switch (id) { + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + return true; + default: + return false; + } +} + ARROW_DEPRECATED("Deprecated in 17.0.0. 
Use may_have_validity_bitmap() instead.")
 constexpr bool HasValidityBitmap(Type::type id) { return may_have_validity_bitmap(id); }

diff --git a/cpp/src/arrow/util/int_util_overflow.h b/cpp/src/arrow/util/int_util_overflow.h
index 93066fecafa..69714a935a4 100644
--- a/cpp/src/arrow/util/int_util_overflow.h
+++ b/cpp/src/arrow/util/int_util_overflow.h
@@ -18,7 +18,9 @@
 #pragma once

 #include <cstdint>
+#include <initializer_list>
 #include <limits>
+#include <optional>
 #include <type_traits>

 #include "arrow/status.h"
@@ -162,6 +164,37 @@ NON_GENERIC_OPS_WITH_OVERFLOW(DivideWithOverflow)
 #undef NON_GENERIC_OPS_WITH_OVERFLOW
 #undef NON_GENERIC_OP_WITH_OVERFLOW

+// Convenience functions over an arbitrary number of arguments
+template <typename Int>
+std::optional<Int> AddWithOverflow(std::initializer_list<Int> vs) {
+  if (vs.size() == 0) {
+    return {};
+  }
+  auto it = vs.begin();
+  Int v = *it++;
+  while (it != vs.end()) {
+    if (ARROW_PREDICT_FALSE(AddWithOverflowGeneric(v, *it++, &v))) {
+      return {};
+    }
+  }
+  return v;
+}
+
+template <typename Int>
+std::optional<Int> MultiplyWithOverflow(std::initializer_list<Int> vs) {
+  if (vs.size() == 0) {
+    return {};
+  }
+  auto it = vs.begin();
+  Int v = *it++;
+  while (it != vs.end()) {
+    if (ARROW_PREDICT_FALSE(MultiplyWithOverflowGeneric(v, *it++, &v))) {
+      return {};
+    }
+  }
+  return v;
+}
+
 // Define function NegateWithOverflow with the signature `bool(T u, T* out)`
 // where T is a signed integer type. On overflow, these functions return true.
 // Otherwise, false is returned and `out` is updated with the result of the
diff --git a/cpp/src/arrow/util/int_util_test.cc b/cpp/src/arrow/util/int_util_test.cc
index 7217c1097e4..cffa4e9d15e 100644
--- a/cpp/src/arrow/util/int_util_test.cc
+++ b/cpp/src/arrow/util/int_util_test.cc
@@ -649,5 +649,23 @@ TYPED_TEST(TestAddWithOverflow, Basics) {
   this->CheckOk(almost_min, almost_max + T{2}, T{1});
 }

+TEST(AddWithOverflow, Variadic) {
+  ASSERT_EQ(AddWithOverflow<int8_t>({}), std::nullopt);
+  ASSERT_EQ(AddWithOverflow<int8_t>({1, 2, 3}), 6);
+  ASSERT_EQ(AddWithOverflow<int8_t>({1, 2, 125}), std::nullopt);
+  ASSERT_EQ(AddWithOverflow<int8_t>({125, 2, 1}), std::nullopt);
+  ASSERT_EQ(AddWithOverflow<int16_t>({1, 2, 125}), 128);
+  ASSERT_EQ(AddWithOverflow<int16_t>({125, 2, 1}), 128);
+}
+
+TEST(MultiplyWithOverflow, Variadic) {
+  ASSERT_EQ(MultiplyWithOverflow<int8_t>({}), std::nullopt);
+  ASSERT_EQ(MultiplyWithOverflow<int8_t>({1, 2, 3, 4}), 24);
+  ASSERT_EQ(MultiplyWithOverflow<int8_t>({2, 2, 32}), std::nullopt);
+  ASSERT_EQ(MultiplyWithOverflow<int8_t>({32, 4, 1}), std::nullopt);
+  ASSERT_EQ(MultiplyWithOverflow<int16_t>({2, 2, 32}), 128);
+  ASSERT_EQ(MultiplyWithOverflow<int16_t>({32, 4, 1}), 128);
+}
+
 }  // namespace internal
 }  // namespace arrow

From b63025d6e3473e3fb64186d82b4b24b283e6169a Mon Sep 17 00:00:00 2001
From: Jianfeng Mao <4297243+jmao-denver@users.noreply.github.com>
Date: Mon, 26 Jan 2026 18:23:21 -0700
Subject: [PATCH 23/36] GH-48966: [C++] Fix cookie duplication in the Flight
 SQL ODBC driver and the Flight Client (#48967)

### Rationale for this change

The bug breaks a Flight SQL server that refreshes the auth token when cookie authentication is enabled.

### What changes are included in this PR?

1. In the ODBC layer, removed the code that adds a 2nd ClientCookieMiddlewareFactory in the client options (the 1st one is registered in `BuildFlightClientOptions`). This fixes the issue of the duplicate cookie header fields.
2. In the flight client layer, uses the case-insensitive equality comparator instead of the case-insensitive less-than comparator for the cookie cache, which is an unordered map. This fixes the issue of duplicate cookie keys.

### Are these changes tested?
Manually on Windows, and via CI

### Are there any user-facing changes?

No

* GitHub Issue: #48966

Authored-by: jianfengmao
Signed-off-by: David Li
---
 cpp/src/arrow/flight/cookie_internal.cc                  | 5 +++++
 cpp/src/arrow/flight/cookie_internal.h                   | 8 +++++++-
 .../flight/sql/odbc/odbc_impl/flight_sql_connection.cc   | 3 ---
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/flight/cookie_internal.cc b/cpp/src/arrow/flight/cookie_internal.cc
index 99fa8b238dd..df09a77afb7 100644
--- a/cpp/src/arrow/flight/cookie_internal.cc
+++ b/cpp/src/arrow/flight/cookie_internal.cc
@@ -64,6 +64,11 @@ size_t CaseInsensitiveHash::operator()(const std::string& key) const {
   return std::hash<std::string>{}(upper_string);
 }

+bool CaseInsensitiveEqual::operator()(const std::string& lhs,
+                                      const std::string& rhs) const {
+  return strcasecmp(lhs.c_str(), rhs.c_str()) == 0;
+}
+
 Cookie Cookie::Parse(std::string_view cookie_header_value) {
   // Parse the cookie string. If the cookie has an expiration, record it.
   // If the cookie has a max-age, calculate the current time + max_age and set that as
diff --git a/cpp/src/arrow/flight/cookie_internal.h b/cpp/src/arrow/flight/cookie_internal.h
index 62c0390c585..98b936edb33 100644
--- a/cpp/src/arrow/flight/cookie_internal.h
+++ b/cpp/src/arrow/flight/cookie_internal.h
@@ -41,6 +41,12 @@ class ARROW_FLIGHT_EXPORT CaseInsensitiveComparator {
   bool operator()(const std::string& t1, const std::string& t2) const;
 };

+/// \brief Case insensitive equality comparator for use by the unordered cookie map.
+class ARROW_FLIGHT_EXPORT CaseInsensitiveEqual {
+ public:
+  bool operator()(const std::string& lhs, const std::string& rhs) const;
+};
+
 /// \brief Case insensitive hasher for use by cookie caching map. Cookies are not
 /// case-sensitive.
 class ARROW_FLIGHT_EXPORT CaseInsensitiveHash {
@@ -117,7 +123,7 @@ class ARROW_FLIGHT_EXPORT CookieCache {
   // Mutex must be used to protect cookie cache.
   std::mutex mutex_;
-  std::unordered_map<std::string, Cookie, CaseInsensitiveHash, CaseInsensitiveComparator>
+  std::unordered_map<std::string, Cookie, CaseInsensitiveHash, CaseInsensitiveEqual>
       cookies;
 };

diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc
index 422c45fc059..8b2b564d8db 100644
--- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc
+++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc
@@ -157,9 +157,6 @@ void FlightSqlConnection::Connect(const ConnPropertyMap& properties,
   client_options_ =
       BuildFlightClientOptions(properties, missing_attr, flight_ssl_configs);

-  const std::shared_ptr<ClientMiddlewareFactory>& cookie_factory = GetCookieFactory();
-  client_options_.middleware.push_back(cookie_factory);
-
   std::unique_ptr<FlightClient> flight_client;
   ThrowIfNotOK(FlightClient::Connect(location, client_options_).Value(&flight_client));
   PopulateMetadataSettings(properties);

From dfd48e681abdaede457c84552d618a6aa544b0b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Thu, 29 Jan 2026 15:10:59 +0100
Subject: [PATCH 24/36] GH-48983: [Packaging][Python] Build wheel from sdist
 using build and add check to validate LICENSE.txt and NOTICE.txt are part of
 the wheel contents (#48988)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

Currently the files are missing from the published wheels.

### What changes are included in this PR?

- Ensure the license and notice files are part of the wheels
- Use the build frontend to build the wheels
- Build the wheel from the sdist

### Are these changes tested?

Yes, via archery. I've validated that all wheels fail the new check if LICENSE.txt or NOTICE.txt is missing:

```
AssertionError: LICENSE.txt is missing from the wheel.
```

### Are there any user-facing changes?

No

* GitHub Issue: #48983

Lead-authored-by: Raúl Cumplido
Co-authored-by: Antoine Pitrou
Co-authored-by: Rok Mihevc
Signed-off-by: Raúl Cumplido
---
 .env                                         | 4 ++--
 ci/scripts/python_wheel_macos_build.sh       | 2 +-
 ci/scripts/python_wheel_validate_contents.py | 4 ++++
 ci/scripts/python_wheel_windows_build.bat    | 2 +-
 ci/scripts/python_wheel_xlinux_build.sh      | 2 +-
 python/requirements-wheel-build.txt          | 1 +
 6 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/.env b/.env
index dad867f8f66..6f830ca229b 100644
--- a/.env
+++ b/.env
@@ -102,8 +102,8 @@ VCPKG="4334d8b4c8916018600212ab4dd4bbdc343065d1" # 2025.09.17 Release
 # ci/docker/python-*-windows-*.dockerfile or the vcpkg config.
 # This is a workaround for our CI problem that "archery docker build" doesn't
 # use pulled built images in dev/tasks/python-wheels/github.windows.yml.
-PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2025-10-13
-PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2025-10-13
+PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2026-01-27
+PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2026-01-27

 # Use conanio/${CONAN_BASE}:{CONAN_VERSION} for "docker compose run --rm conan".
 # See https://github.com/conan-io/conan-docker-tools#readme and
diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh
index bd61154430e..2234fc6f310 100755
--- a/ci/scripts/python_wheel_macos_build.sh
+++ b/ci/scripts/python_wheel_macos_build.sh
@@ -177,7 +177,7 @@ export CMAKE_PREFIX_PATH=${build_dir}/install
 export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION}

 pushd ${source_dir}/python
-python setup.py bdist_wheel
+python -m build --sdist --wheel . --no-isolation
 popd

 echo "=== (${PYTHON_VERSION}) Show dynamic libraries the wheel depend on ==="
diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py
index 84fcaba42e6..75815dadb85 100644
--- a/ci/scripts/python_wheel_validate_contents.py
+++ b/ci/scripts/python_wheel_validate_contents.py
@@ -33,6 +33,10 @@ def validate_wheel(path):
         )
     ]
     assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}"
+    for filename in ('LICENSE.txt', 'NOTICE.txt'):
+        assert any(info.filename.split("/")[-1] == filename
+                   for info in f.filelist), \
+            f"{filename} is missing from the wheel."
     print(f"The wheel: {wheels[0]} seems valid.")


diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat
index b4b7fed99fd..fc256d72785 100644
--- a/ci/scripts/python_wheel_windows_build.bat
+++ b/ci/scripts/python_wheel_windows_build.bat
@@ -133,7 +133,7 @@ set CMAKE_PREFIX_PATH=C:\arrow-dist
 pushd C:\arrow\python

 @REM Build wheel
-%PYTHON_CMD% setup.py bdist_wheel || exit /B 1
+%PYTHON_CMD% -m build --sdist --wheel . --no-isolation || exit /B 1

 @REM Repair the wheel with delvewheel
 @REM
diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh
index a3fbeb3c0b3..ceebbc5ad01 100755
--- a/ci/scripts/python_wheel_xlinux_build.sh
+++ b/ci/scripts/python_wheel_xlinux_build.sh
@@ -167,7 +167,7 @@ export ARROW_HOME=/tmp/arrow-dist
 export CMAKE_PREFIX_PATH=/tmp/arrow-dist

 pushd /arrow/python
-python setup.py bdist_wheel
+python -m build --sdist --wheel . --no-isolation
 
 echo "=== Strip symbols from wheel ==="
 mkdir -p dist/temp-fix-wheel
diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt
index ac6388762b4..769435f4dd8 100644
--- a/python/requirements-wheel-build.txt
+++ b/python/requirements-wheel-build.txt
@@ -1,3 +1,4 @@
+build
 cython>=3.1
 numpy>=2.0.0
 setuptools_scm

From 073caf4dd7094699b2aa8837481bf0c2926e6910 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Thu, 29 Jan 2026 18:31:08 +0100
Subject: [PATCH 25/36] GH-49059: [C++] Fix issues found by OSS-Fuzz in IPC
 reader (#49060)

### Rationale for this change

Fix two issues found by OSS-Fuzz in the IPC reader:

* a controlled abort on invalid IPC metadata: https://oss-fuzz.com/testcase-detail/5301064831401984
* a nullptr dereference on invalid IPC metadata: https://oss-fuzz.com/testcase-detail/5091511766417408

Neither of these issues is a security issue.

### Are these changes tested?

Yes, by new unit tests and new fuzz regression files.

### Are there any user-facing changes?

No.

**This PR contains a "Critical Fix".**

* GitHub Issue: #49059

Authored-by: Antoine Pitrou
Signed-off-by: Antoine Pitrou
---
 cpp/src/arrow/ipc/reader.cc        |  7 ++++---
 cpp/src/arrow/record_batch.cc      | 16 ++++++++++------
 cpp/src/arrow/record_batch_test.cc | 16 +++++++++++++++-
 testing                            |  2 +-
 4 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index f1571f76c24..046eacb6ced 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -245,7 +245,7 @@ class ArrayLoader {
   }

   Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) {
-    auto buffers = metadata_->buffers();
+    auto* buffers = metadata_->buffers();
     CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers");
     if (buffer_index >= static_cast<int>(buffers->size())) {
       return Status::IOError("buffer_index out of range.");
@@ -262,7 +262,9 @@
   Result<int64_t> GetVariadicCount(int i) {
     auto* variadic_counts = metadata_->variadicBufferCounts();
+    auto* buffers = metadata_->buffers();
     CHECK_FLATBUFFERS_NOT_NULL(variadic_counts, "RecordBatch.variadicBufferCounts");
+    CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers");
     if (i >= static_cast<int>(variadic_counts->size())) {
       return Status::IOError("variadic_count_index out of range.");
     }
     // Detect an excessive variadic buffer count to avoid potential memory blowup
     // (GH-48900).
- const auto max_buffer_count = - static_cast(metadata_->buffers()->size()) - buffer_index_; + const auto max_buffer_count = static_cast(buffers->size()) - buffer_index_; if (count > max_buffer_count) { return Status::IOError("variadic buffer count exceeds available number of buffers"); } diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 1162b4c3bb0..12e0f553b74 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -266,10 +266,13 @@ Result> RecordBatch::FromStructArray( namespace { Status ValidateColumnLength(const RecordBatch& batch, int i) { - const auto& array = *batch.column(i); - if (ARROW_PREDICT_FALSE(array.length() != batch.num_rows())) { + // This function is part of the validation code path and should + // be robust against invalid data, but `column()` would call MakeArray() + // that can abort on invalid data. + const auto& array = *batch.column_data(i); + if (ARROW_PREDICT_FALSE(array.length != batch.num_rows())) { return Status::Invalid("Number of rows in column ", i, - " did not match batch: ", array.length(), " vs ", + " did not match batch: ", array.length, " vs ", batch.num_rows()); } return Status::OK(); @@ -455,11 +458,12 @@ namespace { Status ValidateBatch(const RecordBatch& batch, bool full_validation) { for (int i = 0; i < batch.num_columns(); ++i) { RETURN_NOT_OK(ValidateColumnLength(batch, i)); - const auto& array = *batch.column(i); + // See ValidateColumnLength about avoiding a ArrayData -> Array conversion + const auto& array = *batch.column_data(i); const auto& schema_type = batch.schema()->field(i)->type(); - if (!array.type()->Equals(schema_type)) { + if (!array.type->Equals(schema_type)) { return Status::Invalid("Column ", i, - " type not match schema: ", array.type()->ToString(), " vs ", + " type not match schema: ", array.type->ToString(), " vs ", schema_type->ToString()); } const auto st = full_validation ? 
internal::ValidateArrayFull(array) diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 4516b808a84..a037d7261ef 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -318,7 +318,6 @@ TEST_F(TestRecordBatch, Validate) { auto a3 = gen.ArrayOf(int16(), 5); auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); - ASSERT_OK(b1->ValidateFull()); // Length mismatch @@ -328,6 +327,21 @@ TEST_F(TestRecordBatch, Validate) { // Type mismatch auto b3 = RecordBatch::Make(schema, length, {a0, a1, a0}); ASSERT_RAISES(Invalid, b3->ValidateFull()); + + // Invalid column data (nulls in map key array) that would abort on MakeArray + auto map_field = field("f", map(utf8(), int32())); + schema = ::arrow::schema({map_field}); + auto map_key_data = ArrayFromJSON(utf8(), "[null]")->data(); + auto map_item_data = ArrayFromJSON(int32(), "[null]")->data(); + auto map_data = ArrayData::Make(map_field->type(), /*length=*/1, /*buffers=*/{nullptr}, + /*child_data=*/{map_key_data, map_item_data}); + + auto b4 = RecordBatch::Make(schema, /*num_rows=*/map_data->length, {map_data}); + ASSERT_RAISES(Invalid, b4->ValidateFull()); + + // Length mismatch with a column data that would also fail on MakeArray + auto b5 = RecordBatch::Make(schema, /*num_rows=*/1 + map_data->length, {map_data}); + ASSERT_RAISES(Invalid, b5->Validate()); } TEST_F(TestRecordBatch, Slice) { diff --git a/testing b/testing index 7b641152dcb..df428ddaa22 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 7b641152dcb0f9e197ebe24a1986151849250959 +Subproject commit df428ddaa22d94dfb525af4c0951f3dafb463795 From d6d2a890ff15b351d8c7bfbf4740cf8cf8b39568 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Thu, 15 Jan 2026 16:17:05 -0800 Subject: [PATCH 26/36] MINOR: [CI][Release] Fix incorrect path in release_candidate.yml (#48871) ### Rationale for this change I noticed a reference to a `release_candidate.sh` in the `paths` field in `release_candidate.yml` which is a file that doesn't exist. I think this was just a typo made during refactoring. ### What changes are included in this PR? Corrected `paths` list entry. ### Are these changes tested? No. ### Are there any user-facing changes? No. Authored-by: Bryce Mecum Signed-off-by: Sutou Kouhei --- .github/workflows/release_candidate.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release_candidate.yml b/.github/workflows/release_candidate.yml index e4849beeeb9..57620831bc5 100644 --- a/.github/workflows/release_candidate.yml +++ b/.github/workflows/release_candidate.yml @@ -25,12 +25,12 @@ on: tags: - "apache-arrow-*-rc*" paths: - - ".github/workflows/release_candidate.sh" + - ".github/workflows/release_candidate.yml" - "dev/release/utils-create-release-tarball.sh" - "dev/release/utils-generate-checksum.sh" pull_request: paths: - - ".github/workflows/release_candidate.sh" + - ".github/workflows/release_candidate.yml" - "dev/release/utils-create-release-tarball.sh" - "dev/release/utils-generate-checksum.sh" From ab2c0ad6b23d05d5f77fc8a34d5a1c4baaacb0a4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 29 Jan 2026 12:41:41 +0100 Subject: [PATCH 27/36] GH-49044: [CI][Python] Fix test_download_tzdata_on_windows by adding required user-agent on urllib request (#49052) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change See: #49044 ### What changes are included in this PR? 
Urllib now requests with a `"user-agent": "pyarrow"` header.

### Are these changes tested?

It's a CI fix.

### Are there any user-facing changes?

No, just a CI test fix.

* GitHub Issue: #49044

Authored-by: Rok Mihevc
Signed-off-by: Raúl Cumplido
---
 python/pyarrow/util.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py
index 5878d1f9026..a95826e1c00 100644
--- a/python/pyarrow/util.py
+++ b/python/pyarrow/util.py
@@ -231,8 +231,9 @@ def _break_traceback_cycle_from_frame(frame):

 def _download_urllib(url, out_path):
-    from urllib.request import urlopen
-    with urlopen(url) as response:
+    from urllib.request import urlopen, Request
+    req = Request(url, headers={'User-Agent': 'pyarrow'})
+    with urlopen(req) as response:
         with open(out_path, 'wb') as f:
             f.write(response.read())

@@ -264,11 +265,13 @@ def download_tzdata_on_windows():
     # Try to download the files with requests and then fall back to urllib. This
     # works around possible issues in certain older environment (GH-45295)
     try:
-        _download_requests(tzdata_url, tzdata_compressed_path)
-        _download_requests(windows_zones_url, windows_zones_path)
+        import requests  # noqa: F401
+        download_fn = _download_requests
     except ImportError:
-        _download_urllib(tzdata_url, tzdata_compressed_path)
-        _download_urllib(windows_zones_url, windows_zones_path)
+        download_fn = _download_urllib
+
+    download_fn(tzdata_url, tzdata_compressed_path)
+    download_fn(windows_zones_url, windows_zones_path)

     assert os.path.exists(tzdata_compressed_path)
     assert os.path.exists(windows_zones_path)

From f9376e4721b81bad9fe3fe840926a3283f95ee30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?=
Date: Tue, 3 Feb 2026 14:59:44 -0300
Subject: [PATCH 28/36] GH-49003: [C++] Don't consider `out_of_range` an error
 in float parsing (#49095)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

This PR restores the behavior prior to version 23 for floating-point parsing on overflow and subnormal values. `fast_float` didn't assign an error code on overflow in version `3.10.1`: it assigned `±Inf` on overflow and `0.0` on subnormal values. With the update to version `8.1`, it started to assign `std::errc::result_out_of_range` in such cases.

### What changes are included in this PR?

Ignore `std::errc::result_out_of_range` and produce `±Inf` / `0.0` as appropriate instead of failing the conversion.

### Are these changes tested?

Yes. Created tests for overflow with positive and negative signed mantissa, and also created tests for subnormals, all of them for binary{16,32,64}.

### Are there any user-facing changes?

It's a user-facing change. The CSV reader in `libarrow==23` was parsing such values as strings, while before it parsed them as `0` or `±inf`.
With this patch, the CSV reader in PyArrow outputs:

```python
>>> import pyarrow
>>> import pyarrow.csv
>>> import io
>>> table = pyarrow.csv.read_csv(io.BytesIO(f"data\n10E-617\n10E617\n-10E617".encode()))
>>> print(table)
pyarrow.Table
data: double
----
data: [[0,inf,-inf]]
```

Closes #49003

* GitHub Issue: #49003

Authored-by: Alvaro-Kothe
Signed-off-by: Antoine Pitrou
---
 cpp/src/arrow/util/value_parsing.cc      | 15 ++++++++++++---
 cpp/src/arrow/util/value_parsing_test.cc | 12 ++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc
index 1a8e8066d70..0cc71f276df 100644
--- a/cpp/src/arrow/util/value_parsing.cc
+++ b/cpp/src/arrow/util/value_parsing.cc
@@ -35,7 +35,10 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, float* out)
       ::arrow_vendored::fast_float::chars_format::general, decimal_point};
   const auto res =
       ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, *out, options);
-  return res.ec == std::errc() && res.ptr == s + length;
+  const bool is_valid_number =
+      res.ec == std::errc() || res.ec == std::errc::result_out_of_range;
+  const bool consumed_entire_string = res.ptr == s + length;
+  return is_valid_number && consumed_entire_string;
 }

 bool StringToFloat(const char* s, size_t length, char decimal_point, double* out) {
@@ -43,7 +46,10 @@
       ::arrow_vendored::fast_float::chars_format::general, decimal_point};
   const auto res =
       ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, *out, options);
-  return res.ec == std::errc() && res.ptr == s + length;
+  const bool is_valid_number =
+      res.ec == std::errc() || res.ec == std::errc::result_out_of_range;
+  const bool consumed_entire_string = res.ptr == s + length;
+  return is_valid_number && consumed_entire_string;
 }

 // Half float
@@ -53,7 +59,10 @@
   float temp_out;
   const auto res =
       ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, temp_out, options);
-  const bool ok = res.ec == std::errc() && res.ptr == s + length;
+  const bool is_valid_number =
+      res.ec == std::errc() || res.ec == std::errc::result_out_of_range;
+  const bool consumed_entire_string = res.ptr == s + length;
+  const bool ok = is_valid_number && consumed_entire_string;
   if (ok) {
     *out = Float16::FromFloat(temp_out);
   }
diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc
index a67f1d97f17..3525a010b63 100644
--- a/cpp/src/arrow/util/value_parsing_test.cc
+++ b/cpp/src/arrow/util/value_parsing_test.cc
@@ -141,6 +141,10 @@ TEST(StringConversion, ToFloat) {
   AssertConversion("0", 0.0f);
   AssertConversion("-0.0", -0.0f);
   AssertConversion("-1e20", -1e20f);
+  AssertConversion("4e38", std::numeric_limits<float>::infinity());
+  AssertConversion("-4e38", -std::numeric_limits<float>::infinity());
+  AssertConversion("1e-46", 0.0f);
+  AssertConversion("-1e-46", -0.0f);
   AssertConversion("+Infinity", std::numeric_limits<float>::infinity());
   AssertConversion("-Infinity", -std::numeric_limits<float>::infinity());
   AssertConversion("Infinity", std::numeric_limits<float>::infinity());
@@ -166,6 +170,10 @@ TEST(StringConversion, ToDouble) {
   AssertConversion("0", 0);
   AssertConversion("-0.0", -0.0);
   AssertConversion("-1e100", -1e100);
+  AssertConversion("2e308", std::numeric_limits<double>::infinity());
+  AssertConversion("-2e308", -std::numeric_limits<double>::infinity());
+  AssertConversion("1e-325", 0.0);
+  AssertConversion("-1e-325", -0.0);
   AssertConversion("+Infinity", std::numeric_limits<double>::infinity());
   AssertConversion("-Infinity", -std::numeric_limits<double>::infinity());
   AssertConversion("Infinity", std::numeric_limits<double>::infinity());
@@ -185,6 +193,10 @@ TEST(StringConversion, ToHalfFloat) {
   AssertConversion("0", Float16(0.0f));
   AssertConversion("-0.0", Float16(-0.0f));
   AssertConversion("-1e15", Float16(-1e15));
+  AssertConversion("7e4", Float16::FromBits(0x7c00));
+  AssertConversion("-7e4", Float16::FromBits(0xfc00));
+  AssertConversion("1e-9", Float16(0.0f));
+  AssertConversion("-1e-9", Float16(-0.0f));
   AssertConversion("+Infinity", Float16::FromBits(0x7c00));
   AssertConversion("-Infinity", Float16::FromBits(0xfc00));
   AssertConversion("Infinity", Float16::FromBits(0x7c00));

From e4f922b1621b6c833f583cf26500f115ab5bc483 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Thu, 5 Feb 2026 09:50:09 +0100
Subject: [PATCH 29/36] GH-49138: [Packaging][Python] Remove nightly cython
 install from manylinux wheel dockerfile (#49139)

### Rationale for this change

We use nightly versions of Cython for the free-threaded PyArrow wheels and they are currently failing, see https://github.com/apache/arrow/issues/49138

### What changes are included in this PR?

The nightly Cython install is removed; Cython is installed via the [requirements file](https://github.com/apache/arrow/blob/main/python/requirements-wheel-build.txt#L2).

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.

* GitHub Issue: #49138

Authored-by: AlenkaF
Signed-off-by: AlenkaF
---
 ci/docker/python-wheel-manylinux.dockerfile |  5 -----
 ci/scripts/python_wheel_macos_build.sh      | 10 ----------
 2 files changed, 15 deletions(-)

diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile
index ffdd0d44f5f..54033500773 100644
--- a/ci/docker/python-wheel-manylinux.dockerfile
+++ b/ci/docker/python-wheel-manylinux.dockerfile
@@ -113,10 +113,5 @@ RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-${PYTHON_ABI_TAG
 SHELL ["/bin/bash", "-i", "-c"]
 ENTRYPOINT ["/bin/bash", "-i", "-c"]

-# Remove once there are released Cython wheels for 3.13 free-threaded available
-RUN if [ "${python_abi_tag}" = "cp313t" ]; then \
-      pip install cython --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" --prefer-binary ; \
-    fi
-
 COPY python/requirements-wheel-build.txt /arrow/python/
 RUN pip install -r /arrow/python/requirements-wheel-build.txt
diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh
index 2234fc6f310..0990a842e94 100755
--- a/ci/scripts/python_wheel_macos_build.sh
+++ b/ci/scripts/python_wheel_macos_build.sh
@@ -46,19 +46,9 @@ else
   exit 1
 fi

-echo "=== (${PYTHON_VERSION}) Install Python build dependencies ==="
-export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
-
-# Remove once there are released Cython wheels for 3.13 free-threaded available
-FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")"
-if [[ $FREE_THREADED_BUILD == "True" ]]; then
-  pip install cython --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" --prefer-binary
-fi
-
 pip install \
   --force-reinstall \
   --only-binary=:all: \
-  --target $PIP_SITE_PACKAGES \
   --upgrade \
   -r ${source_dir}/python/requirements-wheel-build.txt
 pip install "delocate>=0.10.3"

From 147bcd6d8f3fef05dd06968d3b60c17721c60334 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Fri, 6 Feb 2026 12:27:31 +0100
Subject: [PATCH 30/36] GH-49156: [Python] Require GIL for string comparison
 (#49161)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

With Cython 3.3.0.a0 this failed. After some discussion it seems that this
should always have required the GIL.

### What changes are included in this PR?

Moving the statement out of the `with nogil` context manager.

### Are these changes tested?

Existing CI builds pyarrow.

### Are there any user-facing changes?

No

* GitHub Issue: #49156

Authored-by: Raúl Cumplido
Signed-off-by: Raúl Cumplido
---
 python/pyarrow/table.pxi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 9136f252980..d8bdea76413 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -6316,8 +6316,8 @@ def concat_tables(tables, MemoryPool memory_pool=None, str promote_options="none
         "default" if promote_options == "none" else promote_options
     )
+    options.unify_schemas = promote_options != "none"
     with nogil:
-        options.unify_schemas = promote_options != "none"
         c_result_table = GetResultValue(
             ConcatenateTables(c_tables, options, pool))

From 1bea06ad4e14d75dd97a78a0148cd9cf6f4df0bc Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Wed, 28 Jan 2026 22:54:15 +0100
Subject: [PATCH 31/36] GH-49024: [CI] Update Debian version in `.env` (#49032)

### Rationale for this change

The default Debian version in `.env` now maps to oldstable; we should use
stable instead. Also prune entries that are not used anymore.

### Are these changes tested?

By existing CI jobs.

### Are there any user-facing changes?

No.

* GitHub Issue: #49024

Authored-by: Antoine Pitrou
Signed-off-by: Sutou Kouhei
---
 .env                                |   5 +-
 ci/docker/debian-12-cpp.dockerfile  | 149 ----------------------------
 ci/docker/debian-13-cpp.dockerfile  |  11 +-
 ci/docker/linux-apt-docs.dockerfile |   8 +-
 cpp/src/arrow/memory_pool_test.cc   |   8 +-
 dev/tasks/tasks.yml                 |  14 +--
 6 files changed, 18 insertions(+), 177 deletions(-)
 delete mode 100644 ci/docker/debian-12-cpp.dockerfile

diff --git a/.env b/.env
index 6f830ca229b..14ed93bfe9b 100644
--- a/.env
+++ b/.env
@@ -52,7 +52,7 @@ ULIMIT_CORE=-1
 # Default versions for platforms
 ALMALINUX=8
 ALPINE_LINUX=3.22
-DEBIAN=12
+DEBIAN=13
 FEDORA=42
 UBUNTU=22.04

 CLANG_TOOLS=18
 CMAKE=3.26.0
 CUDA=11.7.1
 DASK=latest
-DOTNET=8.0
 GCC=
 HDFS=3.2.1
 JDK=11
-KARTOTHEK=latest
 # LLVM 12 and GCC 11 reports -Wmismatched-new-delete.
 LLVM=18
 MAVEN=3.8.7
 PYTHON_IMAGE_TAG=3.10
 PYTHON_ABI_TAG=cp310
 R=4.5
 SPARK=master
-TURBODBC=latest

 # These correspond to images on Docker Hub that contain R, e.g. rhub/ubuntu-release:latest
 R_IMAGE=ubuntu-release

diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile
deleted file mode 100644
index 44c845bb17e..00000000000
--- a/ci/docker/debian-12-cpp.dockerfile
+++ /dev/null
@@ -1,149 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-ARG arch=amd64
-FROM ${arch}/debian:12
-ARG arch
-
-ENV DEBIAN_FRONTEND noninteractive
-
-ARG llvm
-RUN apt-get update -y -q && \
-    apt-get install -y -q --no-install-recommends \
-        apt-transport-https \
-        ca-certificates \
-        gnupg \
-        lsb-release \
-        wget && \
-    if [ ${llvm} -ge 17 ]; then \
-      wget -O /usr/share/keyrings/llvm-snapshot.asc \
-        https://apt.llvm.org/llvm-snapshot.gpg.key && \
-      (echo "Types: deb"; \
-       echo "URIs: https://apt.llvm.org/$(lsb_release --codename --short)/"; \
-       echo "Suites: llvm-toolchain-$(lsb_release --codename --short)-${llvm}"; \
-       echo "Components: main"; \
-       echo "Signed-By: /usr/share/keyrings/llvm-snapshot.asc") | \
-        tee /etc/apt/sources.list.d/llvm.sources; \
-    fi && \
-    apt-get update -y -q && \
-    apt-get install -y -q --no-install-recommends \
-        autoconf \
-        ccache \
-        clang-${llvm} \
-        cmake \
-        curl \
-        g++ \
-        gcc \
-        gdb \
-        git \
-        libbenchmark-dev \
-        libboost-filesystem-dev \
-        libboost-system-dev \
-        libbrotli-dev \
-        libbz2-dev \
-        libc-ares-dev \
-        libcurl4-openssl-dev \
-        libgflags-dev \
-        libgmock-dev \
-        libgoogle-glog-dev \
-        libgrpc++-dev \
-        libidn2-dev \
-        libkrb5-dev \
-        libldap-dev \
-        liblz4-dev \
-        libnghttp2-dev \
-        libprotobuf-dev \
-        libprotoc-dev \
-        libpsl-dev \
-        libre2-dev \
-        librtmp-dev \
-        libsnappy-dev \
-        libsqlite3-dev \
-        libssh-dev \
-        libssh2-1-dev \
-        libssl-dev \
-        libthrift-dev \
-        libutf8proc-dev \
-        libxml2-dev \
-        libzstd-dev \
-        llvm-${llvm}-dev \
-        make \
-        ninja-build \
-        nlohmann-json3-dev \
-        npm \
-        patch \
-        pkg-config \
-        protobuf-compiler-grpc \
-        python3-dev \
-        python3-pip \
-        python3-venv \
-        rapidjson-dev \
-        rsync \
-        tzdata \
-        zlib1g-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-COPY ci/scripts/install_minio.sh /arrow/ci/scripts/
-RUN /arrow/ci/scripts/install_minio.sh latest /usr/local
-
-COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/
-RUN /arrow/ci/scripts/install_gcs_testbench.sh default
-
-COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/
-RUN /arrow/ci/scripts/install_azurite.sh
-
-COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/
-RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin
-
-# Prioritize system packages and local installation.
-#
-# The following dependencies will be downloaded due to missing/invalid packages
-# provided by the distribution:
-#  - opentelemetry-cpp-dev is not packaged
-ENV ARROW_ACERO=ON \
-    ARROW_AZURE=ON \
-    ARROW_BUILD_TESTS=ON \
-    ARROW_DATASET=ON \
-    ARROW_DEPENDENCY_SOURCE=SYSTEM \
-    ARROW_DATASET=ON \
-    ARROW_FLIGHT=ON \
-    ARROW_FLIGHT_SQL=ON \
-    ARROW_GANDIVA=ON \
-    ARROW_GCS=ON \
-    ARROW_HOME=/usr/local \
-    ARROW_JEMALLOC=ON \
-    ARROW_ORC=ON \
-    ARROW_PARQUET=ON \
-    ARROW_S3=ON \
-    ARROW_SUBSTRAIT=ON \
-    ARROW_USE_CCACHE=ON \
-    ARROW_WITH_BROTLI=ON \
-    ARROW_WITH_BZ2=ON \
-    ARROW_WITH_LZ4=ON \
-    ARROW_WITH_OPENTELEMETRY=ON \
-    ARROW_WITH_SNAPPY=ON \
-    ARROW_WITH_ZLIB=ON \
-    ARROW_WITH_ZSTD=ON \
-    AWSSDK_SOURCE=BUNDLED \
-    Azure_SOURCE=BUNDLED \
-    google_cloud_cpp_storage_SOURCE=BUNDLED \
-    opentelemetry_cpp_SOURCE=BUNDLED \
-    ORC_SOURCE=BUNDLED \
-    PATH=/usr/lib/ccache/:$PATH \
-    PYTHON=python3 \
-    xsimd_SOURCE=BUNDLED

diff --git a/ci/docker/debian-13-cpp.dockerfile b/ci/docker/debian-13-cpp.dockerfile
index ca96b4177ff..1ea153f6872 100644
--- a/ci/docker/debian-13-cpp.dockerfile
+++ b/ci/docker/debian-13-cpp.dockerfile
@@ -55,26 +55,18 @@ RUN apt-get update -y -q && \
         libboost-system-dev \
         libbrotli-dev \
         libbz2-dev \
-        libc-ares-dev \
         libcurl4-openssl-dev \
         libgflags-dev \
         libgmock-dev \
         libgoogle-glog-dev \
         libgrpc++-dev \
-        libidn2-dev \
-        libkrb5-dev \
-        libldap-dev \
         liblz4-dev \
-        libnghttp2-dev \
+        libopentelemetry-proto-dev \
         libprotobuf-dev \
         libprotoc-dev \
-        libpsl-dev \
         libre2-dev \
-        librtmp-dev \
         libsnappy-dev \
         libsqlite3-dev \
-        libssh-dev \
-        libssh2-1-dev \
         libssl-dev \
         libthrift-dev \
         libutf8proc-dev \
@@ -96,6 +88,7 @@ RUN apt-get update -y -q && \
         rapidjson-dev \
         rsync \
         tzdata \
+        tzdata-legacy \
         zlib1g-dev && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*

diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile
index b9f7c716e52..52090f8bb82 100644
--- a/ci/docker/linux-apt-docs.dockerfile
+++ b/ci/docker/linux-apt-docs.dockerfile
@@ -31,11 +31,9 @@ RUN apt-get update -y && \
         lsb-release && \
     gpg --keyserver keyserver.ubuntu.com \
         --recv-key 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 && \
-    gpg --export 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 | \
-      gpg --no-default-keyring \
-          --keyring /usr/share/keyrings/cran.gpg \
-          --import - && \
-    echo "deb [signed-by=/usr/share/keyrings/cran.gpg] https://cloud.r-project.org/bin/linux/$(lsb_release -is | tr 'A-Z' 'a-z') $(lsb_release -cs)-cran40/" | \
+    gpg --armor --export 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 | \
+      tee /usr/share/keyrings/cran.asc && \
+    echo "deb [signed-by=/usr/share/keyrings/cran.asc] https://cloud.r-project.org/bin/linux/$(lsb_release -is | tr 'A-Z' 'a-z') $(lsb_release -cs)-cran40/" | \
       tee /etc/apt/sources.list.d/cran.list && \
     if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
       sed -i \

diff --git a/cpp/src/arrow/memory_pool_test.cc b/cpp/src/arrow/memory_pool_test.cc
index 20006ebeb49..0af1ed2d9ec 100644
--- a/cpp/src/arrow/memory_pool_test.cc
+++ b/cpp/src/arrow/memory_pool_test.cc
@@ -242,10 +242,10 @@ TEST(Jemalloc, GetAllocationStats) {

   // Check allocated stats change due to allocation
   ASSERT_NEAR(allocated - allocated0, 70000, 50000);
-  ASSERT_NEAR(active - active0, 100000, 90000);
-  ASSERT_NEAR(metadata - metadata0, 500, 460);
-  ASSERT_NEAR(resident - resident0, 120000, 110000);
-  ASSERT_NEAR(mapped - mapped0, 100000, 90000);
+  ASSERT_GE(active - active0, allocated - allocated0);
+  ASSERT_GT(metadata, metadata0);
+  ASSERT_GE(resident - resident0, allocated - allocated0);
+  ASSERT_GE(mapped - mapped0, allocated - allocated0);
   ASSERT_NEAR(retained - retained0, 0, 40000);

   ASSERT_NEAR(thread_peak_read - thread_peak_read0, 1024, 700);

diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index 931b6da784d..97843d2ef0c 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -451,7 +451,7 @@ tasks:
         flags: -e CC=gcc-14 -e CXX=g++-14 -e RapidJSON_SOURCE=BUNDLED
         image: ubuntu-cpp

-{% for debian_version in ["12"] %}
+{% for debian_version in ["13"] %}
   test-debian-{{ debian_version }}-cpp-amd64:
     ci: github
     template: docker-tests/github.linux.yml
@@ -589,23 +589,25 @@
       UBUNTU: 22.04
     image: ubuntu-python-313-freethreading

-  test-debian-12-python-3-amd64:
+{% for debian_version in ["13"] %}
+  test-debian-{{ debian_version }}-python-3-amd64:
     ci: github
     template: docker-tests/github.linux.yml
     params:
       env:
-        DEBIAN: 12
+        DEBIAN: "{{ debian_version }}"
       image: debian-python

-  test-debian-12-python-3-i386:
+  test-debian-{{ debian_version }}-python-3-i386:
     ci: github
     template: docker-tests/github.linux.yml
     params:
      env:
        ARCH: i386
-       DEBIAN: 12
+       DEBIAN: "{{ debian_version }}"
      flags: "-e ARROW_S3=OFF -e ARROW_GANDIVA=OFF"
      image: debian-python
+{% endfor %}

   test-ubuntu-22.04-python-3:
     ci: github
@@ -756,7 +758,7 @@
     template: r/github.macos.m1san.yml

   # be sure to update binary-task.rb when upgrading Debian
-  test-debian-12-docs:
+  test-debian-13-docs:
     ci: github
     template: docs/github.linux.yml
     params:

From 985621dbfcf3fd2061889e43c50b59825df84f3f Mon Sep 17 00:00:00 2001
From: Jonathan Keane
Date: Wed, 14 Jan 2026 16:20:27 -0600
Subject: [PATCH 32/36] GH-48817: [R][C++] Bump C++20 in R build
 infrastructure (#48819)

Resolves: #48817

### Rationale for this change

Keep R build infrastructure in line with our C++ version

### What changes are included in this PR?

Mostly `s/CXX17/CXX20/g`

### Are these changes tested?

Yes, lots of CI

### Are there any user-facing changes?

**This PR includes breaking changes to public APIs.**

**This PR contains a "Critical Fix".**
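Beyond the mechanical `CXX17`→`CXX20` swaps, the `r/src/compute.cpp` change below also works around a GCC 14 `-Wmaybe-uninitialized` false positive by pre-sizing the key vector instead of growing it. A minimal sketch of that pattern follows; `Key` and `Order` here are hypothetical stand-ins, not the real `arrow::compute::SortKey`/`arrow::compute::SortOrder` types:

```cpp
#include <cstddef>
#include <string>
#include <vector>

// Hypothetical stand-ins for the aliased Arrow types in r/src/compute.cpp.
enum class Order { Ascending, Descending };

struct Key {
  std::string name;
  Order order;
  Key(std::string n, Order o) : name(std::move(n)), order(o) {}
};

std::vector<Key> make_keys(const std::vector<std::string>& names,
                           const std::vector<int>& orders) {
  // Pre-size the vector and assign elements; this avoids the reallocation
  // code paths (push_back growth) that trigger the spurious
  // -Wmaybe-uninitialized warning under GCC 14.
  std::vector<Key> keys(names.size(), Key("", Order::Ascending));
  for (std::size_t i = 0; i < names.size(); i++) {
    keys[i] = Key(names[i], orders[i] > 0 ? Order::Descending : Order::Ascending);
  }
  return keys;
}
```

Both forms are functionally equivalent; the pre-sized version simply sidesteps the vector-growth machinery that confuses the compiler's analysis.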
* GitHub Issue: #48817

Authored-by: Jonathan Keane
Signed-off-by: Jonathan Keane
---
 compose.yaml            |  4 ++--
 r/DESCRIPTION           |  2 +-
 r/README.md             |  2 +-
 r/configure             | 22 +++++++---------------
 r/configure.win         |  8 --------
 r/src/Makevars.in       |  2 +-
 r/src/Makevars.ucrt     |  2 +-
 r/src/compute.cpp       |  9 +++++----
 r/tools/nixlibs.R       | 13 ++++++++-----
 r/vignettes/install.Rmd |  4 ++--
 10 files changed, 28 insertions(+), 40 deletions(-)

diff --git a/compose.yaml b/compose.yaml
index 13f446ff030..31bc5c81b95 100644
--- a/compose.yaml
+++ b/compose.yaml
@@ -1719,9 +1719,9 @@ services:
       cache_from:
         - ${REPO}:amd64-ubuntu-r-valgrind
       args:
-        base: wch1/r-debug:latest
+        base: rhub/valgrind:latest
         cmake: ${CMAKE}
-        r_bin: RDvalgrind
+        r_bin: R
         tz: ${TZ}
     environment:
       <<: [*common, *ccache, *sccache]

diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index 0ac5e36ea6d..3252e960c3a 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -28,7 +28,7 @@ URL: https://github.com/apache/arrow/, https://arrow.apache.org/docs/r/
 BugReports: https://github.com/apache/arrow/issues
 Encoding: UTF-8
 Language: en-US
-SystemRequirements: C++17; for AWS S3 support on Linux, libcurl and openssl (optional);
+SystemRequirements: C++20; for AWS S3 support on Linux, libcurl and openssl (optional);
     cmake >= 3.26 (build-time only, and only for full source build)
 Biarch: true
 Imports:

diff --git a/r/README.md b/r/README.md
index 1ab9206f119..bb5d137dc88 100644
--- a/r/README.md
+++ b/r/README.md
@@ -44,7 +44,7 @@ There are some special cases to note:
 - On Linux the installation process can sometimes be more involved because CRAN does not host binaries for Linux. For more information please see the [installation guide](https://arrow.apache.org/docs/r/articles/install.html).

-- If you are compiling arrow from source, please note that as of version 10.0.0, arrow requires C++17 to build. This has implications on Windows and CentOS 7. For Windows users it means you need to be running an R version of 4.0 or later. On CentOS 7, it means you need to install a newer compiler than the default system compiler gcc. See the [installation details article](https://arrow.apache.org/docs/r/articles/developers/install_details.html) for guidance.
+- If you are compiling arrow from source, please note that as of version 23.0.0, arrow requires C++20 to build. This has implications on Windows and CentOS 7. For Windows users it means you need to be running an R version of 4.3 or later (though R 4.2 has incomplete support and might work with special configuration). See the [installation details article](https://arrow.apache.org/docs/r/articles/developers/install_details.html) for guidance.

 - Development versions of arrow are released nightly. For information on how to install nightly builds please see the [installing nightly builds](https://arrow.apache.org/docs/r/articles/install_nightly.html) article.

diff --git a/r/configure b/r/configure
index f64a3673f97..9e92eb6b47f 100755
--- a/r/configure
+++ b/r/configure
@@ -86,10 +86,10 @@ if [ "$ARROW_R_DEV" = "true" ] && [ -f "data-raw/codegen.R" ]; then
   ${R_HOME}/bin/Rscript data-raw/codegen.R
 fi

-# Arrow requires C++17, so check for it
-if [ ! "`${R_HOME}/bin/R CMD config CXX17`" ]; then
+# Arrow requires C++20, so check for it
+if [ ! "`${R_HOME}/bin/R CMD config CXX20`" ]; then
   echo "------------------------- NOTE ---------------------------"
-  echo "Cannot install arrow: a C++17 compiler is required."
+  echo "Cannot install arrow: a C++20 compiler is required."
   echo "See https://arrow.apache.org/docs/r/articles/install.html"
   echo "---------------------------------------------------------"
   exit 1
@@ -260,14 +260,6 @@ set_pkg_vars () {
   if [ "$ARROW_R_CXXFLAGS" ]; then
     PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS"
   fi
-
-  # We use expr because the product version returns more than just 10.13 and we want to
-  # match the substring. However, expr always outputs the number of matched characters
-  # to stdout, to avoid noise in the log we redirect the output to /dev/null
-  if [ "$UNAME" = "Darwin" ] && expr $(sw_vers -productVersion) : '10\.13' >/dev/null 2>&1; then
-    # avoid C++17 availability warnings on macOS < 11
-    PKG_CFLAGS="$PKG_CFLAGS -D_LIBCPP_DISABLE_AVAILABILITY"
-  fi
 }

 # If we have pkg-config, it will tell us what libarrow needs
@@ -408,11 +400,11 @@ else
 fi

 # Test that we can compile something with those flags
-CXX17="`${R_HOME}/bin/R CMD config CXX17` -E"
-CXX17FLAGS=`"${R_HOME}"/bin/R CMD config CXX17FLAGS`
-CXX17STD=`"${R_HOME}"/bin/R CMD config CXX17STD`
+CXX20="`${R_HOME}/bin/R CMD config CXX20` -E"
+CXX20FLAGS=`"${R_HOME}"/bin/R CMD config CXX20FLAGS`
+CXX20STD=`"${R_HOME}"/bin/R CMD config CXX20STD`
 CPPFLAGS=`"${R_HOME}"/bin/R CMD config CPPFLAGS`
-TEST_CMD="${CXX17} ${CPPFLAGS} ${PKG_CFLAGS} ${CXX17FLAGS} ${CXX17STD} -xc++ -"
+TEST_CMD="${CXX20} ${CPPFLAGS} ${PKG_CFLAGS} ${CXX20FLAGS} ${CXX20STD} -xc++ -"
 TEST_ERROR=$(echo "#include $PKG_TEST_HEADER" | ${TEST_CMD} -o /dev/null 2>&1)

 if [ $? -eq 0 ]; then

diff --git a/r/configure.win b/r/configure.win
index 433ef28439a..16c5ec1bee8 100755
--- a/r/configure.win
+++ b/r/configure.win
@@ -117,14 +117,6 @@ set_pkg_vars () {
   if [ "$ARROW_R_CXXFLAGS" ]; then
     PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS"
   fi
-
-  # We use expr because the product version returns more than just 10.13 and we want to
-  # match the substring. However, expr always outputs the number of matched characters
-  # to stdout, to avoid noise in the log we redirect the output to /dev/null
-  if [ "$UNAME" = "Darwin" ] && expr $(sw_vers -productVersion) : '10\.13' >/dev/null 2>&1; then
-    # avoid C++17 availability warnings on macOS < 11
-    PKG_CFLAGS="$PKG_CFLAGS -D_LIBCPP_DISABLE_AVAILABILITY"
-  fi
 }

 # If we have pkg-config, it will tell us what libarrow needs

diff --git a/r/src/Makevars.in b/r/src/Makevars.in
index af0826faacb..1b7ad08e1cb 100644
--- a/r/src/Makevars.in
+++ b/r/src/Makevars.in
@@ -25,7 +25,7 @@ PKG_CPPFLAGS=@cflags@
 # https://bugs.llvm.org/show_bug.cgi?id=39191
 # https://www.mail-archive.com/gcc-bugs@gcc.gnu.org/msg534862.html
 # PKG_CXXFLAGS=$(CXX_VISIBILITY)
-CXX_STD=CXX17
+CXX_STD=CXX20
 PKG_LIBS=@libs@

 all: $(SHLIB) purify

diff --git a/r/src/Makevars.ucrt b/r/src/Makevars.ucrt
index a91dedc2d55..b72ed64d98e 100644
--- a/r/src/Makevars.ucrt
+++ b/r/src/Makevars.ucrt
@@ -19,4 +19,4 @@ CRT=-ucrt
 include Makevars.win

 # XXX for some reason, this variable doesn't seem propagated from Makevars.win
-CXX_STD=CXX17
+CXX_STD=CXX20

diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 0777ca8bc72..c8aa903bf06 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -162,12 +162,13 @@ std::shared_ptr make_compute_options(
     // false means descending, true means ascending
     // cpp11 does not support bool here so use int
     auto orders = cpp11::as_cpp<std::vector<int>>(options["orders"]);
-    std::vector<Key> keys;
+    // Use resize + assignment to avoid vector growth operations that trigger
+    // false positive -Wmaybe-uninitialized warnings in GCC 14 with std::variant
+    std::vector<Key> keys(names.size(), Key("", Order::Ascending));
     for (size_t i = 0; i < names.size(); i++) {
-      keys.push_back(
-          Key(names[i], (orders[i] > 0) ? Order::Descending : Order::Ascending));
+      keys[i] = Key(names[i], (orders[i] > 0) ? Order::Descending : Order::Ascending);
     }
-    auto out = std::make_shared<Options>(Options(keys));
+    auto out = std::make_shared<Options>(std::move(keys));
     return out;
   }

diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R
index 9d0a2604682..f4ccb4956a8 100644
--- a/r/tools/nixlibs.R
+++ b/r/tools/nixlibs.R
@@ -310,11 +310,11 @@ compile_test_program <- function(code) {
     openssl_dir <- paste0("-I", openssl_root_dir, "/include")
   }
   runner <- paste(
-    R_CMD_config("CXX17"),
+    R_CMD_config("CXX20"),
     openssl_dir,
     R_CMD_config("CPPFLAGS"),
-    R_CMD_config("CXX17FLAGS"),
-    R_CMD_config("CXX17STD"),
+    R_CMD_config("CXX20FLAGS"),
+    R_CMD_config("CXX20STD"),
     "-E",
     "-xc++"
   )
@@ -565,8 +565,11 @@ build_libarrow <- function(src_dir, dst_dir) {
     # is found, it will be used by the libarrow build, and this does
     # not affect how R compiles the arrow bindings.
     CC = sub("^.*ccache", "", R_CMD_config("CC")),
-    CXX = paste(sub("^.*ccache", "", R_CMD_config("CXX17")), R_CMD_config("CXX17STD")),
-    # CXXFLAGS = R_CMD_config("CXX17FLAGS"), # We don't want the same debug symbols
+    CXX = paste(
+      sub("^.*ccache", "", R_CMD_config("CXX20")),
+      R_CMD_config("CXX20STD")
+    ),
+    # CXXFLAGS = R_CMD_config("CXX20FLAGS"), # We don't want the same debug symbols
     LDFLAGS = R_CMD_config("LDFLAGS"),
     N_JOBS = ncores
   )

diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd
index 69780bd64df..d9cdcc3885c 100644
--- a/r/vignettes/install.Rmd
+++ b/r/vignettes/install.Rmd
@@ -23,8 +23,8 @@ but there are a few things to note.

 ### Compilers

-As of version 10.0.0, arrow requires a C++17 compiler to build.
-For `gcc`, this generally means version 7 or newer. Most contemporary Linux
Most contemporary Linux +As of version 22.0.0, arrow requires a C++20 compiler to build. +For `gcc`, this generally means version 10 or newer. Most contemporary Linux distributions have a new enough compiler; however, CentOS 7 is a notable exception, as it ships with gcc 4.8. From 4e16a1aeed83a65e6b49556c2fed8e9061cdf980 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 10 Feb 2026 18:35:43 +0900 Subject: [PATCH 33/36] GH-49159: [C++][Gandiva] Detect overflow in repeat() (#49160) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change `repeat()` can only generate `< 2147483647` size output. So output larger than `2147483647` must be rejected. ### What changes are included in this PR? Add overflow check in `repeat()`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #49159 Lead-authored-by: Sutou Kouhei Co-authored-by: Sutou Kouhei Signed-off-by: Raúl Cumplido --- cpp/src/gandiva/precompiled/string_ops.cc | 7 ++++++- cpp/src/gandiva/precompiled/string_ops_test.cc | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 0b31c769c99..3e786d1b112 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -841,7 +841,12 @@ const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_le *out_len = 0; return ""; } - *out_len = repeat_number * in_len; + if (ARROW_PREDICT_FALSE( + arrow::internal::MultiplyWithOverflow(repeat_number, in_len, out_len))) { + gdv_fn_context_set_error_msg(context, "Would overflow maximum output size"); + *out_len = 0; + return ""; + } char* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, *out_len)); if (ret == nullptr) { gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index aaa25db0a9f..c418f9077a7 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -387,6 +387,13 @@ TEST(TestStringOps, TestRepeat) { EXPECT_EQ(std::string(out_str, out_len), ""); EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Repeat number can't be negative")); ctx.Reset(); + + out_str = repeat_utf8_int32(ctx_ptr, "aa", 2, + std::numeric_limits::max() / 2 + 1, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_THAT(ctx.get_error(), + ::testing::HasSubstr("Would overflow maximum output size")); + ctx.Reset(); } TEST(TestStringOps, TestCastBoolToVarchar) { From 8f6e55736f60f1f95aee1e8765c6b75ad9589111 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 10 Feb 2026 11:44:55 +0100 Subject: [PATCH 34/36] MINOR: [Release] Update CHANGELOG.md for 23.0.1 --- CHANGELOG.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bd105ebc59..3e46901c999 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,42 @@ +# Apache Arrow 23.0.1 (2026-02-08 00:00:00+00:00) + +## Bug Fixes + +* [GH-48160](https://github.com/apache/arrow/issues/48160) - [C++][Gandiva] Pass CPU attributes to LLVM (#48161) +* [GH-48311](https://github.com/apache/arrow/issues/48311) - [C++] Fix OOB memory access in buffered IO (#48322) +* [GH-48637](https://github.com/apache/arrow/issues/48637) - [C++][FlightRPC] ODBC: Disable `absl` 
deadlock detection (#48747) +* [GH-48856](https://github.com/apache/arrow/issues/48856) - [Release] Update copyright NOTICE year to 2026 (#48857) +* [GH-48858](https://github.com/apache/arrow/issues/48858) - [C++][Parquet] Avoid re-serializing footer for signature verification (#48859) +* [GH-48861](https://github.com/apache/arrow/issues/48861) - [CI] Fix wrong `smtplib.SMTP.send_message` usage (#48876) +* [GH-48880](https://github.com/apache/arrow/issues/48880) - [Ruby] Fix a bug that Arrow::ExecutePlan nodes may be GC-ed (#48919) +* [GH-48885](https://github.com/apache/arrow/issues/48885) - [C++] Add missing curl dependency of `Arrow::arrow_static` CMake target (#48891) +* [GH-48894](https://github.com/apache/arrow/issues/48894) - [Python][C++] Use base Azure::Core::RequestFailedException instead of final Azure::Storage::StorageException and set minimum nodejs on conda env to 16 for Azurite to work (#48895) +* [GH-48900](https://github.com/apache/arrow/issues/48900) - [C++] Avoid memory blowup with excessive variadic buffer count in IPC (#48901) +* [GH-48961](https://github.com/apache/arrow/issues/48961) - [Docs][Python] Doctest fails on pandas 3.0 +* [GH-48965](https://github.com/apache/arrow/issues/48965) - [Python][C++] Compare unique_ptr for CFlightResult or CFlightInfo to nullptr instead of NULL (#48968) +* [GH-48966](https://github.com/apache/arrow/issues/48966) - [C++] Fix cookie duplication in the Flight SQL ODBC driver and the Flight Client (#48967) +* [GH-48983](https://github.com/apache/arrow/issues/48983) - [Packaging][Python] Build wheel from sdist using build and add check to validate LICENSE.txt and NOTICE.txt are part of the wheel contents (#48988) +* [GH-49003](https://github.com/apache/arrow/issues/49003) - [C++] Don't consider `out_of_range` an error in float parsing (#49095) +* [GH-49044](https://github.com/apache/arrow/issues/49044) - [CI][Python] Fix test_download_tzdata_on_windows by adding required user-agent on urllib request (#49052) +* [GH-49059](https://github.com/apache/arrow/issues/49059) - [C++] Fix issues found by OSS-Fuzz in IPC reader (#49060) +* [GH-49137](https://github.com/apache/arrow/issues/49137) - [CI][Release] macOS conda source verification jobs fail to build Arrow C++ +* [GH-49138](https://github.com/apache/arrow/issues/49138) - [Packaging][Python] Remove nightly cython install from manylinux wheel dockerfile (#49139) +* [GH-49156](https://github.com/apache/arrow/issues/49156) - [Python] Require GIL for string comparison (#49161) +* [GH-49159](https://github.com/apache/arrow/issues/49159) - [C++][Gandiva] Detect overflow in repeat() (#49160) + + +## New Features and Improvements + +* [GH-48623](https://github.com/apache/arrow/issues/48623) - [CI][Archery][Dev] Add missing headers to email reports (#48624) +* [GH-48817](https://github.com/apache/arrow/issues/48817) - [R][C++] Bump C++20 in R build infrastructure (#48819) +* [GH-48844](https://github.com/apache/arrow/issues/48844) - [C++] Check IPC Message body length consistency in IPC file (#48845) +* [GH-48924](https://github.com/apache/arrow/issues/48924) - [C++][CI] Fix pre-buffering issues in IPC file reader (#48925) +* [GH-48973](https://github.com/apache/arrow/issues/48973) - [R][C++] Fix RE2 compilation errors under C++20 (#48976) +* [GH-49024](https://github.com/apache/arrow/issues/49024) - [CI] Update Debian version in `.env` (#49032) + + + # Apache Arrow 23.0.0 (2026-01-12 00:00:00+00:00) ## Bug Fixes From c1ae37c4a597f466b1806e65a9e011be1060dfc5 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 10 Feb 2026 11:45:02 +0100 Subject: [PATCH 35/36] MINOR: [Release] Update .deb/.rpm changelogs for 23.0.1 --- .../linux-packages/apache-arrow-apt-source/debian/changelog | 6 ++++++ .../apache-arrow-release/yum/apache-arrow-release.spec.in | 3 +++ dev/tasks/linux-packages/apache-arrow/debian/changelog | 6 ++++++ dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in | 3 +++ 4 files changed, 18 insertions(+) diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index 6c99f51ee2d..23155047455 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow-apt-source (23.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 10 Feb 2026 10:45:01 -0000 + apache-arrow-apt-source (23.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index 0579df694f0..50f67825367 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -85,6 +85,9 @@ else fi %changelog +* Tue Feb 10 2026 Raúl Cumplido - 23.0.1-1 +- New upstream release. + * Tue Jan 13 2026 Raúl Cumplido - 23.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index 0f18ddaefda..8fae632bbc9 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (23.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 10 Feb 2026 10:45:01 -0000 + apache-arrow (23.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 7bf8bd556a9..894b56d5244 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -877,6 +877,9 @@ Documentation for Apache Parquet GLib. %endif %changelog +* Tue Feb 10 2026 Raúl Cumplido - 23.0.1-1 +- New upstream release. + * Tue Jan 13 2026 Raúl Cumplido - 23.0.0-1 - New upstream release. 
From 82a374e5f3de5b744f26591e6cd96de6349c76d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Tue, 10 Feb 2026 11:45:08 +0100
Subject: [PATCH 36/36] MINOR: [Release] Update versions for 23.0.1

---
 c_glib/meson.build                                        | 2 +-
 c_glib/vcpkg.json                                         | 2 +-
 ci/scripts/PKGBUILD                                       | 2 +-
 cpp/CMakeLists.txt                                        | 2 +-
 cpp/meson.build                                           | 2 +-
 cpp/vcpkg.json                                            | 2 +-
 dev/tasks/homebrew-formulae/apache-arrow-glib.rb          | 2 +-
 dev/tasks/homebrew-formulae/apache-arrow.rb               | 2 +-
 matlab/CMakeLists.txt                                     | 2 +-
 python/CMakeLists.txt                                     | 2 +-
 python/pyproject.toml                                     | 2 +-
 r/DESCRIPTION                                             | 2 +-
 r/NEWS.md                                                 | 2 +-
 r/pkgdown/assets/versions.html                            | 4 ++--
 r/pkgdown/assets/versions.json                            | 4 ++--
 ruby/red-arrow-cuda/lib/arrow-cuda/version.rb             | 2 +-
 ruby/red-arrow-dataset/lib/arrow-dataset/version.rb       | 2 +-
 ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb | 2 +-
 ruby/red-arrow-flight/lib/arrow-flight/version.rb         | 2 +-
 ruby/red-arrow-format/lib/arrow-format/version.rb         | 2 +-
 ruby/red-arrow/lib/arrow/version.rb                       | 2 +-
 ruby/red-gandiva/lib/gandiva/version.rb                   | 2 +-
 ruby/red-parquet/lib/parquet/version.rb                   | 2 +-
 23 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/c_glib/meson.build b/c_glib/meson.build
index ef020350748..77b64cfbe54 100644
--- a/c_glib/meson.build
+++ b/c_glib/meson.build
@@ -32,7 +32,7 @@ project(
     # * 22.04: 0.61.2
     # * 24.04: 1.3.2
     meson_version: '>=0.61.2',
-    version: '23.0.0',
+    version: '23.0.1',
 )

 version = meson.project_version()

diff --git a/c_glib/vcpkg.json b/c_glib/vcpkg.json
index b7aa1ce8863..c7276a6e681 100644
--- a/c_glib/vcpkg.json
+++ b/c_glib/vcpkg.json
@@ -1,6 +1,6 @@
 {
   "name": "arrow-glib",
-  "version-string": "23.0.0",
+  "version-string": "23.0.1",
   "$comment:dependencies": "We can enable gobject-introspection again once it's updated",
   "dependencies": [
     "glib",

diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD
index ff95e15c2f7..8ace741403e 100644
--- a/ci/scripts/PKGBUILD
+++ b/ci/scripts/PKGBUILD
@@ -18,7 +18,7 @@
 _realname=arrow
 pkgbase=mingw-w64-${_realname}
 pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}"
-pkgver=23.0.0
+pkgver=23.0.1
 pkgrel=8000
 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)"
 arch=("any")

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f3e0105262e..d9e518b786b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -96,7 +96,7 @@ if(POLICY CMP0170)
   cmake_policy(SET CMP0170 NEW)
 endif()

-set(ARROW_VERSION "23.0.0")
+set(ARROW_VERSION "23.0.1")

 string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}")

diff --git a/cpp/meson.build b/cpp/meson.build
index 30623eb6541..5632367cb95 100644
--- a/cpp/meson.build
+++ b/cpp/meson.build
@@ -19,7 +19,7 @@
 project(
     'arrow',
     'cpp',
     'c',
-    version: '23.0.0',
+    version: '23.0.1',
     license: 'Apache-2.0',
     meson_version: '>=1.3.0',
     default_options: ['c_std=c11', 'warning_level=2', 'cpp_std=c++20'],

diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json
index 07d7344e0bc..3d636798234 100644
--- a/cpp/vcpkg.json
+++ b/cpp/vcpkg.json
@@ -1,6 +1,6 @@
 {
   "name": "arrow",
-  "version-string": "23.0.0",
+  "version-string": "23.0.1",
   "dependencies": [
     "abseil",
     {

diff --git a/dev/tasks/homebrew-formulae/apache-arrow-glib.rb b/dev/tasks/homebrew-formulae/apache-arrow-glib.rb
index 035fa7b1b84..71737d86453 100644
--- a/dev/tasks/homebrew-formulae/apache-arrow-glib.rb
+++ b/dev/tasks/homebrew-formulae/apache-arrow-glib.rb
@@ -29,7 +29,7 @@ class ApacheArrowGlib < Formula
   desc "GLib bindings for Apache Arrow"
   homepage "https://arrow.apache.org/"
-  url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.0/apache-arrow-23.0.0.tar.gz"
+  url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.1/apache-arrow-23.0.1.tar.gz"
   sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28"
   license "Apache-2.0"
   head "https://github.com/apache/arrow.git", branch: "main"

diff --git a/dev/tasks/homebrew-formulae/apache-arrow.rb b/dev/tasks/homebrew-formulae/apache-arrow.rb
index a6ee05289f2..e14c0473a6c 100644
--- a/dev/tasks/homebrew-formulae/apache-arrow.rb
+++ b/dev/tasks/homebrew-formulae/apache-arrow.rb
@@ -29,7 +29,7 @@ class ApacheArrow < Formula
   desc "Columnar in-memory analytics layer designed to accelerate big data"
   homepage "https://arrow.apache.org/"
-  url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.0/apache-arrow-23.0.0.tar.gz"
+  url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.1/apache-arrow-23.0.1.tar.gz"
   sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28"
   license "Apache-2.0"
   head "https://github.com/apache/arrow.git", branch: "main"

diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt
index dbcc4edf792..7c9cd3b5017 100644
--- a/matlab/CMakeLists.txt
+++ b/matlab/CMakeLists.txt
@@ -100,7 +100,7 @@ endfunction()

 set(CMAKE_CXX_STANDARD 20)

-set(MLARROW_VERSION "23.0.0")
+set(MLARROW_VERSION "23.0.1")

 string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" MLARROW_BASE_VERSION "${MLARROW_VERSION}")

 project(mlarrow VERSION "${MLARROW_BASE_VERSION}")

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index d550796a7af..bf71387bcd1 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -28,7 +28,7 @@ project(pyarrow)
 # which in turn meant that Py_GIL_DISABLED was not set.
 set(CMAKE_NO_SYSTEM_FROM_IMPORTED ON)

-set(PYARROW_VERSION "23.0.0")
+set(PYARROW_VERSION "23.0.1")

 string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" PYARROW_BASE_VERSION "${PYARROW_VERSION}")

 # Generate SO version and full SO version

diff --git a/python/pyproject.toml b/python/pyproject.toml
index f137a79c832..45c52cc0c4d 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -95,4 +95,4 @@ root = '..'
 version_file = 'pyarrow/_generated_version.py'
 version_scheme = 'guess-next-dev'
 git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"'
-fallback_version = '23.0.0'
+fallback_version = '23.0.1'

diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index 3252e960c3a..a21d2daacd1 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: arrow
 Title: Integration to 'Apache' 'Arrow'
-Version: 23.0.0
+Version: 23.0.1
 Authors@R: c(
     person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")),
     person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")),

diff --git a/r/NEWS.md b/r/NEWS.md
index 3af9e1185e4..abfafffb2e2 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -17,7 +17,7 @@
 under the License.
 -->

-# arrow 23.0.0
+# arrow 23.0.1

 # arrow 22.0.0.1

 ## Minor improvements and fixes

diff --git a/r/pkgdown/assets/versions.html b/r/pkgdown/assets/versions.html
index 76c30f8f252..e9fdd50a347 100644
--- a/r/pkgdown/assets/versions.html
+++ b/r/pkgdown/assets/versions.html
@@ -1,7 +1,7 @@
-23.0.0.9000 (dev)
-23.0.0 (release)
+23.0.1.9000 (dev)
+23.0.1 (release)
 22.0.0
 21.0.0
 20.0.0
diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json
index 8b2f0471fe5..7d22213ef3b 100644
--- a/r/pkgdown/assets/versions.json
+++ b/r/pkgdown/assets/versions.json
@@ -1,10 +1,10 @@
 [
   {
-    "name": "23.0.0.9000 (dev)",
+    "name": "23.0.1.9000 (dev)",
     "version": "dev/"
   },
   {
-    "name": "23.0.0 (release)",
+    "name": "23.0.1 (release)",
     "version": ""
   },
   {

diff --git a/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb
index 4cef86c65fa..c18ab9ac467 100644
--- a/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb
+++ b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb
@@ -16,7 +16,7 @@
 # under the License.

 module ArrowCUDA
-  VERSION = "23.0.0"
+  VERSION = "23.0.1"

   module Version
     numbers, TAG = VERSION.split("-")

diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb
index 3b0c83b3c8d..f2333c67c63 100644
--- a/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb
+++ b/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb
@@ -16,7 +16,7 @@
 # under the License.

 module ArrowDataset
-  VERSION = "23.0.0"
+  VERSION = "23.0.1"

   module Version
     numbers, TAG = VERSION.split("-")

diff --git a/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb b/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb
index 4337f4bc1c7..891db90d732 100644
--- a/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb
+++ b/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb
@@ -16,7 +16,7 @@
 # under the License.

 module ArrowFlightSQL
-  VERSION = "23.0.0"
+  VERSION = "23.0.1"

   module Version
     numbers, TAG = VERSION.split("-")

diff --git a/ruby/red-arrow-flight/lib/arrow-flight/version.rb b/ruby/red-arrow-flight/lib/arrow-flight/version.rb
index 69fcc9e667b..25063a59d99 100644
--- a/ruby/red-arrow-flight/lib/arrow-flight/version.rb
+++ b/ruby/red-arrow-flight/lib/arrow-flight/version.rb
@@ -16,7 +16,7 @@
 # under the License.

 module ArrowFlight
-  VERSION = "23.0.0"
+  VERSION = "23.0.1"

   module Version
     numbers, TAG = VERSION.split("-")

diff --git a/ruby/red-arrow-format/lib/arrow-format/version.rb b/ruby/red-arrow-format/lib/arrow-format/version.rb
index 0d1bb36ce1f..6fccd13e71b 100644
--- a/ruby/red-arrow-format/lib/arrow-format/version.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/version.rb
@@ -16,7 +16,7 @@
 # under the License.

 module ArrowFormat
-  VERSION = "23.0.0"
+  VERSION = "23.0.1"

   module Version
     numbers, TAG = VERSION.split("-")

diff --git a/ruby/red-arrow/lib/arrow/version.rb b/ruby/red-arrow/lib/arrow/version.rb
index 1f74a5960af..9a94c971989 100644
--- a/ruby/red-arrow/lib/arrow/version.rb
+++ b/ruby/red-arrow/lib/arrow/version.rb
@@ -16,7 +16,7 @@
 # under the License.

 module Arrow
-  VERSION = "23.0.0"
+  VERSION = "23.0.1"

   module Version
     numbers, TAG = VERSION.split("-")

diff --git a/ruby/red-gandiva/lib/gandiva/version.rb b/ruby/red-gandiva/lib/gandiva/version.rb
index afef421030e..f958522a08f 100644
--- a/ruby/red-gandiva/lib/gandiva/version.rb
+++ b/ruby/red-gandiva/lib/gandiva/version.rb
@@ -16,7 +16,7 @@
 # under the License.

 module Gandiva
-  VERSION = "23.0.0"
+  VERSION = "23.0.1"

   module Version
     numbers, TAG = VERSION.split("-")

diff --git a/ruby/red-parquet/lib/parquet/version.rb b/ruby/red-parquet/lib/parquet/version.rb
index ccce6defe4c..325c045a8f7 100644
--- a/ruby/red-parquet/lib/parquet/version.rb
+++ b/ruby/red-parquet/lib/parquet/version.rb
@@ -16,7 +16,7 @@
 # under the License.

 module Parquet
-  VERSION = "23.0.0"
+  VERSION = "23.0.1"

   module Version
     numbers, TAG = VERSION.split("-")
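
Postscript on the overflow guard from GH-49159 (patch 33/36 above): a minimal, self-contained sketch of the same check. It assumes the GCC/Clang `__builtin_mul_overflow` intrinsic in place of Arrow's internal `MultiplyWithOverflow` helper; both report overflow via their boolean result.

```cpp
#include <cstdint>
#include <cstdio>

// Sketch of the guard added to repeat_utf8_int32(): compute the output size
// with an overflow-checked multiply instead of a plain "repeat * in_len".
// __builtin_mul_overflow returns true when the product does not fit int32_t.
bool checked_output_size(int32_t repeat_number, int32_t in_len, int32_t* out_len) {
  if (__builtin_mul_overflow(repeat_number, in_len, out_len)) {
    *out_len = 0;  // mirror the patch: zero the length and signal the error
    return false;
  }
  return true;
}

int main() {
  int32_t out_len = 0;
  // A 2-byte input repeated (INT32_MAX / 2 + 1) times would exceed INT32_MAX,
  // matching the case exercised in string_ops_test.cc above.
  if (!checked_output_size(INT32_MAX / 2 + 1, 2, &out_len)) {
    std::printf("Would overflow maximum output size\n");
  }
  return 0;
}
```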