diff --git a/.env b/.env index dad867f8f66..14ed93bfe9b 100644 --- a/.env +++ b/.env @@ -52,7 +52,7 @@ ULIMIT_CORE=-1 # Default versions for platforms ALMALINUX=8 ALPINE_LINUX=3.22 -DEBIAN=12 +DEBIAN=13 FEDORA=42 UBUNTU=22.04 @@ -61,11 +61,9 @@ CLANG_TOOLS=18 CMAKE=3.26.0 CUDA=11.7.1 DASK=latest -DOTNET=8.0 GCC= HDFS=3.2.1 JDK=11 -KARTOTHEK=latest # LLVM 12 and GCC 11 reports -Wmismatched-new-delete. LLVM=18 MAVEN=3.8.7 @@ -79,7 +77,6 @@ PYTHON_IMAGE_TAG=3.10 PYTHON_ABI_TAG=cp310 R=4.5 SPARK=master -TURBODBC=latest # These correspond to images on Docker Hub that contain R, e.g. rhub/ubuntu-release:latest R_IMAGE=ubuntu-release @@ -102,8 +99,8 @@ VCPKG="4334d8b4c8916018600212ab4dd4bbdc343065d1" # 2025.09.17 Release # ci/docker/python-*-windows-*.dockerfile or the vcpkg config. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2025-10-13 -PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2025-10-13 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2026-01-27 +PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2026-01-27 # Use conanio/${CONAN_BASE}:{CONAN_VERSION} for "docker compose run --rm conan". # See https://github.com/conan-io/conan-docker-tools#readme and diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index 7ab4c73270d..7844b0b0112 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -39,6 +39,7 @@ on: - 'ci/scripts/util_*' - 'cpp/**' - 'compose.yaml' + - 'dev/archery/archery/**' - 'format/Flight.proto' - 'testing' tags: @@ -61,6 +62,7 @@ on: - 'ci/scripts/util_*' - 'cpp/**' - 'compose.yaml' + - 'dev/archery/archery/**' - 'format/Flight.proto' - 'testing' types: diff --git a/.github/workflows/cuda_extra.yml b/.github/workflows/cuda_extra.yml new file mode 100644 index 00000000000..1700d6a8456 --- /dev/null +++ b/.github/workflows/cuda_extra.yml @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: CUDA Extra + +on: + push: + tags: + - '**' + pull_request: + types: + - labeled + - opened + - reopened + - synchronize + schedule: + - cron: | + 0 6 * * * + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +permissions: + actions: read + contents: read + pull-requests: read + +jobs: + check-labels: + if: github.event_name != 'schedule' || github.repository == 'apache/arrow' + uses: ./.github/workflows/check_labels.yml + secrets: inherit + with: + parent-workflow: cuda_extra + + docker: + needs: check-labels + name: ${{ matrix.title }} + runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64/spot=capacity-optimized" + if: >- + needs.check-labels.outputs.force == 'true' || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra') || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra: CUDA') + timeout-minutes: 75 + strategy: + fail-fast: false + matrix: + include: + - cuda: 12.9.0 + ubuntu: 24.04 + image: ubuntu-cuda-cpp + title: AMD64 Ubuntu 24 CUDA 12.9.0 + - cuda: 11.7.1 + ubuntu: 22.04 + image: ubuntu-cuda-cpp + title: AMD64 Ubuntu 22 CUDA 11.7.1 + - cuda: 12.9.0 + ubuntu: 24.04 + image: ubuntu-cuda-python + title: AMD64 Ubuntu 24 CUDA 12.9.0 Python + - cuda: 11.7.1 + ubuntu: 22.04 + image: ubuntu-cuda-python + title: AMD64 Ubuntu 22 CUDA 11.7.1 Python + env: + ARCHERY_DEBUG: 1 + ARROW_ENABLE_TIMING_TESTS: OFF + DOCKER_VOLUME_PREFIX: ".docker/" + steps: + - name: Checkout Arrow + uses: actions/checkout@v6 + with: + fetch-depth: 0 + submodules: recursive + - name: Cache Docker Volumes + uses: actions/cache@v5 + with: + path: .docker + key: extra-${{ matrix.image }}-${{ hashFiles('cpp/**') }} + restore-keys: extra-${{ matrix.image }}- + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: 3 + - name: Setup Archery + run: python3 -m pip install -e dev/archery[docker] + - name: Display NVIDIA SMI details + run: | + nvidia-smi + nvidia-smi -L + nvidia-smi -q -d Memory + - name: Execute Docker Build + continue-on-error: ${{ matrix.continue-on-error || false }} + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + CUDA: ${{ matrix.cuda }} + UBUNTU: ${{ matrix.ubuntu }} + run: | + # GH-40558: reduce ASLR to avoid ASAN/LSAN crashes + sudo sysctl -w vm.mmap_rnd_bits=28 + source ci/scripts/util_enable_core_dumps.sh + archery docker run ${{ matrix.run-options || '' }} ${{ matrix.image }} + - name: Docker Push + if: >- + success() && + github.event_name == 'push' && + github.repository == 'apache/arrow' && + github.ref_name == 'main' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true + run: archery docker push ${{ matrix.image }} + + report-extra-cpp: + if: github.event_name == 'schedule' && always() + needs: + - docker + uses: ./.github/workflows/report_ci.yml + secrets: inherit diff --git a/.github/workflows/package_linux.yml b/.github/workflows/package_linux.yml index 3e4b7592153..4dc9a70e879 100644 --- a/.github/workflows/package_linux.yml +++ b/.github/workflows/package_linux.yml @@ -29,6 +29,7 @@ on: - '.github/workflows/report_ci.yml' - 'cpp/**' - 'c_glib/**' + - 'dev/archery/archery/**' - 'dev/release/binary-task.rb' - 'dev/release/verify-apt.sh' - 'dev/release/verify-yum.sh' @@ -43,6 +44,7 @@ on: - 
'.github/workflows/report_ci.yml' - 'cpp/**' - 'c_glib/**' + - 'dev/archery/archery/**' - 'dev/release/binary-task.rb' - 'dev/release/verify-apt.sh' - 'dev/release/verify-yum.sh' @@ -230,6 +232,8 @@ jobs: ${GITHUB_REF_NAME} \ release_candidate.yml - name: Build + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | pushd dev/tasks/linux-packages rake docker:pull || : diff --git a/.github/workflows/r_extra.yml b/.github/workflows/r_extra.yml index 687a4e0aa05..443d2354d7f 100644 --- a/.github/workflows/r_extra.yml +++ b/.github/workflows/r_extra.yml @@ -27,15 +27,16 @@ on: - '.github/workflows/check_labels.yml' - '.github/workflows/r_extra.yml' - '.github/workflows/report_ci.yml' - - "ci/docker/**" - - "ci/etc/rprofile" - - "ci/scripts/PKGBUILD" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/install_minio.sh" - - "ci/scripts/r_*.sh" - - "cpp/**" - - "compose.yaml" - - "r/**" + - 'ci/docker/**' + - 'ci/etc/rprofile' + - 'ci/scripts/PKGBUILD' + - 'ci/scripts/cpp_*.sh' + - 'ci/scripts/install_minio.sh' + - 'ci/scripts/r_*.sh' + - 'cpp/**' + - 'compose.yaml' + - 'dev/archery/archery/**' + - 'r/**' tags: - '**' pull_request: @@ -44,15 +45,16 @@ on: - '.github/workflows/check_labels.yml' - '.github/workflows/r_extra.yml' - '.github/workflows/report_ci.yml' - - "ci/docker/**" - - "ci/etc/rprofile" - - "ci/scripts/PKGBUILD" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/install_minio.sh" - - "ci/scripts/r_*.sh" - - "cpp/**" - - "compose.yaml" - - "r/**" + - 'ci/docker/**' + - 'ci/etc/rprofile' + - 'ci/scripts/PKGBUILD' + - 'ci/scripts/cpp_*.sh' + - 'ci/scripts/install_minio.sh' + - 'ci/scripts/r_*.sh' + - 'cpp/**' + - 'compose.yaml' + - 'dev/archery/archery/**' + - 'r/**' types: - labeled - opened diff --git a/.github/workflows/release_candidate.yml b/.github/workflows/release_candidate.yml index e4849beeeb9..57620831bc5 100644 --- a/.github/workflows/release_candidate.yml +++ b/.github/workflows/release_candidate.yml @@ -25,12 +25,12 @@ on: tags: - "apache-arrow-*-rc*" paths: - - ".github/workflows/release_candidate.sh" + - ".github/workflows/release_candidate.yml" - "dev/release/utils-create-release-tarball.sh" - "dev/release/utils-generate-checksum.sh" pull_request: paths: - - ".github/workflows/release_candidate.sh" + - ".github/workflows/release_candidate.yml" - "dev/release/utils-create-release-tarball.sh" - "dev/release/utils-generate-checksum.sh" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 186277edf40..da84abed0d9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -353,7 +353,7 @@ repos: ?^cpp/examples/minimal_build/run\.sh$| ?^cpp/examples/tutorial_examples/run\.sh$| ?^dev/release/05-binary-upload\.sh$| - ?^dev/release/07-binary-verify\.sh$| + ?^dev/release/08-binary-verify\.sh$| ?^dev/release/binary-recover\.sh$| ?^dev/release/post-03-binary\.sh$| ?^dev/release/post-08-docs\.sh$| diff --git a/CHANGELOG.md b/CHANGELOG.md index 6101f5d3cac..3e46901c999 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,394 @@ +# Apache Arrow 23.0.1 (2026-02-08 00:00:00+00:00) + +## Bug Fixes + +* [GH-48160](https://github.com/apache/arrow/issues/48160) - [C++][Gandiva] Pass CPU attributes to LLVM (#48161) +* [GH-48311](https://github.com/apache/arrow/issues/48311) - [C++] Fix OOB memory access in buffered IO (#48322) +* [GH-48637](https://github.com/apache/arrow/issues/48637) - [C++][FlightRPC] ODBC: Disable `absl` deadlock detection (#48747) +* [GH-48856](https://github.com/apache/arrow/issues/48856) - [Release] Update copyright NOTICE year to 2026 (#48857) +* 
[GH-48858](https://github.com/apache/arrow/issues/48858) - [C++][Parquet] Avoid re-serializing footer for signature verification (#48859) +* [GH-48861](https://github.com/apache/arrow/issues/48861) - [CI] Fix wrong `smtplib.SMTP.send_message` usage (#48876) +* [GH-48880](https://github.com/apache/arrow/issues/48880) - [Ruby] Fix a bug that Arrow::ExecutePlan nodes may be GC-ed (#48919) +* [GH-48885](https://github.com/apache/arrow/issues/48885) - [C++] Add missing curl dependency of `Arrow::arrow_static` CMake target (#48891) +* [GH-48894](https://github.com/apache/arrow/issues/48894) - [Python][C++] Use base Azure::Core::RequestFailedException instead of final Azure::Storage::StorageException and set minimum nodejs on conda env to 16 for Azurite to work (#48895) +* [GH-48900](https://github.com/apache/arrow/issues/48900) - [C++] Avoid memory blowup with excessive variadic buffer count in IPC (#48901) +* [GH-48961](https://github.com/apache/arrow/issues/48961) - [Docs][Python] Doctest fails on pandas 3.0 +* [GH-48965](https://github.com/apache/arrow/issues/48965) - [Python][C++] Compare unique_ptr for CFlightResult or CFlightInfo to nullptr instead of NULL (#48968) +* [GH-48966](https://github.com/apache/arrow/issues/48966) - [C++] Fix cookie duplication in the Flight SQL ODBC driver and the Flight Client (#48967) +* [GH-48983](https://github.com/apache/arrow/issues/48983) - [Packaging][Python] Build wheel from sdist using build and add check to validate LICENSE.txt and NOTICE.txt are part of the wheel contents (#48988) +* [GH-49003](https://github.com/apache/arrow/issues/49003) - [C++] Don't consider `out_of_range` an error in float parsing (#49095) +* [GH-49044](https://github.com/apache/arrow/issues/49044) - [CI][Python] Fix test_download_tzdata_on_windows by adding required user-agent on urllib request (#49052) +* [GH-49059](https://github.com/apache/arrow/issues/49059) - [C++] Fix issues found by OSS-Fuzz in IPC reader (#49060) +* [GH-49137](https://github.com/apache/arrow/issues/49137) - [CI][Release] macOS conda source verification jobs fail to build Arrow C++ +* [GH-49138](https://github.com/apache/arrow/issues/49138) - [Packaging][Python] Remove nightly cython install from manylinux wheel dockerfile (#49139) +* [GH-49156](https://github.com/apache/arrow/issues/49156) - [Python] Require GIL for string comparison (#49161) +* [GH-49159](https://github.com/apache/arrow/issues/49159) - [C++][Gandiva] Detect overflow in repeat() (#49160) + + +## New Features and Improvements + +* [GH-48623](https://github.com/apache/arrow/issues/48623) - [CI][Archery][Dev] Add missing headers to email reports (#48624) +* [GH-48817](https://github.com/apache/arrow/issues/48817) - [R][C++] Bump C++20 in R build infrastructure (#48819) +* [GH-48844](https://github.com/apache/arrow/issues/48844) - [C++] Check IPC Message body length consistency in IPC file (#48845) +* [GH-48924](https://github.com/apache/arrow/issues/48924) - [C++][CI] Fix pre-buffering issues in IPC file reader (#48925) +* [GH-48973](https://github.com/apache/arrow/issues/48973) - [R][C++] Fix RE2 compilation errors under C++20 (#48976) +* [GH-49024](https://github.com/apache/arrow/issues/49024) - [CI] Update Debian version in `.env` (#49032) + + + +# Apache Arrow 23.0.0 (2026-01-12 00:00:00+00:00) + +## Bug Fixes + +* [GH-33473](https://github.com/apache/arrow/issues/33473) - [Python] Fix KeyError on Pandas roundtrip with RangeIndex in MultiIndex (#39983) +* [GH-35957](https://github.com/apache/arrow/issues/35957) - [C++][Compute] 
Graceful error for decimal binary arithmetic and comparison instead of firing confusing assertion (#48639)
+* [GH-41246](https://github.com/apache/arrow/issues/41246) - [C++][Python] Simplify nested field encryption configuration (#45462)
+* [GH-42173](https://github.com/apache/arrow/issues/42173) - [R][C++] Writing partitioned dataset on S3 fails if ListBucket is not allowed for the user (#47599)
+* [GH-43660](https://github.com/apache/arrow/issues/43660) - [C++][Compute] Avoid ZeroCopyCastExec when casting Binary offset -> Binary offset types (#48171)
+* [GH-44318](https://github.com/apache/arrow/issues/44318) - [C++][Python] Fix RecordBatch::FromStructArray for sliced arrays with offset = 0 (#47843)
+* [GH-45260](https://github.com/apache/arrow/issues/45260) - [R][Docs] Improve documentation on GCS support
+* [GH-45867](https://github.com/apache/arrow/issues/45867) - [Python] Fix `SetuptoolsDeprecationWarning` (#47141)
+* [GH-46063](https://github.com/apache/arrow/issues/46063) - [C++][Compute] Fix the issue that MinMax kernel emits -inf/inf for all-NaN input (#48459)
+* [GH-46584](https://github.com/apache/arrow/issues/46584) - [C++][FlightRPC] Iterate over endpoints in ODBC driver (#47991)
+* [GH-47000](https://github.com/apache/arrow/issues/47000) - [R] concat_tables on a record_batch causes segfault (#47885)
+* [GH-47022](https://github.com/apache/arrow/issues/47022) - [Python] Support unsigned dictionary indices in pandas conversion (#48451)
+* [GH-47099](https://github.com/apache/arrow/issues/47099) - [C++][Parquet] Add missing `pragma warning(pop)` to `parquet/platform.h` (#47114)
+* [GH-47371](https://github.com/apache/arrow/issues/47371), [GH-48281](https://github.com/apache/arrow/issues/48281) - [Python][CI] Fix Numba-CUDA interop (#48284)
+* [GH-47559](https://github.com/apache/arrow/issues/47559) - [Python] Fix missing argument in pyarrow fs (#47497)
+* [GH-47564](https://github.com/apache/arrow/issues/47564) - [C++] Update expected L2 CPU cache range to 32KiB-64MiB (#47563)
+* [GH-47664](https://github.com/apache/arrow/issues/47664) - [C++][Parquet] Add num_rows_ before each call to RowGroupWriter::Close in FileSerializer (#47665)
+* [GH-47734](https://github.com/apache/arrow/issues/47734) - [Python] Fix hypothesis timedelta bounds for duration/interval types (#48460)
+* [GH-47751](https://github.com/apache/arrow/issues/47751) - [CI] Fix check for job to ignore on reporting (#47755)
+* [GH-47778](https://github.com/apache/arrow/issues/47778) - [CI][Python] Remove ORC alias timezone for US/Pacific on test_orc.py::test_timezone_absent (#47956)
+* [GH-47781](https://github.com/apache/arrow/issues/47781) - [C++] Cleaned up type-limit warning in sink_node.cc (#47782)
+* [GH-47807](https://github.com/apache/arrow/issues/47807) - [C++][Compute] Fix the issue that null count is not updated when setting slice on an array span (#47808)
+* [GH-47812](https://github.com/apache/arrow/issues/47812) - [R][CI] Fix lint for new version of styler (#47813)
+* [GH-47821](https://github.com/apache/arrow/issues/47821) - [CI][Release][R] Fix test repository path in release (#47929)
+* [GH-47823](https://github.com/apache/arrow/issues/47823) - [Python] Use PyWeakref_GetRef instead of PyWeakref_GET_OBJECT (Python 3.15) (#48027)
+* [GH-47825](https://github.com/apache/arrow/issues/47825) - [C++] Fix the issue that bitmap ops overwrite the partial leading byte (#47912)
+* [GH-47830](https://github.com/apache/arrow/issues/47830) - [Release] Run RC verification source testing step in a subshell (#47831)
+* [GH-47836](https://github.com/apache/arrow/issues/47836) - [C++] Fix Meson configuration after bpacking changes (#47837)
+* [GH-47840](https://github.com/apache/arrow/issues/47840) - [CI][C++] Check whether the CSV module/thread sanitizer is enabled or not before building example (#47841)
+* [GH-47844](https://github.com/apache/arrow/issues/47844) - [CI] Fix unconditionally running extra workflows reporting when there are jobs failing (#47917)
+* [GH-47859](https://github.com/apache/arrow/issues/47859) - [C++] Fix creating union types without type_codes for fields.size() == 128 (#47815)
+* [GH-47861](https://github.com/apache/arrow/issues/47861) - [Python] Reduce memory usage when using to_pandas() with many extension array columns (#47860)
+* [GH-47883](https://github.com/apache/arrow/issues/47883) - [CI] Add openssl gem explicitly to fix certificate validation error on test (#47884)
+* [GH-47909](https://github.com/apache/arrow/issues/47909) - [C++] Fix MSVC ARM64 build (#47910)
+* [GH-47914](https://github.com/apache/arrow/issues/47914) - [C++] Fix system Apache ORC/Google logging usage detection (#47915)
+* [GH-47918](https://github.com/apache/arrow/issues/47918) - [Format] Clarify that empty compressed buffers can omit the length header (#48541)
+* [GH-47919](https://github.com/apache/arrow/issues/47919) - [C++] Update Meson config for C Data Interface changes (#47920)
+* [GH-47921](https://github.com/apache/arrow/issues/47921) - [C++] Implement substrait option in Meson (#48016)
+* [GH-47923](https://github.com/apache/arrow/issues/47923) - [CI] Use macos-15-intel instead of macos-13 for macOS x86 runner (#47690)
+* [GH-47924](https://github.com/apache/arrow/issues/47924) - [C++] Fix issues in CSV reader with invalid inputs (#47925)
+* [GH-47927](https://github.com/apache/arrow/issues/47927) - [Release] Fix APT repository metadata generation with new repository (#47928)
+* [GH-47932](https://github.com/apache/arrow/issues/47932) - [Release][Python] PyPI rejects our source distribution due to missing LICENSE.txt
+* [GH-47933](https://github.com/apache/arrow/issues/47933) - [Release][R] Don't upload *.sha512.{asc,sha512} (#47982)
+* [GH-47941](https://github.com/apache/arrow/issues/47941) - [R] Fix codegen.R error from dplyr pipe to base pipe change (#47985)
+* [GH-47942](https://github.com/apache/arrow/issues/47942) - [R] CRAN 22.0.0 R package release fails on Winbuilder due to "non-API call to R: 'Rf_lazy_duplicate'" (#47943)
+* [GH-47945](https://github.com/apache/arrow/issues/47945) - [C++] Add support for Boost 1.89.0 and require Boost 1.69 or later (#47947)
+* [GH-47948](https://github.com/apache/arrow/issues/47948) - [CI][Packaging][Deb] Add missing directory existence check (#47949)
+* [GH-47953](https://github.com/apache/arrow/issues/47953) - [C++] Remove Windows inclusion from `int_util_overflow.h` (#47950)
+* [GH-47955](https://github.com/apache/arrow/issues/47955) - [C++][Parquet] Support reading INT-encoded Decimal stats as Arrow scalar (#48001)
+* [GH-47961](https://github.com/apache/arrow/issues/47961) - [C++] Fix Meson's Boost process version detection (#48017)
+* [GH-47964](https://github.com/apache/arrow/issues/47964) - [Docs] Add dcleblanc/SafeInt to the LICENSE.txt file (#47965)
+* [GH-47966](https://github.com/apache/arrow/issues/47966) - [Python] PyArrow v22.0 assumes Pandas DataFrame attrs are serializable (#47977)
+* [GH-47967](https://github.com/apache/arrow/issues/47967) - [C++] Update Meson Configuration with SafeInt Changes (#47968)
+* [GH-47970](https://github.com/apache/arrow/issues/47970) - [CI][C++] Fix a bug that JNI jobs run nothing (#47972)
+* [GH-47973](https://github.com/apache/arrow/issues/47973) - [C++][Parquet] Fix invalid Parquet files written when dictionary encoded pages are large (#47998)
+* [GH-47981](https://github.com/apache/arrow/issues/47981) - [C++][Parquet] Add compatibility with non-compliant RLE stream (#47992)
+* [GH-47983](https://github.com/apache/arrow/issues/47983) - [CI][R] R nightly upload workflow failing for a few weeks (#47984)
+* [GH-48004](https://github.com/apache/arrow/issues/48004) - [C++][Parquet] Fix hang in ColumnReader benchmark (#48005)
+* [GH-48010](https://github.com/apache/arrow/issues/48010) - [C++] Update bundled RE2 from 2022-06-01 to 2023-03-01 (#48011)
+* [GH-48029](https://github.com/apache/arrow/issues/48029) - [R][CI] R nightly upload workflow failing in pruning step (#48030)
+* [GH-48044](https://github.com/apache/arrow/issues/48044) - [Packaging][RPM][Parquet] Don't install `parquet-glib.pc` by `parquet-devel` (#48045)
+* [GH-48046](https://github.com/apache/arrow/issues/48046) - [Docs][C++] Clarify "Exporting Tracing Information" section in OTel docs (#48047)
+* [GH-48057](https://github.com/apache/arrow/issues/48057) - [R] Slow reading performance caused by apply_arrow_r_metadata() looping through all columns, including NULL ones (#48104)
+* [GH-48062](https://github.com/apache/arrow/issues/48062) - [C++] Fix null pointer dereference in MakeExecBatch (#48063)
+* [GH-48064](https://github.com/apache/arrow/issues/48064) - [C++] Set ARROW_BUILD_STATIC=ON when features-flight are enabled on CMake presets (#48065)
+* [GH-48076](https://github.com/apache/arrow/issues/48076) - [C++][Flight] Fix GeneratorStream for Tables (#48082)
+* [GH-48079](https://github.com/apache/arrow/issues/48079) - [CI] Fix a typo in util_free_space.sh (#48088)
+* [GH-48095](https://github.com/apache/arrow/issues/48095) - [Python][Docs] Add missing {pyarrow,compute} functions to API docs (#48117)
+* [GH-48098](https://github.com/apache/arrow/issues/48098) - [R] Fix nightly libarrow binary uploads (#48100)
+* [GH-48107](https://github.com/apache/arrow/issues/48107) - [CI] Update testing submodule (#48114)
+* [GH-48115](https://github.com/apache/arrow/issues/48115) - [C++] Better align Meson configuration and config.h (#48116)
+* [GH-48125](https://github.com/apache/arrow/issues/48125) - [C++] Remove gnu11 standard from the Meson configuration (#48126)
+* [GH-48127](https://github.com/apache/arrow/issues/48127) - [R] stringr argument deprecation - add binding for stringr::str_ilike() and remove ignore_case argument for stringr::str_like() (#48262)
+* [GH-48129](https://github.com/apache/arrow/issues/48129) - [CI] Stale issues bot only looks at 30 issues at a time (#48130)
+* [GH-48134](https://github.com/apache/arrow/issues/48134) - [C++] Make StructArray::field() thread-safe (#48128)
+* [GH-48142](https://github.com/apache/arrow/issues/48142) - [CI] Disallow scheduled GitHub Actions run on forked repos (#48143)
+* [GH-48146](https://github.com/apache/arrow/issues/48146) - [C++][Parquet] Fix undefined behavior with invalid column/offset index (#48147)
+* [GH-48162](https://github.com/apache/arrow/issues/48162) - [CI] Stale issues bot hit secondary rate limit and did not complete (#48165)
+* [GH-48168](https://github.com/apache/arrow/issues/48168) - [C++][Parquet] Fix setting column-specific options when writing an encrypted Dataset (#48170)
+* [GH-48234](https://github.com/apache/arrow/issues/48234) - [C++][Parquet] Fix overly strict check for BIT_PACKED levels byte size (#48235)
+* [GH-48238](https://github.com/apache/arrow/issues/48238) - [C++] Actually write IPC schema endianness, not host endianness (#48239)
+* [GH-48246](https://github.com/apache/arrow/issues/48246) - [C++][Parquet] Fix pre-1970 INT96 timestamps roundtrip (#48247)
+* [GH-48263](https://github.com/apache/arrow/issues/48263) - [CI] Stale issues workflow doesn't go through enough issues (#48264)
+* [GH-48268](https://github.com/apache/arrow/issues/48268) - [C++][Acero] Enhance the type checking for hash join residual filter (#48272)
+* [GH-48280](https://github.com/apache/arrow/issues/48280) - [CI] PYTHON_PATCH_VERSION docker warnings (#48282)
+* [GH-48283](https://github.com/apache/arrow/issues/48283) - [R][CI] Failures on R Lint on main (#48286)
+* [GH-48308](https://github.com/apache/arrow/issues/48308) - [C++][Parquet] Fix potential crash when reading invalid Parquet data (#48309)
+* [GH-48314](https://github.com/apache/arrow/issues/48314) - [Python] Compat with pandas 3.0 changed default datetime unit (#48319)
+* [GH-48340](https://github.com/apache/arrow/issues/48340) - [R] Respect `MAKEFLAGS` (#48341)
+* [GH-48376](https://github.com/apache/arrow/issues/48376) - [C++] Update GoogleTest from 1.16.0 to 1.17.0 (#48377)
+* [GH-48416](https://github.com/apache/arrow/issues/48416) - [Packaging][CI] Use custom orc_for_bundling when using FetchContent to avoid ar issues with + symbol on path (#48430)
+* [GH-48417](https://github.com/apache/arrow/issues/48417) - [Packaging][CI] Skip downgrade testing for Debian testing (#48427)
+* [GH-48432](https://github.com/apache/arrow/issues/48432) - [CI][Ruby] Don't run Red Arrow Format tests with Ruby 3.1 (#48434)
+* [GH-48478](https://github.com/apache/arrow/issues/48478) - [Ruby] Fix Ruby list inference for nested non-negative integer arrays (#48584)
+* [GH-48481](https://github.com/apache/arrow/issues/48481) - [Ruby] Correctly infer types for nested integer arrays (#48699)
+* [GH-48540](https://github.com/apache/arrow/issues/48540) - [Python][C++][CI] test\_s3\_options crash on macOS
+* [GH-48566](https://github.com/apache/arrow/issues/48566) - [C++][CI] Fix compilation on Valgrind job (#48567)
+* [GH-48570](https://github.com/apache/arrow/issues/48570) - [C++] Add Missing Fuzz Sources to Meson configuration (#48571)
+* [GH-48608](https://github.com/apache/arrow/issues/48608) - [Python] Fix interpolation of actual values in Message.__repr__ f-string (#48656)
+* [GH-48610](https://github.com/apache/arrow/issues/48610) - [Ruby] Add FixedSizeListArray glue (#48609)
+* [GH-48625](https://github.com/apache/arrow/issues/48625) - [Python] Add temporal unit checking in NumPyDtypeUnifier (#48626)
+* [GH-48641](https://github.com/apache/arrow/issues/48641) - [CI] Multiple nightly R builds failing due to sccache errors
+* [GH-48725](https://github.com/apache/arrow/issues/48725) - [C++] Fix missing bundled Protobuf in libarrow_bundled_dependencies (#48726)
+* [GH-48735](https://github.com/apache/arrow/issues/48735) - [CI][Python] Fix macOS wheel builds by forcing setuptools upgrade in venv (#48739)
+* [GH-48736](https://github.com/apache/arrow/issues/48736) - [CI][Python] Restore AlmaLinux 8 support of `dev/release/setup-rhel-rebuilds.sh` for wheel verification (#48748)
+* [GH-48741](https://github.com/apache/arrow/issues/48741) - [C++] Fix deadlock in CSV AsyncThreadedTableReader destructor (#48742)
+* [GH-48750](https://github.com/apache/arrow/issues/48750) - [CI][Documentation] Disable Unity build for OpenTelemetry (#48751)
+* [GH-48776](https://github.com/apache/arrow/issues/48776) - [CI][Ruby][Windows] Ensure removing temporary files (#48777)
+* [GH-48780](https://github.com/apache/arrow/issues/48780) - [CI] Add missing permissions for reusable workflow calls (#48778)
+* [GH-48782](https://github.com/apache/arrow/issues/48782) - [Docs][CI] Skip Markdown files with doxygen and trigger Docs job on PR when files are modified (#48786)
+* [GH-48784](https://github.com/apache/arrow/issues/48784) - [GLib] Make (system) Parquet C++ optional (#48785)
+* [GH-48787](https://github.com/apache/arrow/issues/48787) - [C++] Disable `-Werror` for s2n-tls (#48791)
+* [GH-48806](https://github.com/apache/arrow/issues/48806) - [CI][Packaging] ubuntu-noble-arm64 has been failing for several days due to network failure (403 Forbidden [IP: 91.189.92.19 80])
+* [GH-48807](https://github.com/apache/arrow/issues/48807) - [CI] Clean up space on GitHub runner to fix manylinux wheel failure (#48790)
+* [GH-48809](https://github.com/apache/arrow/issues/48809) - [CI] Fix homebrew-cpp with Mac by using formula-based dependency resolution (#48824)
+* [GH-48811](https://github.com/apache/arrow/issues/48811) - [C++][FlightRPC] ODBC: Add missing `arrow::` to fix build (#48810)
+* [GH-48827](https://github.com/apache/arrow/issues/48827) - [CI][Python] Add required xz dependency to emscripten dockerfile (#48828)
+* [GH-48838](https://github.com/apache/arrow/issues/48838) - [Release] Use gh cli to download sources for Linux packages and publish draft release before verification (#48839)
+* [GH-48841](https://github.com/apache/arrow/issues/48841) - [Release][Package] Add GH_TOKEN to rake build step on Linux Packaging jobs (#48842)
+
+
+## New Features and Improvements
+
+* [GH-23970](https://github.com/apache/arrow/issues/23970) - [GLib] Add support for duration (#48564)
+* [GH-24157](https://github.com/apache/arrow/issues/24157) - [C++] Add tests for DayTimeIntervalBuilder (#48709)
+* [GH-31869](https://github.com/apache/arrow/issues/31869) - [Python][Parquet] Implement external key material features in Python (#48009)
+* [GH-40735](https://github.com/apache/arrow/issues/40735) - [Packaging][CentOS] Drop support for CentOS 7 (#48550)
+* [GH-41364](https://github.com/apache/arrow/issues/41364) - [GLib][Ruby] Allow passing thread pool to ExecutePlan (#48462)
+* [GH-44810](https://github.com/apache/arrow/issues/44810) - [C++][Parquet] Add arrow::Result version of parquet::arrow::FileReader::Make() (#48285)
+* [GH-45449](https://github.com/apache/arrow/issues/45449) - [R][CI] Remove OpenSSL 1.x builds (#48297)
+* [GH-45484](https://github.com/apache/arrow/issues/45484) - [C++] Drop support for the gold linker (#47780)
+* [GH-45885](https://github.com/apache/arrow/issues/45885) - [C++] Require C++20 (#48414)
+* [GH-46004](https://github.com/apache/arrow/issues/46004) - [C++][FlightRPC] Enable ODBC Build In C++ Workflows (#47689)
+* [GH-46096](https://github.com/apache/arrow/issues/46096) - [C++][FlightRPC] Environment and Connection Handle Allocation (#47759)
+* [GH-46098](https://github.com/apache/arrow/issues/46098) - [C++][FlightRPC] ODBC Environment Attribute Implementation (#47760)
+* [GH-46147](https://github.com/apache/arrow/issues/46147) - [C++] Implement GCS support in Meson (#47568)
+* [GH-46411](https://github.com/apache/arrow/issues/46411) - [C++] Implement dataset option in Meson (#47669)
+* 
[GH-46465](https://github.com/apache/arrow/issues/46465) - [C++][FlightRPC] Refactor ODBC namespaces and file structure (#47703) +* [GH-46574](https://github.com/apache/arrow/issues/46574) - [C++][FlightRPC] ODBC Driver Connectivity support (#47971) +* [GH-46575](https://github.com/apache/arrow/issues/46575) - [C++][FlightRPC] Add Diagnostic tests (#47764) +* [GH-46575](https://github.com/apache/arrow/issues/46575) - [C++][FlightRPC] ODBC Diagnostics Report (#47763) +* [GH-46592](https://github.com/apache/arrow/issues/46592) - [CI][Dev][R] Add Air to pre-commit (#47423) +* [GH-46825](https://github.com/apache/arrow/issues/46825) - [R] Use smallest_decimal() from C++ instead of working out which decimal type to instantiate in R (#47906) +* [GH-46903](https://github.com/apache/arrow/issues/46903) - [CI] Automatically flag stale issues (#46904) +* [GH-47030](https://github.com/apache/arrow/issues/47030) - [C++][Parquet] Add setting to limit the number of rows written per page (#47090) +* [GH-47103](https://github.com/apache/arrow/issues/47103) - [Statistics][C++] Implement Statistics specification attribute ARROW:null_count:approximate (#47969) +* [GH-47105](https://github.com/apache/arrow/issues/47105) - [Statistics][C++] Implement Statistics specification attribute ARROW:row_count:approximate (#48266) +* [GH-47196](https://github.com/apache/arrow/issues/47196) - [CI][C++] Add Windows ARM64 build (#47811) +* [GH-47437](https://github.com/apache/arrow/issues/47437) - [CI][Python] Update win wheels and free-threaded build for Python 3.14 +* [GH-47441](https://github.com/apache/arrow/issues/47441) - [Python][Parquet] Allow passing write_time_adjusted_to_utc to Python's ParquetWriter (#47745) +* [GH-47572](https://github.com/apache/arrow/issues/47572) - [C++][Parquet] Uniform unpack interface (#47573) +* [GH-47635](https://github.com/apache/arrow/issues/47635) - [CI][Integration] Add new gold files (#47729) +* [GH-47640](https://github.com/apache/arrow/issues/47640) - [CI] Remove needless ci/docker/ubuntu-22.04-csharp.dockerfile (#48298) +* [GH-47643](https://github.com/apache/arrow/issues/47643) - [Python][Packaging] Enable CMAKE_INTERPROCEDURAL_OPTIMIZATION for wheels (#47733) +* [GH-47677](https://github.com/apache/arrow/issues/47677) - [C++][GPU] Allow building with CUDA 13 (#48259) +* [GH-47697](https://github.com/apache/arrow/issues/47697) - [C++][FlightRPC] Add ODBC API placeholders (#47725) +* [GH-47706](https://github.com/apache/arrow/issues/47706) - [C++][FlightRPC] ODBC SQLFreeStmt implementation (#48033) +* [GH-47707](https://github.com/apache/arrow/issues/47707) - [C++][FlightRPC] Add tests for descriptor handle allocation (#48053) +* [GH-47708](https://github.com/apache/arrow/issues/47708) - [C++][FlightRPC] Connection Attribute Support for ODBC (#47772) +* [GH-47710](https://github.com/apache/arrow/issues/47710) - [C++][FlightRPC] Statement attribute Support in ODBC (#47773) +* [GH-47711](https://github.com/apache/arrow/issues/47711) - [C++][FlightRPC] Enable ODBC query execution (#48032) +* [GH-47713](https://github.com/apache/arrow/issues/47713) - [C++][FlightRPC] ODBC SQLMoreResults implementation (#48035) +* [GH-47713](https://github.com/apache/arrow/issues/47713) - [C++][FlightRPC] ODBC return number of result columns (#48036) +* [GH-47713](https://github.com/apache/arrow/issues/47713) - [C++][FlightRPC] ODBC return number of affected rows (#48037) +* [GH-47713](https://github.com/apache/arrow/issues/47713) - [C++][FlightRPC] ODBC Basic Data Retrieval (#48034) +* 
[GH-47714](https://github.com/apache/arrow/issues/47714) - [C++][FlightRPC] ODBC extended fetch (#48040)
+* [GH-47715](https://github.com/apache/arrow/issues/47715) - [C++][FlightRPC] ODBC scroll fetch implementation (#48041)
+* [GH-47716](https://github.com/apache/arrow/issues/47716) - [C++][FlightRPC] ODBC bind column implementation (#48042)
+* [GH-47717](https://github.com/apache/arrow/issues/47717) - [C++][FlightRPC] ODBC close cursor (#48043)
+* [GH-47719](https://github.com/apache/arrow/issues/47719) - [C++][FlightRPC] Extract SQLTables Implementation (#48021)
+* [GH-47720](https://github.com/apache/arrow/issues/47720) - [C++][FlightRPC] ODBC Columns Metadata (#48049)
+* [GH-47721](https://github.com/apache/arrow/issues/47721) - [C++][FlightRPC] Followup to remove unnecessary std::move to resolve compilation flakiness (#48687)
+* [GH-47721](https://github.com/apache/arrow/issues/47721) - [C++][FlightRPC] Return ODBC Column Attribute from result set (#48050)
+* [GH-47722](https://github.com/apache/arrow/issues/47722) - [C++][FlightRPC] ODBC Data Type Information (#48051)
+* [GH-47723](https://github.com/apache/arrow/issues/47723) - [C++][FlightRPC] ODBC SQLNativeSQL implementation (#48020)
+* [GH-47724](https://github.com/apache/arrow/issues/47724) - [C++][FlightRPC] ODBC: implement SQLDescribeCol (#48052)
+* [GH-47726](https://github.com/apache/arrow/issues/47726) - [C++][FlightRPC] ODBC Unicode Support (#47771)
+* [GH-47728](https://github.com/apache/arrow/issues/47728) - [Python] Check the source argument in parquet.read_table (#48008)
+* [GH-47747](https://github.com/apache/arrow/issues/47747) - [C++] Bump Apache ORC to 2.2.1 (#47744)
+* [GH-47753](https://github.com/apache/arrow/issues/47753) - [C++][Parquet] Build Thrift with OpenSSL disabled (#47754)
+* [GH-47756](https://github.com/apache/arrow/issues/47756) - [C++][CI] Fuzz CSV reader (#47757)
+* [GH-47767](https://github.com/apache/arrow/issues/47767) - [CI] Add date to extra CI report email subject (#47777)
+* [GH-47784](https://github.com/apache/arrow/issues/47784) - [C++] Patch vendored pcg library to enable msvc arm64 intrinsics (#47779)
+* [GH-47786](https://github.com/apache/arrow/issues/47786) - [C++][FlightRPC] Establish ODBC tests (#47788)
+* [GH-47787](https://github.com/apache/arrow/issues/47787) - [C++][FlightRPC] ODBC `msi` Windows installer (#48054)
+* [GH-47789](https://github.com/apache/arrow/issues/47789) - [C++][FlightRPC] SQLGetFunctions Tests (#48031)
+* [GH-47797](https://github.com/apache/arrow/issues/47797) - [CI][Python] Update Python installs for free-threaded wheel tasks (#47993)
+* [GH-47800](https://github.com/apache/arrow/issues/47800) - [C++][CI] Fuzz more CSV reader types (#48398)
+* [GH-47806](https://github.com/apache/arrow/issues/47806) - [CI] Rename deprecated docker-compose.yml to preferred compose.yaml file (#47954)
+* [GH-47833](https://github.com/apache/arrow/issues/47833) - [C++] Add utf8proc option to Meson configuration (#47834)
+* [GH-47881](https://github.com/apache/arrow/issues/47881) - [C++] Update fast_float version to 8.1.0 (#47882)
+* [GH-47887](https://github.com/apache/arrow/issues/47887) - [C++][Integration] Enable extension types with C Data Interface tests (#47888)
+* [GH-47891](https://github.com/apache/arrow/issues/47891) - [C++][Parquet] Generate a separate fuzz seed file for each column (#47892)
+* [GH-47895](https://github.com/apache/arrow/issues/47895) - [C++][Parquet] Add prolog and epilog in unpack (#47896)
+* [GH-47905](https://github.com/apache/arrow/issues/47905) - [C++][Parquet] MakeColumnStats should use user-provided memory pool (#47894)
+* [GH-47926](https://github.com/apache/arrow/issues/47926) - [C++] Adopt alternative safe arithmetic library (#47958)
+* [GH-47936](https://github.com/apache/arrow/issues/47936) - [R] docgen.R requires installed package instead of current working code (#47940)
+* [GH-47939](https://github.com/apache/arrow/issues/47939) - [R] Update CRAN packaging checklist to update checksums and have make build call make clean (#47944)
+* [GH-47974](https://github.com/apache/arrow/issues/47974) - [Docs] Remove stray documentation from Java and JS (#48006)
+* [GH-47975](https://github.com/apache/arrow/issues/47975) - [Docs][Python] Remove experimental warning on PyCapsule documentation (#47976)
+* [GH-47978](https://github.com/apache/arrow/issues/47978) - [C++][Parquet][CI] Add more compression codecs to fuzzing seed corpus (#47979)
+* [GH-48000](https://github.com/apache/arrow/issues/48000) - [CI][Release] Publish RC GitHub Release as draft to allow immutable releases (#48059)
+* [GH-48013](https://github.com/apache/arrow/issues/48013) - [R] Add CI job for musl (Alpine Linux) to replicate CRAN checks (#48014)
+* [GH-48025](https://github.com/apache/arrow/issues/48025) - [C++][GLib] Replace instances where build path is being added to built artifacts (#48026)
+* [GH-48055](https://github.com/apache/arrow/issues/48055) - [C++][FlightRPC] Allow spaces while parsing Table Type in ODBC (#48056)
+* [GH-48074](https://github.com/apache/arrow/issues/48074) - [C++] Use FetchContent for bundled Abseil (#48075)
+* [GH-48084](https://github.com/apache/arrow/issues/48084) - [C++][FlightRPC] Replace boost::optional with std::optional (#48323)
+* [GH-48089](https://github.com/apache/arrow/issues/48089) - [C++][Parquet] Read statistics and other metadata when fuzzing (#48090)
+* [GH-48091](https://github.com/apache/arrow/issues/48091) - [C++] Use FetchContent for bundled c-ares (#48092)
+* [GH-48096](https://github.com/apache/arrow/issues/48096) - [Python][Parquet] Expose new WriterProperties::max_rows_per_page to Python bindings (#48101)
+* [GH-48102](https://github.com/apache/arrow/issues/48102) - [Python] Remove deprecated Array.format method (#48324)
+* [GH-48105](https://github.com/apache/arrow/issues/48105) - [C++][Parquet][IPC] Cap allocated memory when fuzzing (#48108)
+* [GH-48112](https://github.com/apache/arrow/issues/48112) - [C++][Parquet] Use more accurate data length estimate when decoding PLAIN BYTE_ARRAY data (#48113)
+* [GH-48123](https://github.com/apache/arrow/issues/48123) - [C++][Float16] Reimplement arrow::WithinUlp and Enable it for float16 (#48224)
+* [GH-48139](https://github.com/apache/arrow/issues/48139) - [C++] Allow compilation for QNX versions up to 8 (#48140)
+* [GH-48152](https://github.com/apache/arrow/issues/48152) - [CI][MATLAB] Bump MATLAB release to R2025b in the MATLAB GitHub Actions Workflow (#48153)
+* [GH-48154](https://github.com/apache/arrow/issues/48154) - [MATLAB][Packaging] Update MATLAB crossbow workflow to build against MATLAB `R2025b` (#48155)
+* [GH-48163](https://github.com/apache/arrow/issues/48163) - [CI][Docs] Update preview docs task S3 secret to use (#48164)
+* [GH-48167](https://github.com/apache/arrow/issues/48167) - [Python][C++][Compute] Add python bindings for scatter, inverse_permutation (#48267)
+* [GH-48174](https://github.com/apache/arrow/issues/48174) - [CI][Dev] Fix shellcheck errors in ci/scripts/util_download_apache.sh 
(#48175) +* [GH-48176](https://github.com/apache/arrow/issues/48176) - [C++][Parquet] Fix arrow-ipc-message-internal-test failure (#48166) +* [GH-48178](https://github.com/apache/arrow/issues/48178) - [C++] Use FetchContent for bundled RE2 (#48179) +* [GH-48181](https://github.com/apache/arrow/issues/48181) - [C++] Use FetchContent for bundled Protobuf (#48183) +* [GH-48186](https://github.com/apache/arrow/issues/48186) - [CI][Dev] Remove ci/scripts/util_wait_for_it.sh (#48189) +* [GH-48218](https://github.com/apache/arrow/issues/48218) - [C++][Parquet] Fix Util & Level Conversion logic on big-endian (#48219) +* [GH-48245](https://github.com/apache/arrow/issues/48245) - [C++][Parquet] Simplify GetVlqInt (#48237) +* [GH-48248](https://github.com/apache/arrow/issues/48248) - [C++] Use FetchContent for bundled gRPC (#48250) +* [GH-48251](https://github.com/apache/arrow/issues/48251) - [C++][CI] Add CSV fuzzing seed corpus generator (#48252) +* [GH-48256](https://github.com/apache/arrow/issues/48256) - [Packaging][Linux] Use `closer.lua?action=download` URL (#48257) +* [GH-48260](https://github.com/apache/arrow/issues/48260) - [C++][Python][R] Move S3 bucket references to new bucket as Voltron Data ones will be removed soon (#48261) +* [GH-48275](https://github.com/apache/arrow/issues/48275) - [C++][Dev] Allow choosing verbosity when fuzzing (#48276) +* [GH-48287](https://github.com/apache/arrow/issues/48287) - [Ruby] Add minimum pure Ruby Apache Arrow reader implementation (#48288) +* [GH-48292](https://github.com/apache/arrow/issues/48292) - [Ruby] Add `Arrow::Column#to_arrow{,_array,_chunked_array}` (#48293) +* [GH-48295](https://github.com/apache/arrow/issues/48295) - [Ruby] Add support for reading Int8 array (#48296) +* [GH-48303](https://github.com/apache/arrow/issues/48303) - [CI] Remove needless `setup-dotnet` from `.github/workflows/dev.yml` (#48304) +* [GH-48306](https://github.com/apache/arrow/issues/48306) - [Ruby] Add support for reading binary array (#48307) +* [GH-48312](https://github.com/apache/arrow/issues/48312) - [C++][FlightRPC] Standalone ODBC MSVC CI (#48313) +* [GH-48315](https://github.com/apache/arrow/issues/48315) - [C++] Use FetchContent for bundled CRC32C (#48318) +* [GH-48316](https://github.com/apache/arrow/issues/48316) - [C++] Use FetchContent for bundled nlohmann-json (#48320) +* [GH-48317](https://github.com/apache/arrow/issues/48317) - [C++] Use FetchContent for bundled google-cloud-cpp (#48333) +* [GH-48326](https://github.com/apache/arrow/issues/48326) - [CI] Stop specifying hash for `actions/*` GitHub Actions (#48327) +* [GH-48328](https://github.com/apache/arrow/issues/48328) - [Ruby] Add support for reading UTF-8 array (#48329) +* [GH-48330](https://github.com/apache/arrow/issues/48330) - [Ruby] Add support for reading null array (#48331) +* [GH-48335](https://github.com/apache/arrow/issues/48335) - [C++][Parquet] Fuzz encrypted files (#48336) +* [GH-48337](https://github.com/apache/arrow/issues/48337) - [C++][Parquet] Improve column encryption API (#48338) +* [GH-48339](https://github.com/apache/arrow/issues/48339) - [C++] Enhance functions in util/ubsan.h to support types without a default constructor (#48429) +* [GH-48342](https://github.com/apache/arrow/issues/48342) - [R] Turn off gcs by default, also if it is on, bundle. 
(#48343) +* [GH-48346](https://github.com/apache/arrow/issues/48346) - [Ruby] Add support for reading boolean array (#48348) +* [GH-48347](https://github.com/apache/arrow/issues/48347) - [Ruby] Add support for reading list array (#48351) +* [GH-48355](https://github.com/apache/arrow/issues/48355) - [Python] Remove obsolete snprintf workaround for Python 3.9 (#48354) +* [GH-48358](https://github.com/apache/arrow/issues/48358) - [Ruby] Add support for reading float32 array (#48359) +* [GH-48360](https://github.com/apache/arrow/issues/48360) - [Ruby] Add support for reading large binary array (#48361) +* [GH-48362](https://github.com/apache/arrow/issues/48362) - [GLib][Ruby] Add FixedSizeListArray (#48369) +* [GH-48363](https://github.com/apache/arrow/issues/48363) - [GLib][Ruby] Add AssumeTimezoneOptions (#48370) +* [GH-48364](https://github.com/apache/arrow/issues/48364) - [GLib][Ruby] Add CumulativeOptions (#48371) +* [GH-48365](https://github.com/apache/arrow/issues/48365) - [GLib][Ruby] Add DayOfWeekOptions (#48372) +* [GH-48366](https://github.com/apache/arrow/issues/48366) - [GLib][Ruby] Add DictionaryEncodeOptions (#48373) +* [GH-48367](https://github.com/apache/arrow/issues/48367) - [GLib][Ruby] Add ElementWiseAggregateOptions (#48374) +* [GH-48368](https://github.com/apache/arrow/issues/48368) - [GLib][Ruby] Add ExtractRegexOptions (#48375) +* [GH-48380](https://github.com/apache/arrow/issues/48380) - [Ruby] Add support for reading float64 array (#48381) +* [GH-48382](https://github.com/apache/arrow/issues/48382) - [Ruby] Add support for reading struct array (#48383) +* [GH-48384](https://github.com/apache/arrow/issues/48384) - [C++][Docs][Parquet] Fix broken link for parquet-format spec (#48385) +* [GH-48386](https://github.com/apache/arrow/issues/48386) - [Ruby][Dev] Enable Layout/TrailingEmptyLines: final_newline cop (#48392) +* [GH-48388](https://github.com/apache/arrow/issues/48388) - [Ruby] Add support for reading map array (#48389) +* [GH-48395](https://github.com/apache/arrow/issues/48395) - [C++][Dev] Update fuzzing CMake preset (#48396) +* [GH-48400](https://github.com/apache/arrow/issues/48400) - [Python] Convert an old todo to a proper ticket in `test_copy_files_directory` (#48401) +* [GH-48402](https://github.com/apache/arrow/issues/48402) - [Python] Enable the relative path in test_write_dataset (#48403) +* [GH-48404](https://github.com/apache/arrow/issues/48404) - [Python] Add tests to to_table(filter=...) 
to reject a boolean expr (#48405) +* [GH-48406](https://github.com/apache/arrow/issues/48406) - [Python] Negative test for struct_field no-argument (ARROW-14853) (#48407) +* [GH-48410](https://github.com/apache/arrow/issues/48410) - [Ruby] Add support for reading large list array (#48411) +* [GH-48412](https://github.com/apache/arrow/issues/48412) - [Ruby] Add support for reading date32 array (#48413) +* [GH-48419](https://github.com/apache/arrow/issues/48419) - [Python] Fix test_parquet_file_too_small to catch only ArrowInvalid (#48420) +* [GH-48421](https://github.com/apache/arrow/issues/48421) - [Python] Enable test_orc_scan_options with batch_size (#48422) +* [GH-48423](https://github.com/apache/arrow/issues/48423) - [Ruby] Add support for reading date64 array (#48424) +* [GH-48425](https://github.com/apache/arrow/issues/48425) - [Ruby] Add support for reading dense union array (#48426) +* [GH-48435](https://github.com/apache/arrow/issues/48435) - [Ruby] Add support for reading sparse union array (#48439) +* [GH-48437](https://github.com/apache/arrow/issues/48437) - [Ruby] Add tests for large list array (#48438) +* [GH-48440](https://github.com/apache/arrow/issues/48440) - [Ruby] Add support for reading time32 array (#48441) +* [GH-48442](https://github.com/apache/arrow/issues/48442) - [Python] Remove workaround that excluded struct types from `chunked_arrays` (#48443) +* [GH-48444](https://github.com/apache/arrow/issues/48444) - [Python] Remove todo of implementing requested_schema in test_roundtrip_reader_capsule (#48445) +* [GH-48446](https://github.com/apache/arrow/issues/48446) - [Python] Remove todo of schema=name mismatch in `record_batches` (#48447) +* [GH-48452](https://github.com/apache/arrow/issues/48452) - [Python] Add tests for Date32 and Date64 array creation with masks (#48453) +* [GH-48461](https://github.com/apache/arrow/issues/48461) - [R][CI] Migrate Azure pipelines to GitHub actions (#48585) +* [GH-48463](https://github.com/apache/arrow/issues/48463) - [Python] Improve error message in CheckTypeExact arrow_to_pandas.cc (#48464) +* [GH-48471](https://github.com/apache/arrow/issues/48471) - [Ruby] Add support for reading Int16 and UInt16 arrays (#48472) +* [GH-48475](https://github.com/apache/arrow/issues/48475) - [Ruby] Add support for reading Int32 and UInt32 arrays (#48476) +* [GH-48479](https://github.com/apache/arrow/issues/48479) - [Ruby] Add support for reading Int64 and UInt64 arrays (#48480) +* [GH-48482](https://github.com/apache/arrow/issues/48482) - [GLib][Ruby] Add GArrowExtractRegexSpanOptions (#48483) +* [GH-48484](https://github.com/apache/arrow/issues/48484) - [GLib][Ruby] Add GArrowJoinOptions (#48485) +* [GH-48486](https://github.com/apache/arrow/issues/48486) - [GLib][Ruby] Add GArrowListFlattenOptions (#48487) +* [GH-48488](https://github.com/apache/arrow/issues/48488) - [GLib][Ruby] Add GArrowListSliceOptions (#48489) +* [GH-48490](https://github.com/apache/arrow/issues/48490) - [GLib][Ruby] Add GArrowMakeStructOptions (#48491) +* [GH-48492](https://github.com/apache/arrow/issues/48492) - [GLib][Ruby] Add MapLookupOptions (#48513) +* [GH-48493](https://github.com/apache/arrow/issues/48493) - [GLib][Ruby] Add ModeOptions (#48514) +* [GH-48494](https://github.com/apache/arrow/issues/48494) - [GLib][Ruby] Add NullOptions (#48515) +* [GH-48495](https://github.com/apache/arrow/issues/48495) - [GLib][Ruby] Add PadOptions (#48516) +* [GH-48496](https://github.com/apache/arrow/issues/48496) - [GLib][Ruby] Add PairwiseOptions (#48517) +* 
[GH-48497](https://github.com/apache/arrow/issues/48497) - [GLib][Ruby] Add PartitionNthOptions (#48518) +* [GH-48498](https://github.com/apache/arrow/issues/48498) - [GLib][Ruby] Add PivotWiderOptions (#48519) +* [GH-48499](https://github.com/apache/arrow/issues/48499) - [GLib][Ruby] Add RankQuantileOptions (#48520) +* [GH-48500](https://github.com/apache/arrow/issues/48500) - [GLib][Ruby] Add ReplaceSliceOptions (#48521) +* [GH-48501](https://github.com/apache/arrow/issues/48501) - [GLib][Ruby] Add ReplaceSubstringOptions (#48522) +* [GH-48502](https://github.com/apache/arrow/issues/48502) - [GLib][Ruby] Add RoundBinaryOptions (#48523) +* [GH-48503](https://github.com/apache/arrow/issues/48503) - [GLib][Ruby] Add RoundTemporalOptions (#48524) +* [GH-48504](https://github.com/apache/arrow/issues/48504) - [GLib][Ruby] Add SelectKOptions (#48525) +* [GH-48505](https://github.com/apache/arrow/issues/48505) - [GLib][Ruby] Add SkewOptions (#48526) +* [GH-48506](https://github.com/apache/arrow/issues/48506) - [GLib][Ruby] Add SliceOptions (#48527) +* [GH-48507](https://github.com/apache/arrow/issues/48507) - [GLib][Ruby] Add SplitOptions (#48528) +* [GH-48508](https://github.com/apache/arrow/issues/48508) - [GLib][Ruby] Add TDigestOptions (#48529) +* [GH-48509](https://github.com/apache/arrow/issues/48509) - [GLib][Ruby] Add TrimOptions (#48530) +* [GH-48510](https://github.com/apache/arrow/issues/48510) - [GLib][Ruby] Add WeekOptions (#48531) +* [GH-48511](https://github.com/apache/arrow/issues/48511) - [GLib][Ruby] Add WinsorizeOptions (#48532) +* [GH-48512](https://github.com/apache/arrow/issues/48512) - [GLib][Ruby] Add ZeroFillOptions (#48533) +* [GH-48535](https://github.com/apache/arrow/issues/48535) - [Ruby] Add support for reading time64 array (#48536) +* [GH-48537](https://github.com/apache/arrow/issues/48537) - [Ruby] Add support for reading fixed size binary array (#48538) +* [GH-48545](https://github.com/apache/arrow/issues/48545) - [C++][Parquet][CI] Add more encodings to fuzzing seed corpus (#48546) +* [GH-48551](https://github.com/apache/arrow/issues/48551) - [Ruby] Add support for reading large UTF-8 array (#48552) +* [GH-48553](https://github.com/apache/arrow/issues/48553) - [Ruby] Add support for reading timestamp array (#48554) +* [GH-48555](https://github.com/apache/arrow/issues/48555) - [C++] Use FetchContent for bundled opentelemetry (#48556) +* [GH-48557](https://github.com/apache/arrow/issues/48557) - [C++][Parquet][CI] Also encrypt nested columns in fuzz seed corpus (#48558) +* [GH-48572](https://github.com/apache/arrow/issues/48572) - [CI] Remove centos-7-cpp dockerfile and reference from compose (#48573) +* [GH-48579](https://github.com/apache/arrow/issues/48579) - [Ruby] Add support for reading duration array (#48580) +* [GH-48582](https://github.com/apache/arrow/issues/48582) - [CI][GPU][C++][Python] Add new CUDA jobs using the new self-hosted runners (#48583) +* [GH-48592](https://github.com/apache/arrow/issues/48592) - [C++] Use starts_with/ends_with methods (#48614) +* [GH-48602](https://github.com/apache/arrow/issues/48602) - [Ruby] Add support for reading interval arrays (#48603) +* [GH-48606](https://github.com/apache/arrow/issues/48606) - [CI][GLib] Increase NuGet timeout for vcpkg cache (#48638) +* [GH-48612](https://github.com/apache/arrow/issues/48612) - [Ruby] Add support for reading streaming format (#48613) +* [GH-48616](https://github.com/apache/arrow/issues/48616) - [GLib] Use `Arrow-${MAJOR}.${MINOR}.typelib` not `Arrow-1.0.typelib` (#48617) +* 
[GH-48631](https://github.com/apache/arrow/issues/48631) - [R] Non-API calls: 'ATTRIB', 'SET_ATTRIB' (#48634)
+* [GH-48632](https://github.com/apache/arrow/issues/48632) - [R] Add NEWS.md entry for 22.0.0.1 (#48633)
+* [GH-48642](https://github.com/apache/arrow/issues/48642) - [Ruby] Add support for reading decimal128 array (#48643)
+* [GH-48654](https://github.com/apache/arrow/issues/48654) - [Python] Test timestamp from int without pandas dependency (#48655)
+* [GH-48667](https://github.com/apache/arrow/issues/48667) - [Python] Remove unused imports from `python/pyarrow/__init__.py` (#48640)
+* [GH-48668](https://github.com/apache/arrow/issues/48668) - [Python][Docs] Add python examples for compute functions `min/max/min_max` (#48648)
+* [GH-48675](https://github.com/apache/arrow/issues/48675) - [C++][FlightRPC] Document StatementAttributeId enum values in ODBC SPI (#48676)
+* [GH-48680](https://github.com/apache/arrow/issues/48680) - [GLib][Ruby] Add CSVWriter (#48681)
+* [GH-48684](https://github.com/apache/arrow/issues/48684) - [C++] Update MakeListArray to use ListArray::FromArrays instead of constructor (#48685)
+* [GH-48690](https://github.com/apache/arrow/issues/48690) - [R] Make "Can read Parquet files from a URL" less flaky (#48693)
+* [GH-48703](https://github.com/apache/arrow/issues/48703) - [Ruby] Add support for reading decimal256 array (#48704)
+* [GH-48705](https://github.com/apache/arrow/issues/48705) - [Ruby] Add support for reading dictionary array (#48706)
+* [GH-48707](https://github.com/apache/arrow/issues/48707) - [C++][FlightRPC] Use IRD precision/scale defaults with ARD override in SQLGetData (#48708)
+* [GH-48752](https://github.com/apache/arrow/issues/48752) - [Ruby] Skip ChunkedArray test on Windows due to flakiness (#48779)
+* [GH-48755](https://github.com/apache/arrow/issues/48755) - [MATLAB] Rename getArrayProxyIDs to getProxyIDs (#48756)
+* [GH-48757](https://github.com/apache/arrow/issues/48757) - [CI] Update arrow/.github/CODEOWNERS (#48758)
+* [GH-48770](https://github.com/apache/arrow/issues/48770) - [CI] Add missing permissions declaration to workflows (#48771)
+
+
+
 # Apache Arrow 6.0.1 (2021-11-18)
 
 ## Bug Fixes
diff --git a/NOTICE.txt b/NOTICE.txt
index 9b98364d2ab..8046f20a0b9 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -1,5 +1,5 @@
 Apache Arrow
-Copyright 2016-2024 The Apache Software Foundation
+Copyright 2016-2026 The Apache Software Foundation
 
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/). 
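Note: the hunks below bump the release version from `23.0.0-SNAPSHOT` to `23.0.1` in `c_glib/meson.build` and `c_glib/vcpkg.json`. A quick way to confirm that no stale pre-release version strings survive a bump like this is a repository-wide search — a minimal sketch, assuming a git checkout; the path filters are illustrative, not an exhaustive list of version-carrying files:

```sh
# List any files under these trees still carrying the pre-release version
# string; no output before the echo means the bump reached every file.
git grep -n '23.0.0-SNAPSHOT' -- c_glib cpp || echo 'no stale version strings'
```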
diff --git a/c_glib/meson.build b/c_glib/meson.build index fddd390063e..77b64cfbe54 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -32,7 +32,7 @@ project( # * 22.04: 0.61.2 # * 24.04: 1.3.2 meson_version: '>=0.61.2', - version: '23.0.0-SNAPSHOT', + version: '23.0.1', ) version = meson.project_version() diff --git a/c_glib/vcpkg.json b/c_glib/vcpkg.json index 67c9958df4b..c7276a6e681 100644 --- a/c_glib/vcpkg.json +++ b/c_glib/vcpkg.json @@ -1,6 +1,6 @@ { "name": "arrow-glib", - "version-string": "23.0.0-SNAPSHOT", + "version-string": "23.0.1", "$comment:dependencies": "We can enable gobject-introspection again once it's updated", "dependencies": [ "glib", diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index 18d58f7bb2d..fec8488f954 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -39,7 +39,7 @@ lz4-c make meson ninja -nodejs +nodejs>=16 orc<2.1.0 pkg-config python diff --git a/ci/docker/conda-python-emscripten.dockerfile b/ci/docker/conda-python-emscripten.dockerfile index 47ff550cd59..878f918710f 100644 --- a/ci/docker/conda-python-emscripten.dockerfile +++ b/ci/docker/conda-python-emscripten.dockerfile @@ -39,6 +39,11 @@ RUN python -m pip install --no-cache-dir selenium==${selenium_version} && \ RUN pyodide_dist_url="https://github.com/pyodide/pyodide/releases/download/${pyodide_version}/pyodide-${pyodide_version}.tar.bz2" && \ wget -q "${pyodide_dist_url}" -O- | tar -xj -C / +# install node 20 (needed for async call support) +# and pthread-stubs for build, and unzip needed for chrome build to work +# xz is needed by emsdk to extract node tarballs +RUN conda install nodejs=20 unzip pthread-stubs make xz -c conda-forge + # install correct version of emscripten for this pyodide COPY ci/scripts/install_emscripten.sh /arrow/ci/scripts/ RUN bash /arrow/ci/scripts/install_emscripten.sh ~ /pyodide @@ -46,10 +51,6 @@ RUN bash /arrow/ci/scripts/install_emscripten.sh ~ /pyodide # make sure zlib is cached in the EMSDK folder RUN source ~/emsdk/emsdk_env.sh && embuilder --pic build zlib -# install node 20 (needed for async call support) -# and pthread-stubs for build, and unzip needed for chrome build to work -RUN conda install nodejs=20 unzip pthread-stubs make -c conda-forge - # install chrome for testing browser based runner COPY ci/scripts/install_chromedriver.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_chromedriver.sh "${chrome_version}" diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile deleted file mode 100644 index 44c845bb17e..00000000000 --- a/ci/docker/debian-12-cpp.dockerfile +++ /dev/null @@ -1,149 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG arch=amd64 -FROM ${arch}/debian:12 -ARG arch - -ENV DEBIAN_FRONTEND noninteractive - -ARG llvm -RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - apt-transport-https \ - ca-certificates \ - gnupg \ - lsb-release \ - wget && \ - if [ ${llvm} -ge 17 ]; then \ - wget -O /usr/share/keyrings/llvm-snapshot.asc \ - https://apt.llvm.org/llvm-snapshot.gpg.key && \ - (echo "Types: deb"; \ - echo "URIs: https://apt.llvm.org/$(lsb_release --codename --short)/"; \ - echo "Suites: llvm-toolchain-$(lsb_release --codename --short)-${llvm}"; \ - echo "Components: main"; \ - echo "Signed-By: /usr/share/keyrings/llvm-snapshot.asc") | \ - tee /etc/apt/sources.list.d/llvm.sources; \ - fi && \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - autoconf \ - ccache \ - clang-${llvm} \ - cmake \ - curl \ - g++ \ - gcc \ - gdb \ - git \ - libbenchmark-dev \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libc-ares-dev \ - libcurl4-openssl-dev \ - libgflags-dev \ - libgmock-dev \ - libgoogle-glog-dev \ - libgrpc++-dev \ - libidn2-dev \ - libkrb5-dev \ - libldap-dev \ - liblz4-dev \ - libnghttp2-dev \ - libprotobuf-dev \ - libprotoc-dev \ - libpsl-dev \ - libre2-dev \ - librtmp-dev \ - libsnappy-dev \ - libsqlite3-dev \ - libssh-dev \ - libssh2-1-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libxml2-dev \ - libzstd-dev \ - llvm-${llvm}-dev \ - make \ - ninja-build \ - nlohmann-json3-dev \ - npm \ - patch \ - pkg-config \ - protobuf-compiler-grpc \ - python3-dev \ - python3-pip \ - python3-venv \ - rapidjson-dev \ - rsync \ - tzdata \ - zlib1g-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh latest /usr/local - -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - -COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_azurite.sh - -COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin - -# Prioritize system packages and local installation. 
-# -# The following dependencies will be downloaded due to missing/invalid packages -# provided by the distribution: -# - opentelemetry-cpp-dev is not packaged -ENV ARROW_ACERO=ON \ - ARROW_AZURE=ON \ - ARROW_BUILD_TESTS=ON \ - ARROW_DATASET=ON \ - ARROW_DEPENDENCY_SOURCE=SYSTEM \ - ARROW_DATASET=ON \ - ARROW_FLIGHT=ON \ - ARROW_FLIGHT_SQL=ON \ - ARROW_GANDIVA=ON \ - ARROW_GCS=ON \ - ARROW_HOME=/usr/local \ - ARROW_JEMALLOC=ON \ - ARROW_ORC=ON \ - ARROW_PARQUET=ON \ - ARROW_S3=ON \ - ARROW_SUBSTRAIT=ON \ - ARROW_USE_CCACHE=ON \ - ARROW_WITH_BROTLI=ON \ - ARROW_WITH_BZ2=ON \ - ARROW_WITH_LZ4=ON \ - ARROW_WITH_OPENTELEMETRY=ON \ - ARROW_WITH_SNAPPY=ON \ - ARROW_WITH_ZLIB=ON \ - ARROW_WITH_ZSTD=ON \ - AWSSDK_SOURCE=BUNDLED \ - Azure_SOURCE=BUNDLED \ - google_cloud_cpp_storage_SOURCE=BUNDLED \ - opentelemetry_cpp_SOURCE=BUNDLED \ - ORC_SOURCE=BUNDLED \ - PATH=/usr/lib/ccache/:$PATH \ - PYTHON=python3 \ - xsimd_SOURCE=BUNDLED diff --git a/ci/docker/debian-13-cpp.dockerfile b/ci/docker/debian-13-cpp.dockerfile index ca96b4177ff..1ea153f6872 100644 --- a/ci/docker/debian-13-cpp.dockerfile +++ b/ci/docker/debian-13-cpp.dockerfile @@ -55,26 +55,18 @@ RUN apt-get update -y -q && \ libboost-system-dev \ libbrotli-dev \ libbz2-dev \ - libc-ares-dev \ libcurl4-openssl-dev \ libgflags-dev \ libgmock-dev \ libgoogle-glog-dev \ libgrpc++-dev \ - libidn2-dev \ - libkrb5-dev \ - libldap-dev \ liblz4-dev \ - libnghttp2-dev \ + libopentelemetry-proto-dev \ libprotobuf-dev \ libprotoc-dev \ - libpsl-dev \ libre2-dev \ - librtmp-dev \ libsnappy-dev \ libsqlite3-dev \ - libssh-dev \ - libssh2-1-dev \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ @@ -96,6 +88,7 @@ RUN apt-get update -y -q && \ rapidjson-dev \ rsync \ tzdata \ + tzdata-legacy \ zlib1g-dev && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/ci/docker/fedora-42-r-clang.dockerfile b/ci/docker/fedora-42-r-clang.dockerfile new file mode 100644 index 00000000000..9bc970e0609 --- /dev/null +++ b/ci/docker/fedora-42-r-clang.dockerfile @@ -0,0 +1,224 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Fedora 42 container with Clang and R-devel for testing Arrow R package +# Replicates CRAN's r-devel-linux-x86_64-fedora-clang environment +# See: https://www.stats.ox.ac.uk/pub/bdr/Rconfig/r-devel-linux-x86_64-fedora-clang + +ARG arch=amd64 +FROM ${arch}/fedora:42 + +# Install build dependencies +RUN dnf update -y && \ + dnf install -y \ + # Build tools + autoconf \ + automake \ + bzip2 \ + bzip2-devel \ + cmake \ + curl \ + curl-devel \ + diffutils \ + gcc \ + gcc-c++ \ + gcc-gfortran \ + git \ + java-latest-openjdk-devel \ + libicu-devel \ + libtool \ + libuuid-devel \ + libxcrypt-devel \ + lld \ + make \ + ninja-build \ + openssl-devel \ + patch \ + pcre2-devel \ + perl \ + pkgconfig \ + python3 \ + python3-pip \ + readline-devel \ + rsync \ + subversion \ + tar \ + texinfo \ + texlive-collection-basic \ + texlive-collection-latex \ + texlive-collection-latexrecommended \ + texlive-collection-fontsrecommended \ + texlive-inconsolata \ + texlive-parskip \ + texlive-natbib \ + texlive-fancyvrb \ + texlive-framed \ + unzip \ + wget \ + which \ + xz \ + xz-devel \ + zlib-devel \ + # X11 libraries for R + cairo-devel \ + libX11-devel \ + libXmu-devel \ + libXt-devel \ + libcurl-devel \ + libjpeg-turbo-devel \ + libpng-devel \ + libtiff-devel \ + pango-devel \ + tk-devel \ + # Additional R dependencies + libxml2-devel \ + fontconfig-devel \ + freetype-devel \ + fribidi-devel \ + harfbuzz-devel && \ + dnf clean all + +# Install LLVM/Clang from Fedora repos (will be the latest available in Fedora 42) +# Note: CRAN uses Clang 21, but we use whatever is available in Fedora repos +# This should be close enough for testing purposes +RUN dnf install -y \ + clang \ + clang-devel \ + clang-tools-extra \ + compiler-rt \ + flang \ + lld \ + llvm \ + llvm-devel \ + libcxx \ + libcxx-devel \ + libcxxabi \ + libcxxabi-devel \ + libomp \ + libomp-devel && \ + dnf clean all + +# Install locale support +RUN dnf install -y glibc-langpack-en && dnf clean all + +# Set up compiler environment to match CRAN's Fedora Clang configuration +# CRAN uses: -O3 -Wall -pedantic -Wp,-D_FORTIFY_SOURCE=3 +# CRAN's clang is built to use libc++ by default; Fedora's defaults to libstdc++, +# so we must add -stdlib=libc++ explicitly +ENV CC=clang \ + CXX="clang++ -stdlib=libc++" \ + FC=flang-new \ + CFLAGS="-O3 -Wall -pedantic -Wp,-D_FORTIFY_SOURCE=3" \ + CXXFLAGS="-O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" \ + FFLAGS="-O2 -pedantic" \ + LDFLAGS="-fuse-ld=lld" + +# Set locale (glibc-langpack-en must be installed first) +ENV LANG=en_US.UTF-8 \ + LC_ALL=en_US.UTF-8 \ + LC_COLLATE=C \ + TZ=UTC + +# Build R-devel from source to match CRAN's R-devel +ARG r_version=devel +RUN cd /tmp && \ + if [ "$r_version" = "devel" ]; then \ + svn checkout https://svn.r-project.org/R/trunk R-devel && \ + cd R-devel/tools && \ + ./rsync-recommended; \ + else \ + wget -q https://cran.r-project.org/src/base/R-4/R-${r_version}.tar.gz && \ + tar xf R-${r_version}.tar.gz && \ + mv R-${r_version} R-devel; \ + fi && \ + cd /tmp/R-devel && \ + ./configure \ + --prefix=/usr/local \ + --enable-R-shlib \ + --enable-memory-profiling \ + --with-blas \ + --with-lapack \ + --with-x \ + --with-tcltk \ + CC="clang" \ + CXX="clang++ -stdlib=libc++" \ + FC="flang-new" \ + CFLAGS="-O3 -Wall -pedantic -Wp,-D_FORTIFY_SOURCE=3" \ + CXXFLAGS="-O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" \ + FFLAGS="-O2 -pedantic" \ + LDFLAGS="-fuse-ld=lld" && \ + make -j$(nproc) && \ + make install && \ + cd / && \ + rm -rf /tmp/R-devel + 
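+# (As a sanity check, the toolchain this R build will hand to packages can be
+# inspected in the built image with `R CMD config CXX` and
+# `R CMD config CXXFLAGS`; given the configure flags above, these should
+# report `clang++ -stdlib=libc++` and the CRAN-style `-O3 -Wall -pedantic`
+# flags respectively.)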
+# Verify R installation and clang +RUN R --version && clang --version + +# Set CRAN repo +RUN echo 'options(repos = c(CRAN = "https://cran.rstudio.com"))' >> $(R RHOME)/etc/Rprofile.site + +# Install pak for package management +RUN R -q -e 'install.packages("pak", repos = sprintf("https://r-lib.github.io/p/pak/%s/%s/%s/%s", "devel", .Platform$pkgType, R.Version()$os, R.Version()$arch))' + +# Enable automatic system requirements installation +ENV PKG_SYSREQS=true \ + R_PKG_SYSREQS2=true + +# Set up parallel compilation +RUN echo "MAKEFLAGS=-j$(R -s -e 'cat(parallel::detectCores())')" >> $(R RHOME)/etc/Renviron.site + +# Configure R to use clang for package compilation (matching CRAN's Makevars) +# Fedora's clang defaults to libstdc++, so we must specify -stdlib=libc++ +RUN mkdir -p /root/.R && \ + echo "CC = clang" >> /root/.R/Makevars && \ + echo "CXX = clang++ -stdlib=libc++" >> /root/.R/Makevars && \ + echo "CXX11 = clang++ -stdlib=libc++" >> /root/.R/Makevars && \ + echo "CXX14 = clang++ -stdlib=libc++" >> /root/.R/Makevars && \ + echo "CXX17 = clang++ -stdlib=libc++" >> /root/.R/Makevars && \ + echo "CXX20 = clang++ -stdlib=libc++" >> /root/.R/Makevars && \ + echo "FC = flang-new" >> /root/.R/Makevars && \ + echo "CFLAGS = -O3 -Wall -pedantic -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "CXXFLAGS = -O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "CXX11FLAGS = -O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "CXX14FLAGS = -O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "CXX17FLAGS = -O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "CXX20FLAGS = -O3 -Wall -pedantic -frtti -stdlib=libc++ -Wp,-D_FORTIFY_SOURCE=3" >> /root/.R/Makevars && \ + echo "FFLAGS = -O2 -pedantic" >> /root/.R/Makevars && \ + echo "LDFLAGS = -fuse-ld=lld" >> /root/.R/Makevars + +# Configure image and install Arrow-specific tooling +COPY ci/scripts/r_docker_configure.sh /arrow/ci/scripts/ +COPY ci/etc/rprofile /arrow/ci/etc/ +COPY ci/scripts/r_install_system_dependencies.sh /arrow/ci/scripts/ +COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/r_docker_configure.sh + +# Install sccache +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + +# Install R package dependencies +COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ +COPY r/DESCRIPTION /arrow/r/ +RUN /arrow/ci/scripts/r_deps.sh /arrow + +# Verify setup +RUN R --version && \ + clang --version && \ + R -e "sessionInfo()" diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index b9f7c716e52..52090f8bb82 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -31,11 +31,9 @@ RUN apt-get update -y && \ lsb-release && \ gpg --keyserver keyserver.ubuntu.com \ --recv-key 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 && \ - gpg --export 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 | \ - gpg --no-default-keyring \ - --keyring /usr/share/keyrings/cran.gpg \ - --import - && \ - echo "deb [signed-by=/usr/share/keyrings/cran.gpg] https://cloud.r-project.org/bin/linux/$(lsb_release -is | tr 'A-Z' 'a-z') $(lsb_release -cs)-cran40/" | \ + gpg --armor --export 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 | \ + tee /usr/share/keyrings/cran.asc && \ + 
echo "deb [signed-by=/usr/share/keyrings/cran.asc] https://cloud.r-project.org/bin/linux/$(lsb_release -is | tr 'A-Z' 'a-z') $(lsb_release -cs)-cran40/" | \ tee /etc/apt/sources.list.d/cran.list && \ if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ sed -i \ diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index ffdd0d44f5f..54033500773 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -113,10 +113,5 @@ RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-${PYTHON_ABI_TAG SHELL ["/bin/bash", "-i", "-c"] ENTRYPOINT ["/bin/bash", "-i", "-c"] -# Remove once there are released Cython wheels for 3.13 free-threaded available -RUN if [ "${python_abi_tag}" = "cp313t" ]; then \ - pip install cython --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" --prefer-binary ; \ - fi - COPY python/requirements-wheel-build.txt /arrow/python/ RUN pip install -r /arrow/python/requirements-wheel-build.txt diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index b0add262e83..8ace741403e 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=22.0.0.9000 +pkgver=23.0.1 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 0ad59bc308f..5d6d5e099ab 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -182,6 +182,15 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then # Some fuzz regression files may trigger huge memory allocations, # let the allocator return null instead of aborting. export ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1" + export ARROW_FUZZING_VERBOSITY=1 + # Run golden IPC integration files: these should ideally load without errors, + # though some very old ones carry invalid data (such as decimal values + # larger than their advertised precision). 
+ # shellcheck disable=SC2046 + "${binary_output_dir}/arrow-ipc-stream-fuzz" $(find "${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.stream") + # shellcheck disable=SC2046 + "${binary_output_dir}/arrow-ipc-file-fuzz" $(find "${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.arrow_file") + # Run known crash files "${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/crash-* "${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/*-testcase-* "${binary_output_dir}/arrow-ipc-file-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-file/*-testcase-* diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index bd61154430e..0990a842e94 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -46,19 +46,9 @@ else exit 1 fi -echo "=== (${PYTHON_VERSION}) Install Python build dependencies ===" -export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') - -# Remove once there are released Cython wheels for 3.13 free-threaded available -FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" -if [[ $FREE_THREADED_BUILD == "True" ]]; then - pip install cython --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" --prefer-binary -fi - pip install \ --force-reinstall \ --only-binary=:all: \ - --target $PIP_SITE_PACKAGES \ --upgrade \ -r ${source_dir}/python/requirements-wheel-build.txt pip install "delocate>=0.10.3" @@ -177,7 +167,7 @@ export CMAKE_PREFIX_PATH=${build_dir}/install export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION} pushd ${source_dir}/python -python setup.py bdist_wheel +python -m build --sdist --wheel . --no-isolation popd echo "=== (${PYTHON_VERSION}) Show dynamic libraries the wheel depend on ===" diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index 84fcaba42e6..75815dadb85 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -33,6 +33,10 @@ def validate_wheel(path): ) ] assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" + for filename in ('LICENSE.txt', 'NOTICE.txt'): + assert any(info.filename.split("/")[-1] == filename + for info in f.filelist), \ + f"{filename} is missing from the wheel." print(f"The wheel: {wheels[0]} seems valid.") diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index b4b7fed99fd..fc256d72785 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -133,7 +133,7 @@ set CMAKE_PREFIX_PATH=C:\arrow-dist pushd C:\arrow\python @REM Build wheel -%PYTHON_CMD% setup.py bdist_wheel || exit /B 1 +%PYTHON_CMD% -m build --sdist --wheel . --no-isolation || exit /B 1 @REM Repair the wheel with delvewheel @REM diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh index a3fbeb3c0b3..ceebbc5ad01 100755 --- a/ci/scripts/python_wheel_xlinux_build.sh +++ b/ci/scripts/python_wheel_xlinux_build.sh @@ -167,7 +167,7 @@ export ARROW_HOME=/tmp/arrow-dist export CMAKE_PREFIX_PATH=/tmp/arrow-dist pushd /arrow/python -python setup.py bdist_wheel +python -m build --sdist --wheel . 
--no-isolation echo "=== Strip symbols from wheel ===" mkdir -p dist/temp-fix-wheel diff --git a/compose.yaml b/compose.yaml index 84481e1af76..31bc5c81b95 100644 --- a/compose.yaml +++ b/compose.yaml @@ -138,6 +138,7 @@ x-hierarchy: - debian-docs - fedora-cpp: - fedora-python + - fedora-r-clang - python-sdist - ubuntu-cpp: - ubuntu-cpp-static @@ -1718,9 +1719,9 @@ services: cache_from: - ${REPO}:amd64-ubuntu-r-valgrind args: - base: wch1/r-debug:latest + base: rhub/valgrind:latest cmake: ${CMAKE} - r_bin: RDvalgrind + r_bin: R tz: ${TZ} environment: <<: [*common, *ccache, *sccache] @@ -1790,6 +1791,37 @@ services: - .:/arrow:delegated command: /arrow/ci/scripts/r_test.sh /arrow + fedora-r-clang: + # Usage: + # docker compose build fedora-r-clang + # docker compose run fedora-r-clang + # Tests R package on Fedora with Clang, simulating CRAN's + # r-devel-linux-x86_64-fedora-clang environment. + # R-devel is built from source with Clang and uses CRAN's compiler flags. + # See: https://www.stats.ox.ac.uk/pub/bdr/Rconfig/r-devel-linux-x86_64-fedora-clang + # Parameters: + # FEDORA: 42 + # ARCH: amd64 + image: ${REPO}:${ARCH}-fedora-${FEDORA}-r-clang + build: + context: . + dockerfile: ci/docker/fedora-${FEDORA}-r-clang.dockerfile + cache_from: + - ${REPO}:${ARCH}-fedora-${FEDORA}-r-clang + args: + arch: ${ARCH} + shm_size: *shm-size + environment: + <<: [*common, *sccache] + LIBARROW_BINARY: "false" + ARROW_SOURCE_HOME: "/arrow" + ARROW_R_DEV: ${ARROW_R_DEV} + ARROW_USE_PKG_CONFIG: "false" + SKIP_VIGNETTES: "true" + NOT_CRAN: "false" + volumes: *fedora-volumes + command: /arrow/ci/scripts/r_test.sh /arrow + ############################## Integration ################################## conda-integration: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5b260c0eb68..d9e518b786b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -96,7 +96,7 @@ if(POLICY CMP0170) cmake_policy(SET CMP0170 NEW) endif() -set(ARROW_VERSION "23.0.0-SNAPSHOT") +set(ARROW_VERSION "23.0.1") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index b95d6491457..df937cc14cb 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2867,6 +2867,13 @@ function(build_re2) fetchcontent_makeavailable(re2) + # Suppress -Wnested-anon-types warnings from RE2's use of anonymous types + # in anonymous unions (a compiler extension). + # See: https://github.com/apache/arrow/issues/48973 + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + target_compile_options(re2 PRIVATE -Wno-nested-anon-types) + endif() + if(CMAKE_VERSION VERSION_LESS 3.28) set_property(DIRECTORY ${re2_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL TRUE) endif() @@ -3366,10 +3373,6 @@ function(build_google_cloud_cpp_storage) # List of dependencies taken from https://github.com/googleapis/google-cloud-cpp/blob/main/doc/packaging.md build_crc32c_once() - # Curl is required on all platforms, but building it internally might also trip over S3's copy. - # For now, force its inclusion from the underlying system or fail. - find_curl() - fetchcontent_declare(google_cloud_cpp ${FC_DECLARE_COMMON_OPTIONS} URL ${google_cloud_cpp_storage_SOURCE_URL} @@ -3453,6 +3456,9 @@ if(ARROW_WITH_GOOGLE_CLOUD_CPP) ) endif() + # curl is required on all platforms. We always use system curl to + # avoid conflict. 
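+  # (Note: find_curl() runs before resolve_dependency() so that the same
+  # system curl is located whether google_cloud_cpp_storage comes from the
+  # system or is bundled.)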
+ find_curl() resolve_dependency(google_cloud_cpp_storage PC_PACKAGE_NAMES google_cloud_cpp_storage) get_target_property(google_cloud_cpp_storage_INCLUDE_DIR google-cloud-cpp::storage INTERFACE_INCLUDE_DIRECTORIES) diff --git a/cpp/meson.build b/cpp/meson.build index 16bb844d089..5632367cb95 100644 --- a/cpp/meson.build +++ b/cpp/meson.build @@ -19,7 +19,7 @@ project( 'arrow', 'cpp', 'c', - version: '23.0.0-SNAPSHOT', + version: '23.0.1', license: 'Apache-2.0', meson_version: '>=1.3.0', default_options: ['c_std=c11', 'warning_level=2', 'cpp_std=c++20'], diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index a3a162616ec..6580476d38c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -558,7 +558,7 @@ Status CrossContainerMoveNotImplemented(const AzureLocation& src, "' requires moving data between containers, which is not implemented."); } -bool IsContainerNotFound(const Storage::StorageException& e) { +bool IsContainerNotFound(const Core::RequestFailedException& e) { // In some situations, only the ReasonPhrase is set and the // ErrorCode is empty, so we check both. if (e.ErrorCode == "ContainerNotFound" || @@ -782,7 +782,7 @@ class ObjectInputFile final : public io::RandomAccessFile { content_length_ = properties.Value.BlobSize; metadata_ = PropertiesToMetadata(properties.Value); return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { return PathNotFound(location_); } @@ -864,7 +864,7 @@ class ObjectInputFile final : public io::RandomAccessFile { return blob_client_ ->DownloadTo(reinterpret_cast(out), nbytes, download_options) .Value.ContentRange.Length.Value(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "DownloadTo from '", blob_client_->GetUrl(), "' at position ", position, " for ", nbytes, @@ -916,7 +916,7 @@ class ObjectInputFile final : public io::RandomAccessFile { Status CreateEmptyBlockBlob(const Blobs::BlockBlobClient& block_blob_client) { try { block_blob_client.UploadFrom(nullptr, 0); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "UploadFrom failed for '", block_blob_client.GetUrl(), "'. There is no existing blob at this location or the existing blob must be " @@ -929,7 +929,7 @@ Result GetBlockList( std::shared_ptr block_blob_client) { try { return block_blob_client->GetBlockList().Value; - } catch (Storage::StorageException& exception) { + } catch (Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "GetBlockList failed for '", block_blob_client->GetUrl(), "'. Cannot write to a file without first fetching the existing block list."); @@ -945,7 +945,7 @@ Status CommitBlockList(std::shared_ptr block_bl // previously committed blocks. // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block-list?tabs=microsoft-entra-id#request-body block_blob_client->CommitBlockList(block_ids, options); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "CommitBlockList failed for '", block_blob_client->GetUrl(), "'. 
Committing is required to flush an output/append stream."); @@ -957,7 +957,7 @@ Status StageBlock(Blobs::BlockBlobClient* block_blob_client, const std::string& Core::IO::MemoryBodyStream& content) { try { block_blob_client->StageBlock(id, content); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "StageBlock failed for '", block_blob_client->GetUrl(), "' new_block_id: '", id, @@ -1023,7 +1023,7 @@ class ObjectAppendStream final : public io::OutputStream { } content_length_ = properties.Value.BlobSize; pos_ = content_length_; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { // No file exists but on flat namespace its possible there is a directory // marker or an implied directory. Ensure there is no directory before starting @@ -1366,7 +1366,7 @@ Result CheckIfHierarchicalNamespaceIsEnabled( // Azurite issue detected. DCHECK(IsDfsEmulator(options)); return HNSSupport::kDisabled; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { // Flat namespace storage accounts with "soft delete" enabled return // // "Conflict - This endpoint does not support BlobStorageEvents @@ -1400,9 +1400,6 @@ Result CheckIfHierarchicalNamespaceIsEnabled( "Check for Hierarchical Namespace support on '", adlfs_client.GetUrl(), "' failed."); } - } catch (const Azure::Core::Http::TransportException& exception) { - return ExceptionToStatus(exception, "Check for Hierarchical Namespace support on '", - adlfs_client.GetUrl(), "' failed."); } catch (const std::exception& exception) { return Status::UnknownError( "Check for Hierarchical Namespace support on '", adlfs_client.GetUrl(), @@ -1436,7 +1433,7 @@ Result GetContainerPropsAsFileInfo(const AzureLocation& location, info.set_type(FileType::Directory); info.set_mtime(std::chrono::system_clock::time_point{properties.Value.LastModified}); return info; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { info.set_type(FileType::NotFound); return info; @@ -1452,7 +1449,7 @@ Status CreateContainerIfNotExists(const std::string& container_name, try { container_client.CreateIfNotExists(); return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to create a container: ", container_name, ": ", container_client.GetUrl()); } @@ -1545,7 +1542,7 @@ class LeaseGuard { DCHECK(release_attempt_pending_); try { lease_client_->Release(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to release the ", lease_client_->GetLeaseId(), " lease"); } @@ -1588,7 +1585,7 @@ class LeaseGuard { break_or_expires_at_ = std::min(break_or_expires_at_, SteadyClock::now() + break_period.ValueOr(std::chrono::seconds{0})); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to break the ", lease_client_->GetLeaseId(), " lease expiring in ", remaining_time_ms().count(), "ms"); @@ -1783,7 +1780,7 @@ class AzureFileSystem::Impl { info.set_mtime( 
std::chrono::system_clock::time_point{properties.Value.LastModified}); return info; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { return FileInfo{location.all, FileType::NotFound}; } @@ -1858,7 +1855,7 @@ class AzureFileSystem::Impl { } info.set_type(FileType::NotFound); return info; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { return FileInfo{location.all, FileType::NotFound}; } @@ -1918,7 +1915,7 @@ class AzureFileSystem::Impl { RETURN_NOT_OK(on_container(container)); } } - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to list account containers."); } return Status::OK(); @@ -1973,7 +1970,7 @@ class AzureFileSystem::Impl { } } } - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception) || exception.ErrorCode == "PathNotFound") { found = false; } else { @@ -2086,7 +2083,7 @@ class AzureFileSystem::Impl { RETURN_NOT_OK(process_prefix(list_response.BlobPrefixes[blob_prefix_index])); } } - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { found = false; } else { @@ -2225,7 +2222,7 @@ class AzureFileSystem::Impl { if (container_info.type() == FileType::NotFound) { try { container_client.CreateIfNotExists(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to create directory '", location.all, "': ", container_client.GetUrl()); } @@ -2252,7 +2249,7 @@ class AzureFileSystem::Impl { const auto& nonexistent_location = nonexistent_locations[i - 1]; try { create_if_not_exists(container_client, nonexistent_location); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to create directory '", location.all, "': ", container_client.GetUrl()); } @@ -2270,7 +2267,7 @@ class AzureFileSystem::Impl { try { create_if_not_exists(container_client, location); return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { auto parent = location.parent(); return PathNotFound(parent); @@ -2378,7 +2375,7 @@ class AzureFileSystem::Impl { try { EnsureEmptyDirExistsImplThatThrows(container_client, location.path); return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, operation_name, " failed to ensure empty directory marker '", location.path, "' exists in container: ", container_client.GetUrl()); @@ -2396,7 +2393,7 @@ class AzureFileSystem::Impl { // Only the "*IfExists" functions ever set Deleted to false. // All the others either succeed or throw an exception. 
DCHECK(response.Value.Deleted); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { return PathNotFound(location); } @@ -2492,7 +2489,7 @@ class AzureFileSystem::Impl { if (!deferred_responses.empty()) { container_client.SubmitBatch(batch); } - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to delete blobs in a directory: ", location.path, ": ", container_client.GetUrl()); } @@ -2502,7 +2499,7 @@ class AzureFileSystem::Impl { try { auto delete_result = deferred_response.GetResponse(); success = delete_result.Value.Deleted; - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { success = false; } if (!success) { @@ -2521,7 +2518,7 @@ class AzureFileSystem::Impl { } } return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to list blobs in a directory: ", location.path, ": ", container_client.GetUrl()); @@ -2557,7 +2554,7 @@ class AzureFileSystem::Impl { // Only the "*IfExists" functions ever set Deleted to false. // All the others either succeed or throw an exception. DCHECK(response.Value.Deleted); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.ErrorCode == "FilesystemNotFound" || exception.ErrorCode == "PathNotFound") { if (require_dir_to_exist) { @@ -2584,7 +2581,7 @@ class AzureFileSystem::Impl { auto sub_directory_client = adlfs_client.GetDirectoryClient(path.Name); try { sub_directory_client.DeleteRecursive(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "Failed to delete a sub directory: ", location.container, kDelimiter, path.Name, ": ", sub_directory_client.GetUrl()); @@ -2596,7 +2593,7 @@ class AzureFileSystem::Impl { auto sub_file_client = adlfs_client.GetFileClient(path.Name); try { sub_file_client.Delete(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus( exception, "Failed to delete a sub file: ", location.container, kDelimiter, path.Name, ": ", sub_file_client.GetUrl()); @@ -2605,7 +2602,7 @@ class AzureFileSystem::Impl { } } return Status::OK(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (missing_dir_ok && exception.StatusCode == Http::HttpStatusCode::NotFound) { return Status::OK(); } @@ -2634,7 +2631,7 @@ class AzureFileSystem::Impl { try { [[maybe_unused]] auto result = lease_client->Acquire(lease_duration); DCHECK_EQ(result.Value.LeaseId, lease_client->GetLeaseId()); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (IsContainerNotFound(exception)) { if (allow_missing_container) { return nullptr; @@ -2674,7 +2671,7 @@ class AzureFileSystem::Impl { try { [[maybe_unused]] auto result = lease_client->Acquire(lease_duration); DCHECK_EQ(result.Value.LeaseId, lease_client->GetLeaseId()); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.StatusCode == 
Http::HttpStatusCode::NotFound) { if (allow_missing) { return nullptr; @@ -2749,7 +2746,7 @@ class AzureFileSystem::Impl { // Only the "*IfExists" functions ever set Deleted to false. // All the others either succeed or throw an exception. DCHECK(response.Value.Deleted); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { // ErrorCode can be "FilesystemNotFound", "PathNotFound"... if (require_file_to_exist) { @@ -2841,7 +2838,7 @@ class AzureFileSystem::Impl { // Only the "*IfExists" functions ever set Deleted to false. // All the others either succeed or throw an exception. DCHECK(response.Value.Deleted); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { return check_if_location_exists_as_dir(); } @@ -2906,7 +2903,7 @@ class AzureFileSystem::Impl { if (!dest_is_empty) { return NotEmpty(dest); } - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to check that '", dest.container, "' is empty: ", dest_container_client.GetUrl()); } @@ -2936,6 +2933,10 @@ class AzureFileSystem::Impl { return ExceptionToStatus(exception, "Failed to rename container '", src.container, "' to '", dest.container, "': ", blob_service_client_->GetUrl()); + } catch (const Core::RequestFailedException& exception) { + return ExceptionToStatus(exception, "Failed to rename container '", src.container, + "' to '", dest.container, + "': ", blob_service_client_->GetUrl()); } } else if (dest_is_empty) { // Even if we deleted the empty dest.container, RenameBlobContainer() would still @@ -2972,11 +2973,11 @@ class AzureFileSystem::Impl { src_lease_guard.BreakBeforeDeletion(kTimeNeededForContainerDeletion); src_container_client.Delete(options); src_lease_guard.Forget(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Failed to delete empty container: '", src.container, "': ", src_container_client.GetUrl()); } - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { return ExceptionToStatus(exception, "Unable to replace empty container: '", dest.all, "': ", dest_container_client.GetUrl()); } @@ -3117,7 +3118,7 @@ class AzureFileSystem::Impl { src_lease_guard.BreakBeforeDeletion(kTimeNeededForFileOrDirectoryRename); src_adlfs_client.RenameFile(src_path, dest_path, options); src_lease_guard.Forget(); - } catch (const Storage::StorageException& exception) { + } catch (const Core::RequestFailedException& exception) { // https://learn.microsoft.com/en-gb/rest/api/storageservices/datalakestoragegen2/path/create if (exception.StatusCode == Http::HttpStatusCode::NotFound) { if (exception.ErrorCode == "PathNotFound") { diff --git a/cpp/src/arrow/flight/cookie_internal.cc b/cpp/src/arrow/flight/cookie_internal.cc index 99fa8b238dd..df09a77afb7 100644 --- a/cpp/src/arrow/flight/cookie_internal.cc +++ b/cpp/src/arrow/flight/cookie_internal.cc @@ -64,6 +64,11 @@ size_t CaseInsensitiveHash::operator()(const std::string& key) const { return std::hash{}(upper_string); } +bool CaseInsensitiveEqual::operator()(const std::string& lhs, + const std::string& rhs) const { + return strcasecmp(lhs.c_str(), rhs.c_str()) == 0; +} + 
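+// A minimal sketch of the intended behaviour (the `cookie` value below is
+// hypothetical): CaseInsensitiveHash as Hash plus CaseInsensitiveEqual as
+// KeyEqual make lookups ignore key case, so "max-age", "Max-Age" and
+// "MAX-AGE" all resolve to the same map entry:
+//
+//   std::unordered_map<std::string, Cookie,
+//                      CaseInsensitiveHash, CaseInsensitiveEqual> map;
+//   map.emplace("Max-Age", cookie);
+//   assert(map.find("MAX-AGE") != map.end());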
Cookie Cookie::Parse(std::string_view cookie_header_value) { // Parse the cookie string. If the cookie has an expiration, record it. // If the cookie has a max-age, calculate the current time + max_age and set that as diff --git a/cpp/src/arrow/flight/cookie_internal.h b/cpp/src/arrow/flight/cookie_internal.h index 62c0390c585..98b936edb33 100644 --- a/cpp/src/arrow/flight/cookie_internal.h +++ b/cpp/src/arrow/flight/cookie_internal.h @@ -41,6 +41,12 @@ class ARROW_FLIGHT_EXPORT CaseInsensitiveComparator { bool operator()(const std::string& t1, const std::string& t2) const; }; +/// \brief Case insensitive equality comparator for use by unordered cookie map. +class ARROW_FLIGHT_EXPORT CaseInsensitiveEqual { + public: + bool operator()(const std::string& lhs, const std::string& rhs) const; +}; + /// \brief Case insensitive hasher for use by cookie caching map. Cookies are not /// case-sensitive. class ARROW_FLIGHT_EXPORT CaseInsensitiveHash { @@ -117,7 +123,7 @@ class ARROW_FLIGHT_EXPORT CookieCache { // Mutex must be used to protect cookie cache. std::mutex mutex_; - std::unordered_map<std::string, Cookie, CaseInsensitiveHash, CaseInsensitiveComparator> + std::unordered_map<std::string, Cookie, CaseInsensitiveHash, CaseInsensitiveEqual> cookies; }; diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc index 422c45fc059..8b2b564d8db 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc +++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc @@ -157,9 +157,6 @@ void FlightSqlConnection::Connect(const ConnPropertyMap& properties, client_options_ = BuildFlightClientOptions(properties, missing_attr, flight_ssl_configs); - const std::shared_ptr<ClientMiddlewareFactory>& cookie_factory = GetCookieFactory(); - client_options_.middleware.push_back(cookie_factory); - std::unique_ptr<FlightClient> flight_client; ThrowIfNotOK(FlightClient::Connect(location, client_options_).Value(&flight_client)); PopulateMetadataSettings(properties); diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_driver.cc b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_driver.cc index 8b24762bfc3..c6a813cfd48 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_driver.cc +++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_driver.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include <absl/synchronization/mutex.h> + +#include "arrow/flight/sql/odbc/odbc_impl/flight_sql_driver.h" #include "arrow/compute/api.h" @@ -37,6 +39,8 @@ FlightSqlDriver::FlightSqlDriver() RegisterComputeKernels(); // Register log after compute kernels check to avoid segfaults RegisterLog(); + // GH-48637: Disable Absl Deadlock detection from upstream projects + absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); } FlightSqlDriver::~FlightSqlDriver() { diff --git a/cpp/src/arrow/io/buffered.cc b/cpp/src/arrow/io/buffered.cc index 0dae888ca0e..14a0fe4215e 100644 --- a/cpp/src/arrow/io/buffered.cc +++ b/cpp/src/arrow/io/buffered.cc @@ -285,8 +285,9 @@ class BufferedInputStream::Impl : public BufferedBase { // Resize internal read buffer. Note that the internal buffer-size // should not be larger than the raw_read_bound_. - // It might change the buffer_size_, but will not change buffer states - // buffer_pos_ and bytes_buffered_. + // It might change the buffer_size_, and may reset buffer_pos_ to 0 + // when bytes_buffered_ == 0 to reuse the beginning of the buffer. + // bytes_buffered_ will not be changed.
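+ // (Concretely, per PeekAfterExhaustingBuffer in buffered_test.cc: with
+ // raw_read_bound_ == 25, once the first 10 buffered bytes are consumed, a
+ // following Peek(15) can place the remaining 15 bytes at position 0 instead
+ // of growing the buffer to 25.)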
Status SetBufferSize(int64_t new_buffer_size) { if (new_buffer_size <= 0) { return Status::Invalid("Buffer size should be positive"); } @@ -297,12 +298,14 @@ class BufferedInputStream::Impl : public BufferedBase { new_buffer_size, ", buffer_pos: ", buffer_pos_, ", bytes_buffered: ", bytes_buffered_, ", buffer_size: ", buffer_size_); } + bool need_reset_buffer_pos = false; if (raw_read_bound_ >= 0) { // No need to reserve space for more than the total remaining number of bytes. if (bytes_buffered_ == 0) { - // Special case: we can not keep the current buffer because it does not + // Special case: we can overwrite data in the current buffer because it does not // contain any required data. new_buffer_size = std::min(new_buffer_size, raw_read_bound_ - raw_read_total_); + need_reset_buffer_pos = true; } else { // We should keep the current buffer because it contains data that // can be read. @@ -311,7 +314,11 @@ buffer_pos_ + bytes_buffered_ + (raw_read_bound_ - raw_read_total_)); } } - return ResizeBuffer(new_buffer_size); + auto status = ResizeBuffer(new_buffer_size); + if (status.ok() && need_reset_buffer_pos) { + buffer_pos_ = 0; + } + return status; } Result<std::string_view> Peek(int64_t nbytes) { diff --git a/cpp/src/arrow/io/buffered_test.cc b/cpp/src/arrow/io/buffered_test.cc index 1d4805f580c..efaec09dc7c 100644 --- a/cpp/src/arrow/io/buffered_test.cc +++ b/cpp/src/arrow/io/buffered_test.cc @@ -514,6 +514,39 @@ TEST_F(TestBufferedInputStream, PeekPastBufferedBytes) { ASSERT_EQ(0, buffered_->bytes_buffered()); } +TEST_F(TestBufferedInputStream, PeekAfterExhaustingBuffer) { + // GH-48311: When bytes_buffered_ == 0 and raw_read_bound_ >= 0, + // SetBufferSize should reset buffer_pos_ to 0 and reuse the beginning of the buffer + MakeExample1(/*buffer_size=*/10, default_memory_pool(), /*raw_read_bound=*/25); + + // Fill the buffer + ASSERT_OK_AND_ASSIGN(auto view, buffered_->Peek(10)); + EXPECT_EQ(view, kExample1.substr(0, 10)); + ASSERT_EQ(10, buffered_->bytes_buffered()); + ASSERT_EQ(10, buffered_->buffer_size()); + + // Read all buffered bytes to exhaust the buffer (bytes_buffered_ == 0), + // at this point buffer_pos_ is non-zero + ASSERT_OK_AND_ASSIGN(auto bytes, buffered_->Read(10)); + EXPECT_EQ(std::string_view(*bytes), kExample1.substr(0, 10)); + ASSERT_EQ(0, buffered_->bytes_buffered()); + ASSERT_EQ(10, buffered_->buffer_size()); + + // Peek should trigger SetBufferSize with bytes_buffered_ == 0, + // which should reset buffer_pos_ to 0 and reuse the beginning of the buffer, + // so resulting size of the buffer should be 15 instead of 25 + ASSERT_OK_AND_ASSIGN(view, buffered_->Peek(15)); + EXPECT_EQ(view, kExample1.substr(10, 15)); + ASSERT_EQ(15, buffered_->bytes_buffered()); + ASSERT_EQ(15, buffered_->buffer_size()); + + // Do read just in case + ASSERT_OK_AND_ASSIGN(bytes, buffered_->Read(15)); + EXPECT_EQ(std::string_view(*bytes), kExample1.substr(10, 15)); + ASSERT_EQ(0, buffered_->bytes_buffered()); + ASSERT_EQ(15, buffered_->buffer_size()); +} + class TestBufferedInputStreamBound : public ::testing::Test { public: void SetUp() { CreateExample(/*bounded=*/true); } diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 7919878f148..8be09956f10 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -375,6 +375,8 @@ Result<std::unique_ptr<Message>> ReadMessage(int64_t offset, int32_t metadata_le decoder.next_required_size()); } + + // TODO(GH-48846): we should take a body_length just like ReadMessageAsync + // and read
metadata + body in one go. ARROW_ASSIGN_OR_RAISE(auto metadata, file->ReadAt(offset, metadata_length)); if (metadata->size() < metadata_length) { return Status::Invalid("Expected to read ", metadata_length, diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 315d8bd07d9..9f7df541bd7 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -1252,40 +1252,55 @@ struct FileGeneratorWriterHelper : public FileWriterHelper { Status ReadBatches(const IpcReadOptions& options, RecordBatchVector* out_batches, ReadStats* out_stats = nullptr, MetadataVector* out_metadata_list = nullptr) override { - std::shared_ptr<io::RandomAccessFile> buf_reader; - if (kCoalesce) { - // Use a non-zero-copy enabled BufferReader so we can test paths properly - buf_reader = std::make_shared(buffer_); - } else { - buf_reader = std::make_shared(buffer_); - } - AsyncGenerator<std::shared_ptr<RecordBatch>> generator; + // The generator doesn't track stats. + EXPECT_EQ(nullptr, out_stats); - { - auto fut = RecordBatchFileReader::OpenAsync(buf_reader, footer_offset_, options); - // Do NOT assert OK since some tests check whether this fails properly - EXPECT_FINISHES(fut); - ARROW_ASSIGN_OR_RAISE(auto reader, fut.result()); - EXPECT_EQ(num_batches_written_, reader->num_record_batches()); - // Generator will keep reader alive internally - ARROW_ASSIGN_OR_RAISE(generator, reader->GetRecordBatchGenerator(kCoalesce)); - } + auto read_batches = [&](bool pre_buffer) -> Result<RecordBatchVector> { + std::shared_ptr<io::RandomAccessFile> buf_reader; + if (kCoalesce) { + // Use a non-zero-copy enabled BufferReader so we can test paths properly + buf_reader = std::make_shared(buffer_); + } else { + buf_reader = std::make_shared(buffer_); + } + AsyncGenerator<std::shared_ptr<RecordBatch>> generator; + + { + auto fut = RecordBatchFileReader::OpenAsync(buf_reader, footer_offset_, options); + ARROW_ASSIGN_OR_RAISE(auto reader, fut.result()); + EXPECT_EQ(num_batches_written_, reader->num_record_batches()); + if (pre_buffer) { + RETURN_NOT_OK(reader->PreBufferMetadata(/*indices=*/{})); + } + // Generator will keep reader alive internally + ARROW_ASSIGN_OR_RAISE(generator, reader->GetRecordBatchGenerator(kCoalesce)); + } - // Generator is async-reentrant - std::vector<Future<std::shared_ptr<RecordBatch>>> futures; + // Generator is async-reentrant + std::vector<Future<std::shared_ptr<RecordBatch>>> futures; + for (int i = 0; i < num_batches_written_; ++i) { + futures.push_back(generator()); + } + auto fut = generator(); + ARROW_ASSIGN_OR_RAISE(auto final_batch, fut.result()); + EXPECT_EQ(nullptr, final_batch); + + RecordBatchVector batches; + for (auto& future : futures) { + ARROW_ASSIGN_OR_RAISE(auto batch, future.result()); + EXPECT_NE(nullptr, batch); + batches.push_back(batch); + } + return batches; + }; + + ARROW_ASSIGN_OR_RAISE(*out_batches, read_batches(/*pre_buffer=*/false)); + // Also read with pre-buffered metadata, and check the results are equal + ARROW_ASSIGN_OR_RAISE(auto batches_pre_buffered, read_batches(/*pre_buffer=*/true)); for (int i = 0; i < num_batches_written_; ++i) { - futures.push_back(generator()); - } - auto fut = generator(); - EXPECT_FINISHES_OK_AND_EQ(nullptr, fut); - for (auto& future : futures) { - EXPECT_FINISHES_OK_AND_ASSIGN(auto batch, future); - out_batches->push_back(batch); + AssertBatchesEqual(*batches_pre_buffered[i], *(*out_batches)[i], + /*check_metadata=*/true); } - - // The generator doesn't track stats.
- EXPECT_EQ(nullptr, out_stats); - return Status::OK(); } }; diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 4910b1596c3..046eacb6ced 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -54,6 +54,7 @@ #include "arrow/util/compression.h" #include "arrow/util/endian.h" #include "arrow/util/fuzz_internal.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging_internal.h" #include "arrow/util/parallel.h" @@ -72,6 +73,7 @@ namespace arrow { namespace flatbuf = org::apache::arrow::flatbuf; +using internal::AddWithOverflow; using internal::checked_cast; using internal::checked_pointer_cast; @@ -177,14 +179,16 @@ class ArrayLoader { explicit ArrayLoader(const flatbuf::RecordBatch* metadata, MetadataVersion metadata_version, const IpcReadOptions& options, - int64_t file_offset) + int64_t file_offset, int64_t file_length) : metadata_(metadata), metadata_version_(metadata_version), file_(nullptr), file_offset_(file_offset), + file_length_(file_length), max_recursion_depth_(options.max_recursion_depth) {} Status ReadBuffer(int64_t offset, int64_t length, std::shared_ptr<Buffer>* out) { + // This construct permits overriding GetBuffer at compile time if (skip_io_) { return Status::OK(); } @@ -194,7 +198,10 @@ class ArrayLoader { if (length < 0) { return Status::Invalid("Negative length for reading buffer ", buffer_index_); } - // This construct permits overriding GetBuffer at compile time + auto read_end = AddWithOverflow({offset, length}); + if (!read_end.has_value() || (file_length_.has_value() && read_end > file_length_)) { + return Status::Invalid("Buffer ", buffer_index_, " exceeds IPC file area"); + } if (!bit_util::IsMultipleOf8(offset)) { return Status::Invalid("Buffer ", buffer_index_, " did not start on 8-byte aligned offset: ", offset); @@ -202,6 +209,9 @@ if (file_) { return file_->ReadAt(offset, length).Value(out); } else { + if (!AddWithOverflow({read_end.value(), file_offset_}).has_value()) { + return Status::Invalid("Buffer ", buffer_index_, " exceeds IPC file area"); + } read_request_.RequestRange(offset + file_offset_, length, out); return Status::OK(); } @@ -235,7 +245,7 @@ class ArrayLoader { } Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) { - auto buffers = metadata_->buffers(); + auto* buffers = metadata_->buffers(); CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers"); if (buffer_index >= static_cast<int>(buffers->size())) { return Status::IOError("buffer_index out of range."); } @@ -250,18 +260,25 @@ } } - Result<size_t> GetVariadicCount(int i) { + Result<int64_t> GetVariadicCount(int i) { auto* variadic_counts = metadata_->variadicBufferCounts(); + auto* buffers = metadata_->buffers(); CHECK_FLATBUFFERS_NOT_NULL(variadic_counts, "RecordBatch.variadicBufferCounts"); + CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers"); if (i >= static_cast<int>(variadic_counts->size())) { return Status::IOError("variadic_count_index out of range."); } int64_t count = variadic_counts->Get(i); - if (count < 0 || count > std::numeric_limits<int32_t>::max()) { - return Status::IOError( - "variadic_count must be representable as a positive int32_t, got ", count, "."); + if (count < 0) { + return Status::IOError("variadic buffer count must be non-negative"); + } + // Detect an excessive variadic buffer count to avoid potential memory blowup + // (GH-48900).
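+ // (Without this cap, a crafted file could advertise an enormous count, e.g.
+ // 2^62; LoadCommon() uses the returned value to resize out_->buffers before
+ // any buffer is actually read, so the allocation would happen up front.)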
+ const auto max_buffer_count = static_cast<int64_t>(buffers->size()) - buffer_index_; + if (count > max_buffer_count) { + return Status::IOError("variadic buffer count exceeds available number of buffers"); } - return static_cast<size_t>(count); + return count; } Status GetFieldMetadata(int field_index, ArrayData* out) { @@ -286,6 +303,16 @@ // we can skip that buffer without reading from shared memory RETURN_NOT_OK(GetFieldMetadata(field_index_++, out_)); + if (::arrow::internal::has_variadic_buffers(type_id)) { + ARROW_ASSIGN_OR_RAISE(auto data_buffer_count, + GetVariadicCount(variadic_count_index_++)); + const int64_t start = static_cast<int64_t>(out_->buffers.size()); + // NOTE: this must be done before any other call to `GetBuffer` because + // BatchDataReadRequest will keep pointers to `std::shared_ptr<Buffer>` + // objects. + out_->buffers.resize(start + data_buffer_count); + } + if (internal::HasValidityBitmap(type_id, metadata_version_)) { // Extract null_bitmap which is common to all arrays except for unions + // and nulls. @@ -294,6 +321,7 @@ } buffer_index_++; } + return Status::OK(); } @@ -392,14 +420,9 @@ Status Visit(const BinaryViewType& type) { out_->buffers.resize(2); - RETURN_NOT_OK(LoadCommon(type.id())); - RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); - - ARROW_ASSIGN_OR_RAISE(auto data_buffer_count, - GetVariadicCount(variadic_count_index_++)); - out_->buffers.resize(data_buffer_count + 2); - for (size_t i = 0; i < data_buffer_count; ++i) { - RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i + 2])); + RETURN_NOT_OK(LoadCommon(type.id())); // also initializes variadic buffers + for (int64_t i = 1; i < static_cast<int64_t>(out_->buffers.size()); ++i) { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i])); } return Status::OK(); } @@ -497,6 +520,7 @@ class ArrayLoader { const MetadataVersion metadata_version_; io::RandomAccessFile* file_; int64_t file_offset_; + std::optional<int64_t> file_length_; int max_recursion_depth_; int buffer_index_ = 0; int field_index_ = 0; @@ -1167,8 +1191,19 @@ namespace { // Common functions used in both the random-access file reader and the // asynchronous generator -inline FileBlock FileBlockFromFlatbuffer(const flatbuf::Block* block) { - return FileBlock{block->offset(), block->metaDataLength(), block->bodyLength()}; +Result<FileBlock> FileBlockFromFlatbuffer(const flatbuf::Block* fb_block, + int64_t max_offset) { + auto block = + FileBlock{fb_block->offset(), fb_block->metaDataLength(), fb_block->bodyLength()}; + if (block.metadata_length < 0 || block.body_length < 0 || block.offset < 0) { + return Status::IOError("Invalid Block in IPC file footer"); + } + auto block_end = + AddWithOverflow({block.offset, block.metadata_length, block.body_length}); + if (!block_end.has_value() || block_end > max_offset) { + return Status::IOError("Invalid Block in IPC file footer"); + } + return block; } Status CheckAligned(const FileBlock& block) { @@ -1180,31 +1215,36 @@ Status CheckAligned(const FileBlock& block) { return Status::OK(); } +template <typename MessagePtr> +Result<MessagePtr> CheckBodyLength(MessagePtr message, const FileBlock& block) { + if (message->body_length() != block.body_length) { + return Status::Invalid( + "Mismatching body length for IPC message " + "(Block.bodyLength: ", + block.body_length, " vs. Message.bodyLength: ", message->body_length(), ")"); + } + // NOTE: we cannot check the metadata length as easily, as we would have to + // account for additional IPC signalling (such as optional continuation bytes).
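+ // (In the file format, Block.metaDataLength also covers the continuation
+ // marker, the 4-byte size prefix and any 8-byte alignment padding, so it can
+ // legitimately differ from the size of the metadata flatbuffer itself.)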
@@ -1180,31 +1215,36 @@
   return Status::OK();
 }

+template <typename MessagePtr>
+Result<MessagePtr> CheckBodyLength(MessagePtr message, const FileBlock& block) {
+  if (message->body_length() != block.body_length) {
+    return Status::Invalid(
+        "Mismatching body length for IPC message "
+        "(Block.bodyLength: ",
+        block.body_length, " vs. Message.bodyLength: ", message->body_length(), ")");
+  }
+  // NOTE: we cannot check the metadata length as easily, as we would have to
+  // account for the additional IPC signalling (such as optional continuation bytes).
+  return message;
+}
+
 Result<std::shared_ptr<Message>> ReadMessageFromBlock(
     const FileBlock& block, io::RandomAccessFile* file,
     const FieldsLoaderFunction& fields_loader) {
   RETURN_NOT_OK(CheckAligned(block));
-  // TODO(wesm): this breaks integration tests, see ARROW-3256
-  // DCHECK_EQ((*out)->body_length(), block.body_length);
-
   ARROW_ASSIGN_OR_RAISE(
       auto message, ReadMessage(block.offset, block.metadata_length, file, fields_loader));
-  return message;
+  return CheckBodyLength(std::move(message), block);
 }

 Future<std::shared_ptr<Message>> ReadMessageFromBlockAsync(
     const FileBlock& block, io::RandomAccessFile* file, const io::IOContext& io_context) {
-  if (!bit_util::IsMultipleOf8(block.offset) ||
-      !bit_util::IsMultipleOf8(block.metadata_length) ||
-      !bit_util::IsMultipleOf8(block.body_length)) {
-    return Status::Invalid("Unaligned block in IPC file");
-  }
-
-  // TODO(wesm): this breaks integration tests, see ARROW-3256
-  // DCHECK_EQ((*out)->body_length(), block.body_length);
-
+  RETURN_NOT_OK(CheckAligned(block));
   return ReadMessageAsync(block.offset, block.metadata_length, block.body_length, file,
-                          io_context);
+                          io_context)
+      .Then([block](std::shared_ptr<Message> message) {
+        return CheckBodyLength(std::move(message), block);
+      });
 }

 class RecordBatchFileReaderImpl;
@@ -1351,8 +1391,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
             read_options, file, schema, &inclusion_mask);
       };
     }
-    ARROW_ASSIGN_OR_RAISE(auto message,
-                          ReadMessageFromBlock(GetRecordBatchBlock(i), fields_loader));
+    ARROW_ASSIGN_OR_RAISE(auto block, GetRecordBatchBlock(i));
+    ARROW_ASSIGN_OR_RAISE(auto message, ReadMessageFromBlock(block, fields_loader));

     CHECK_HAS_BODY(*message);
     ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
@@ -1368,8 +1408,8 @@
   Result<int64_t> CountRows() override {
     int64_t total = 0;
     for (int i = 0; i < num_record_batches(); i++) {
-      ARROW_ASSIGN_OR_RAISE(auto outer_message,
-                            ReadMessageFromBlock(GetRecordBatchBlock(i)));
+      ARROW_ASSIGN_OR_RAISE(auto block, GetRecordBatchBlock(i));
+      ARROW_ASSIGN_OR_RAISE(auto outer_message, ReadMessageFromBlock(block));
       auto metadata = outer_message->metadata();
       const flatbuf::Message* message = nullptr;
       RETURN_NOT_OK(
@@ -1483,13 +1523,13 @@
   Status DoPreBufferMetadata(const std::vector<int>& indices) {
     RETURN_NOT_OK(CacheMetadata(indices));
-    EnsureDictionaryReadStarted();
+    RETURN_NOT_OK(EnsureDictionaryReadStarted());
     Future<> all_metadata_ready = WaitForMetadatas(indices);
     for (int index : indices) {
       Future<std::shared_ptr<Message>> metadata_loaded =
           all_metadata_ready.Then([this, index]() -> Result<std::shared_ptr<Message>> {
             stats_.num_messages.fetch_add(1, std::memory_order_relaxed);
-            FileBlock block = GetRecordBatchBlock(index);
+            ARROW_ASSIGN_OR_RAISE(FileBlock block, GetRecordBatchBlock(index));
             ARROW_ASSIGN_OR_RAISE(
                 std::shared_ptr<Buffer> metadata,
                 metadata_cache_->Read({block.offset, block.metadata_length}));
@@ -1538,12 +1578,12 @@
     }
   };

-  FileBlock GetRecordBatchBlock(int i) const {
-    return FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i));
+  Result<FileBlock> GetRecordBatchBlock(int i) const {
+    return FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i), footer_offset_);
   }

-  FileBlock GetDictionaryBlock(int i) const {
-    return FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i));
+  Result<FileBlock> GetDictionaryBlock(int i) const {
+    return FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i), footer_offset_);
   }

   Result<std::shared_ptr<Message>>
 ReadMessageFromBlock(
@@ -1556,16 +1596,26 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
   Status ReadDictionaries() {
     // Read all the dictionaries
+    std::vector<std::shared_ptr<Message>> messages(num_dictionaries());
+    for (int i = 0; i < num_dictionaries(); ++i) {
+      ARROW_ASSIGN_OR_RAISE(FileBlock block, GetDictionaryBlock(i));
+      ARROW_ASSIGN_OR_RAISE(messages[i], ReadMessageFromBlock(block));
+    }
+    return ReadDictionaries(messages);
+  }
+
+  Status ReadDictionaries(
+      const std::vector<std::shared_ptr<Message>>& dictionary_messages) {
+    DCHECK_EQ(dictionary_messages.size(), static_cast<size_t>(num_dictionaries()));
     IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
     for (int i = 0; i < num_dictionaries(); ++i) {
-      ARROW_ASSIGN_OR_RAISE(auto message, ReadMessageFromBlock(GetDictionaryBlock(i)));
-      RETURN_NOT_OK(ReadOneDictionary(message.get(), context));
-      stats_.num_dictionary_batches.fetch_add(1, std::memory_order_relaxed);
+      RETURN_NOT_OK(ReadOneDictionary(i, dictionary_messages[i].get(), context));
     }
     return Status::OK();
   }

-  Status ReadOneDictionary(Message* message, const IpcReadContext& context) {
+  Status ReadOneDictionary(int dict_index, Message* message,
+                           const IpcReadContext& context) {
     CHECK_HAS_BODY(*message);
     ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
     DictionaryKind kind;
@@ -1575,44 +1625,48 @@
     } else if (kind == DictionaryKind::Delta) {
       stats_.num_dictionary_deltas.fetch_add(1, std::memory_order_relaxed);
     }
+    stats_.num_dictionary_batches.fetch_add(1, std::memory_order_relaxed);
     return Status::OK();
   }

-  void AddDictionaryRanges(std::vector<io::ReadRange>* ranges) const {
+  Status AddDictionaryRanges(std::vector<io::ReadRange>* ranges) const {
     // Adds all dictionaries to the range cache
     for (int i = 0; i < num_dictionaries(); ++i) {
-      FileBlock block = GetDictionaryBlock(i);
+      ARROW_ASSIGN_OR_RAISE(FileBlock block, GetDictionaryBlock(i));
       ranges->push_back({block.offset, block.metadata_length + block.body_length});
     }
+    return Status::OK();
   }

-  void AddMetadataRanges(const std::vector<int>& indices,
-                         std::vector<io::ReadRange>* ranges) {
+  Status AddMetadataRanges(const std::vector<int>& indices,
+                           std::vector<io::ReadRange>* ranges) {
     for (int index : indices) {
-      FileBlock block = GetRecordBatchBlock(static_cast<int>(index));
+      ARROW_ASSIGN_OR_RAISE(FileBlock block, GetRecordBatchBlock(index));
       ranges->push_back({block.offset, block.metadata_length});
     }
+    return Status::OK();
   }

   Status CacheMetadata(const std::vector<int>& indices) {
     std::vector<io::ReadRange> ranges;
     if (!read_dictionaries_) {
-      AddDictionaryRanges(&ranges);
+      RETURN_NOT_OK(AddDictionaryRanges(&ranges));
     }
-    AddMetadataRanges(indices, &ranges);
+    RETURN_NOT_OK(AddMetadataRanges(indices, &ranges));
     return metadata_cache_->Cache(std::move(ranges));
   }

-  void EnsureDictionaryReadStarted() {
+  Status EnsureDictionaryReadStarted() {
     if (!dictionary_load_finished_.is_valid()) {
       read_dictionaries_ = true;
       std::vector<io::ReadRange> ranges;
-      AddDictionaryRanges(&ranges);
+      RETURN_NOT_OK(AddDictionaryRanges(&ranges));
       dictionary_load_finished_ =
           metadata_cache_->WaitFor(std::move(ranges)).Then([this] {
             return ReadDictionaries();
           });
     }
+    return Status::OK();
   }

   Status WaitForDictionaryReadFinished() {
@@ -1630,7 +1684,7 @@
   Future<> WaitForMetadatas(const std::vector<int>& indices) {
     std::vector<io::ReadRange> ranges;
-    AddMetadataRanges(indices, &ranges);
+    RETURN_NOT_OK(AddMetadataRanges(indices, &ranges));
     return metadata_cache_->WaitFor(std::move(ranges));
   }
@@ -1674,12 +1728,13 @@ class
 RecordBatchFileReaderImpl : public RecordBatchFileReader {
                       const flatbuf::RecordBatch* batch, IpcReadContext context,
                       io::RandomAccessFile* file,
                       std::shared_ptr<io::RandomAccessFile> owned_file,
-                      int64_t block_data_offset)
+                      int64_t block_data_offset, int64_t block_data_length)
         : schema(std::move(sch)),
           context(std::move(context)),
           file(file),
           owned_file(std::move(owned_file)),
-          loader(batch, context.metadata_version, context.options, block_data_offset),
+          loader(batch, context.metadata_version, context.options, block_data_offset,
+                 block_data_length),
           columns(schema->num_fields()),
           cache(file, file->io_context(), io::CacheOptions::LazyDefaults()),
           length(batch->length()) {}
@@ -1778,14 +1833,15 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
     return dictionary_load_finished_.Then([message_fut] { return message_fut; })
         .Then([this, index](const std::shared_ptr<Message>& message_obj)
                   -> Future<std::shared_ptr<RecordBatch>> {
-          FileBlock block = GetRecordBatchBlock(index);
+          ARROW_ASSIGN_OR_RAISE(auto block, GetRecordBatchBlock(index));
           ARROW_ASSIGN_OR_RAISE(auto message, GetFlatbufMessage(message_obj));
           ARROW_ASSIGN_OR_RAISE(auto batch, GetBatchFromMessage(message));
           ARROW_ASSIGN_OR_RAISE(auto context, GetIpcReadContext(message, batch));

           auto read_context = std::make_shared<CachedRecordBatchReadContext>(
               schema_, batch, std::move(context), file_, owned_file_,
-              block.offset + static_cast<int64_t>(block.metadata_length));
+              block.offset + static_cast<int64_t>(block.metadata_length),
+              block.body_length);
           RETURN_NOT_OK(read_context->CalculateLoadRequest());
           return read_context->ReadAsync().Then(
               [read_context] { return read_context->CreateRecordBatch(); });
@@ -1904,25 +1960,31 @@
 Future<std::shared_ptr<RecordBatch>> WholeIpcFileRecordBatchGenerator::operator()() {
   auto state = state_;
   if (!read_dictionaries_.is_valid()) {
-    std::vector<Future<std::shared_ptr<Message>>> messages(state->num_dictionaries());
-    for (int i = 0; i < state->num_dictionaries(); i++) {
-      auto block = FileBlockFromFlatbuffer(state->footer_->dictionaries()->Get(i));
-      messages[i] = ReadBlock(block);
-    }
-    auto read_messages = All(std::move(messages));
-    if (executor_) read_messages = executor_->Transfer(read_messages);
-    read_dictionaries_ = read_messages.Then(
-        [=](const std::vector<Result<std::shared_ptr<Message>>>& maybe_messages)
            -> Status {
-          ARROW_ASSIGN_OR_RAISE(auto messages,
-                                arrow::internal::UnwrapOrRaise(maybe_messages));
-          return ReadDictionaries(state.get(), std::move(messages));
-        });
+    if (state->dictionary_load_finished_.is_valid()) {
+      // PreBufferMetadata has started reading dictionaries in the background
+      read_dictionaries_ = state->dictionary_load_finished_;
+    } else {
+      // Start reading dictionaries
+      std::vector<Future<std::shared_ptr<Message>>> messages(state->num_dictionaries());
+      for (int i = 0; i < state->num_dictionaries(); i++) {
+        ARROW_ASSIGN_OR_RAISE(auto block, state->GetDictionaryBlock(i));
+        messages[i] = ReadBlock(block);
+      }
+      auto read_messages = All(std::move(messages));
+      if (executor_) read_messages = executor_->Transfer(read_messages);
+      read_dictionaries_ = read_messages.Then(
+          [=](const std::vector<Result<std::shared_ptr<Message>>>& maybe_messages)
              -> Status {
+            ARROW_ASSIGN_OR_RAISE(auto messages,
+                                  arrow::internal::UnwrapOrRaise(maybe_messages));
+            return state->ReadDictionaries(messages);
+          });
+    }
   }
   if (index_ >= state_->num_record_batches()) {
     return Future<std::shared_ptr<RecordBatch>>::MakeFinished(
         IterationTraits<std::shared_ptr<RecordBatch>>::End());
   }
-  auto block = FileBlockFromFlatbuffer(state->footer_->recordBatches()->Get(index_++));
+  ARROW_ASSIGN_OR_RAISE(auto block, state->GetRecordBatchBlock(index_++));
   auto read_message = ReadBlock(block);
   auto read_messages = read_dictionaries_.Then([read_message]() { return read_message; });
   // Force transfer.
 This may be wasteful in some cases, but ensures we get off the
@@ -1958,16 +2020,6 @@
   }
 }

-Status WholeIpcFileRecordBatchGenerator::ReadDictionaries(
-    RecordBatchFileReaderImpl* state,
-    std::vector<std::shared_ptr<Message>> dictionary_messages) {
-  IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_);
-  for (const auto& message : dictionary_messages) {
-    RETURN_NOT_OK(state->ReadOneDictionary(message.get(), context));
-  }
-  return Status::OK();
-}
-
 Result<std::shared_ptr<RecordBatch>> WholeIpcFileRecordBatchGenerator::ReadRecordBatch(
     RecordBatchFileReaderImpl* state, Message* message) {
   CHECK_HAS_BODY(*message);
@@ -2619,6 +2671,14 @@ Status ValidateFuzzBatch(const RecordBatch& batch) {
   return st;
 }

+Status ValidateFuzzBatch(const RecordBatchWithMetadata& batch) {
+  if (batch.batch) {
+    RETURN_NOT_OK(ValidateFuzzBatch(*batch.batch));
+  }
+  // XXX do something with custom metadata?
+  return Status::OK();
+}
+
 IpcReadOptions FuzzingOptions() {
   IpcReadOptions options;
   options.memory_pool = ::arrow::internal::fuzzing_memory_pool();
@@ -2637,12 +2697,12 @@ Status FuzzIpcStream(const uint8_t* data, int64_t size) {
   Status st;

   while (true) {
-    std::shared_ptr<RecordBatch> batch;
-    RETURN_NOT_OK(batch_reader->ReadNext(&batch));
-    if (batch == nullptr) {
+    ARROW_ASSIGN_OR_RAISE(auto batch, batch_reader->ReadNext());
+    if (!batch.batch && !batch.custom_metadata) {
+      // EOS
       break;
     }
-    st &= ValidateFuzzBatch(*batch);
+    st &= ValidateFuzzBatch(batch);
   }

   return st;
@@ -2650,20 +2710,36 @@
 Status FuzzIpcFile(const uint8_t* data, int64_t size) {
   auto buffer = std::make_shared<Buffer>(data, size);
-  io::BufferReader buffer_reader(buffer);
-  std::shared_ptr<RecordBatchFileReader> batch_reader;
-  ARROW_ASSIGN_OR_RAISE(batch_reader,
-                        RecordBatchFileReader::Open(&buffer_reader, FuzzingOptions()));
-  Status st;
+  Status final_status;

-  const int n_batches = batch_reader->num_record_batches();
-  for (int i = 0; i < n_batches; ++i) {
-    ARROW_ASSIGN_OR_RAISE(auto batch, batch_reader->ReadRecordBatch(i));
-    st &= ValidateFuzzBatch(*batch);
+  auto do_read = [&](bool pre_buffer) {
+    io::BufferReader buffer_reader(buffer);
+    ARROW_ASSIGN_OR_RAISE(auto batch_reader,
+                          RecordBatchFileReader::Open(&buffer_reader, FuzzingOptions()));
+    if (pre_buffer) {
+      // Pre-buffer all record batches
+      RETURN_NOT_OK(batch_reader->PreBufferMetadata(/*indices=*/{}));
+    }
+
+    const int n_batches = batch_reader->num_record_batches();
+    for (int i = 0; i < n_batches; ++i) {
+      RecordBatchWithMetadata batch;
+      auto st = batch_reader->ReadRecordBatchWithCustomMetadata(i).Value(&batch);
+      final_status &= st;
+      if (!st.ok()) {
+        continue;
+      }
+      final_status &= ValidateFuzzBatch(batch);
+    }
+    return Status::OK();
+  };
+
+  for (const bool pre_buffer : {false, true}) {
+    final_status &= do_read(pre_buffer);
   }
-  return st;
+  return final_status;
 }

 Status FuzzIpcTensorStream(const uint8_t* data, int64_t size) {
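The test_common.cc changes below make the view builders spill values across several variadic buffers; a minimal sketch of the mechanism, assuming the BinaryViewBuilder::SetBlockSize API that the patch itself uses:

    #include <iostream>
    #include <string>
    #include "arrow/array/builder_binary.h"
    #include "arrow/result.h"
    #include "arrow/status.h"

    // Values longer than 12 bytes cannot be stored inline in a view slot; with a
    // small block size they quickly overflow one data buffer into the next.
    arrow::Status BuildWithManyDataBuffers() {
      arrow::StringViewBuilder builder;
      builder.SetBlockSize(512);
      for (int i = 0; i < 1000; ++i) {
        ARROW_RETURN_NOT_OK(
            builder.Append(std::string(32, static_cast<char>('a' + i % 26))));
      }
      ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
      std::cout << array->data()->buffers.size() << " buffers\n";
      return arrow::Status::OK();
    }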
diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc
index 02e6b816c0b..ceca6d9e434 100644
--- a/cpp/src/arrow/ipc/test_common.cc
+++ b/cpp/src/arrow/ipc/test_common.cc
@@ -16,6 +16,7 @@
 // under the License.

 #include <string>
+#include <type_traits>
 #include <utility>
 #include <vector>
@@ -368,19 +369,27 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* pool,
   return builder.Finish(out);
 }

-template <typename BuilderType>
-static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls,
-                                              MemoryPool* pool,
-                                              std::shared_ptr<Array>* out) {
-  BuilderType builder(pool);
+template <typename BuilderType>
+static Result<std::shared_ptr<Array>> MakeBinaryArrayWithUniqueValues(
+    BuilderType builder, int64_t length, bool include_nulls) {
+  if constexpr (std::is_base_of_v<BinaryViewBuilder, BuilderType>) {
+    // Try to emit several variadic buffers by choosing a small block size.
+    builder.SetBlockSize(512);
+  }
   for (int64_t i = 0; i < length; ++i) {
     if (include_nulls && (i % 7 == 0)) {
       RETURN_NOT_OK(builder.AppendNull());
     } else {
-      RETURN_NOT_OK(builder.Append(std::to_string(i)));
+      // Make sure that some strings are long enough to have non-inline binary views
+      const auto base = std::to_string(i);
+      std::string value;
+      for (int64_t j = 0; j < 3 * (i % 10); ++j) {
+        value += base;
+      }
+      RETURN_NOT_OK(builder.Append(value));
     }
   }
-  return builder.Finish(out);
+  return builder.Finish();
 }

 Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out, bool with_nulls,
@@ -390,22 +399,22 @@
   ArrayVector arrays;
   FieldVector fields;

-  auto AppendColumn = [&](auto& MakeArray) {
-    arrays.emplace_back();
-    RETURN_NOT_OK(MakeArray(length, with_nulls, default_memory_pool(), &arrays.back()));
-
-    const auto& type = arrays.back()->type();
-    fields.push_back(field(type->ToString(), type));
+  auto AppendColumn = [&](auto builder) {
+    ARROW_ASSIGN_OR_RAISE(auto array, MakeBinaryArrayWithUniqueValues(
+                                          std::move(builder), length, with_nulls));
+    arrays.push_back(array);
+    fields.push_back(field(array->type()->ToString(), array->type()));
     return Status::OK();
   };

-  RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues<StringBuilder>));
-  RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues<BinaryBuilder>));
-  RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues<LargeStringBuilder>));
-  RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues<LargeBinaryBuilder>));
+  auto pool = default_memory_pool();
+  RETURN_NOT_OK(AppendColumn(StringBuilder(pool)));
+  RETURN_NOT_OK(AppendColumn(BinaryBuilder(pool)));
+  RETURN_NOT_OK(AppendColumn(LargeStringBuilder(pool)));
+  RETURN_NOT_OK(AppendColumn(LargeBinaryBuilder(pool)));
   if (with_view_types) {
-    RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues<StringViewBuilder>));
-    RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues<BinaryViewBuilder>));
+    RETURN_NOT_OK(AppendColumn(StringViewBuilder(pool)));
+    RETURN_NOT_OK(AppendColumn(BinaryViewBuilder(pool)));
   }

   *out = RecordBatch::Make(schema(std::move(fields)), length, std::move(arrays));
diff --git a/cpp/src/arrow/memory_pool_test.cc b/cpp/src/arrow/memory_pool_test.cc
index 20006ebeb49..0af1ed2d9ec 100644
--- a/cpp/src/arrow/memory_pool_test.cc
+++ b/cpp/src/arrow/memory_pool_test.cc
@@ -242,10 +242,10 @@ TEST(Jemalloc, GetAllocationStats) {

   // Check allocated stats change due to allocation
   ASSERT_NEAR(allocated - allocated0, 70000, 50000);
-  ASSERT_NEAR(active - active0, 100000, 90000);
-  ASSERT_NEAR(metadata - metadata0, 500, 460);
-  ASSERT_NEAR(resident - resident0, 120000, 110000);
-  ASSERT_NEAR(mapped - mapped0, 100000, 90000);
+  ASSERT_GE(active - active0, allocated - allocated0);
+  ASSERT_GT(metadata, metadata0);
+  ASSERT_GE(resident - resident0, allocated - allocated0);
+  ASSERT_GE(mapped - mapped0, allocated - allocated0);
   ASSERT_NEAR(retained - retained0, 0, 40000);
   ASSERT_NEAR(thread_peak_read - thread_peak_read0,
1024, 700); diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 1162b4c3bb0..12e0f553b74 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -266,10 +266,13 @@ Result> RecordBatch::FromStructArray( namespace { Status ValidateColumnLength(const RecordBatch& batch, int i) { - const auto& array = *batch.column(i); - if (ARROW_PREDICT_FALSE(array.length() != batch.num_rows())) { + // This function is part of the validation code path and should + // be robust against invalid data, but `column()` would call MakeArray() + // that can abort on invalid data. + const auto& array = *batch.column_data(i); + if (ARROW_PREDICT_FALSE(array.length != batch.num_rows())) { return Status::Invalid("Number of rows in column ", i, - " did not match batch: ", array.length(), " vs ", + " did not match batch: ", array.length, " vs ", batch.num_rows()); } return Status::OK(); @@ -455,11 +458,12 @@ namespace { Status ValidateBatch(const RecordBatch& batch, bool full_validation) { for (int i = 0; i < batch.num_columns(); ++i) { RETURN_NOT_OK(ValidateColumnLength(batch, i)); - const auto& array = *batch.column(i); + // See ValidateColumnLength about avoiding a ArrayData -> Array conversion + const auto& array = *batch.column_data(i); const auto& schema_type = batch.schema()->field(i)->type(); - if (!array.type()->Equals(schema_type)) { + if (!array.type->Equals(schema_type)) { return Status::Invalid("Column ", i, - " type not match schema: ", array.type()->ToString(), " vs ", + " type not match schema: ", array.type->ToString(), " vs ", schema_type->ToString()); } const auto st = full_validation ? internal::ValidateArrayFull(array) diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 4516b808a84..a037d7261ef 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -318,7 +318,6 @@ TEST_F(TestRecordBatch, Validate) { auto a3 = gen.ArrayOf(int16(), 5); auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); - ASSERT_OK(b1->ValidateFull()); // Length mismatch @@ -328,6 +327,21 @@ TEST_F(TestRecordBatch, Validate) { // Type mismatch auto b3 = RecordBatch::Make(schema, length, {a0, a1, a0}); ASSERT_RAISES(Invalid, b3->ValidateFull()); + + // Invalid column data (nulls in map key array) that would abort on MakeArray + auto map_field = field("f", map(utf8(), int32())); + schema = ::arrow::schema({map_field}); + auto map_key_data = ArrayFromJSON(utf8(), "[null]")->data(); + auto map_item_data = ArrayFromJSON(int32(), "[null]")->data(); + auto map_data = ArrayData::Make(map_field->type(), /*length=*/1, /*buffers=*/{nullptr}, + /*child_data=*/{map_key_data, map_item_data}); + + auto b4 = RecordBatch::Make(schema, /*num_rows=*/map_data->length, {map_data}); + ASSERT_RAISES(Invalid, b4->ValidateFull()); + + // Length mismatch with a column data that would also fail on MakeArray + auto b5 = RecordBatch::Make(schema, /*num_rows=*/1 + map_data->length, {map_data}); + ASSERT_RAISES(Invalid, b5->Validate()); } TEST_F(TestRecordBatch, Slice) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index f68d2dcb619..e3582056ead 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -2575,6 +2575,16 @@ constexpr bool may_have_validity_bitmap(Type::type id) { } } +constexpr bool has_variadic_buffers(Type::type id) { + switch (id) { + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + return true; + default: + return false; + } +} + ARROW_DEPRECATED("Deprecated in 17.0.0. 
Use may_have_validity_bitmap() instead.")
 constexpr bool HasValidityBitmap(Type::type id) { return may_have_validity_bitmap(id); }
diff --git a/cpp/src/arrow/util/int_util_overflow.h b/cpp/src/arrow/util/int_util_overflow.h
index 93066fecafa..69714a935a4 100644
--- a/cpp/src/arrow/util/int_util_overflow.h
+++ b/cpp/src/arrow/util/int_util_overflow.h
@@ -18,7 +18,9 @@
 #pragma once

 #include <cstdint>
+#include <initializer_list>
 #include <limits>
+#include <optional>
 #include <type_traits>

 #include "arrow/status.h"
@@ -162,6 +164,37 @@ NON_GENERIC_OPS_WITH_OVERFLOW(DivideWithOverflow)
 #undef NON_GENERIC_OPS_WITH_OVERFLOW
 #undef NON_GENERIC_OP_WITH_OVERFLOW

+// Convenience functions over an arbitrary number of arguments
+template <typename Int>
+std::optional<Int> AddWithOverflow(std::initializer_list<Int> vs) {
+  if (vs.size() == 0) {
+    return {};
+  }
+  auto it = vs.begin();
+  Int v = *it++;
+  while (it != vs.end()) {
+    if (ARROW_PREDICT_FALSE(AddWithOverflowGeneric(v, *it++, &v))) {
+      return {};
+    }
+  }
+  return v;
+}
+
+template <typename Int>
+std::optional<Int> MultiplyWithOverflow(std::initializer_list<Int> vs) {
+  if (vs.size() == 0) {
+    return {};
+  }
+  auto it = vs.begin();
+  Int v = *it++;
+  while (it != vs.end()) {
+    if (ARROW_PREDICT_FALSE(MultiplyWithOverflowGeneric(v, *it++, &v))) {
+      return {};
+    }
+  }
+  return v;
+}
+
 // Define function NegateWithOverflow with the signature `bool(T u, T* out)`
 // where T is a signed integer type. On overflow, these functions return true.
 // Otherwise, false is returned and `out` is updated with the result of the
diff --git a/cpp/src/arrow/util/int_util_test.cc b/cpp/src/arrow/util/int_util_test.cc
index 7217c1097e4..cffa4e9d15e 100644
--- a/cpp/src/arrow/util/int_util_test.cc
+++ b/cpp/src/arrow/util/int_util_test.cc
@@ -649,5 +649,23 @@ TYPED_TEST(TestAddWithOverflow, Basics) {
   this->CheckOk(almost_min, almost_max + T{2}, T{1});
 }

+TEST(AddWithOverflow, Variadic) {
+  ASSERT_EQ(AddWithOverflow<int8_t>({}), std::nullopt);
+  ASSERT_EQ(AddWithOverflow<int8_t>({1, 2, 3}), 6);
+  ASSERT_EQ(AddWithOverflow<int8_t>({1, 2, 125}), std::nullopt);
+  ASSERT_EQ(AddWithOverflow<int8_t>({125, 2, 1}), std::nullopt);
+  ASSERT_EQ(AddWithOverflow<int32_t>({1, 2, 125}), 128);
+  ASSERT_EQ(AddWithOverflow<int32_t>({125, 2, 1}), 128);
+}
+
+TEST(MultiplyWithOverflow, Variadic) {
+  ASSERT_EQ(MultiplyWithOverflow<int8_t>({}), std::nullopt);
+  ASSERT_EQ(MultiplyWithOverflow<int8_t>({1, 2, 3, 4}), 24);
+  ASSERT_EQ(MultiplyWithOverflow<int8_t>({2, 2, 32}), std::nullopt);
+  ASSERT_EQ(MultiplyWithOverflow<int8_t>({32, 4, 1}), std::nullopt);
+  ASSERT_EQ(MultiplyWithOverflow<int32_t>({2, 2, 32}), 128);
+  ASSERT_EQ(MultiplyWithOverflow<int32_t>({32, 4, 1}), 128);
+}
+
 }  // namespace internal
 }  // namespace arrow
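Note that the element type of the list-based helpers cannot be deduced from an empty braced list, hence the explicit template arguments in the tests above. A standalone sketch with the same fold-with-early-exit shape, using the GCC/Clang builtin rather than the Arrow helper:

    #include <cstdint>
    #include <initializer_list>
    #include <iostream>
    #include <optional>

    template <typename Int>
    std::optional<Int> add_all(std::initializer_list<Int> vs) {
      std::optional<Int> acc;  // stays empty for an empty list, as in the patch
      for (Int v : vs) {
        if (!acc) { acc = v; continue; }
        Int next;
        if (__builtin_add_overflow(*acc, v, &next)) return std::nullopt;
        acc = next;
      }
      return acc;
    }

    int main() {
      std::cout << add_all<int8_t>({1, 2, 125}).has_value() << "\n";  // 0: exceeds 127
      std::cout << *add_all<int32_t>({1, 2, 125}) << "\n";            // 128
    }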
diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc
index 1a8e8066d70..0cc71f276df 100644
--- a/cpp/src/arrow/util/value_parsing.cc
+++ b/cpp/src/arrow/util/value_parsing.cc
@@ -35,7 +35,10 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, float* out)
       ::arrow_vendored::fast_float::chars_format::general, decimal_point};
   const auto res =
       ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, *out, options);
-  return res.ec == std::errc() && res.ptr == s + length;
+  const bool is_valid_number =
+      res.ec == std::errc() || res.ec == std::errc::result_out_of_range;
+  const bool consumed_entire_string = res.ptr == s + length;
+  return is_valid_number && consumed_entire_string;
 }

 bool StringToFloat(const char* s, size_t length, char decimal_point, double* out) {
@@ -43,7 +46,10 @@
       ::arrow_vendored::fast_float::chars_format::general, decimal_point};
   const auto res =
       ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, *out, options);
-  return res.ec == std::errc() && res.ptr == s + length;
+  const bool is_valid_number =
+      res.ec == std::errc() || res.ec == std::errc::result_out_of_range;
+  const bool consumed_entire_string = res.ptr == s + length;
+  return is_valid_number && consumed_entire_string;
 }

 // Half float
@@ -53,7 +59,10 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, Float16* out) {
   float temp_out;
   const auto res =
       ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, temp_out, options);
-  const bool ok = res.ec == std::errc() && res.ptr == s + length;
+  const bool is_valid_number =
+      res.ec == std::errc() || res.ec == std::errc::result_out_of_range;
+  const bool consumed_entire_string = res.ptr == s + length;
+  const bool ok = is_valid_number && consumed_entire_string;
   if (ok) {
     *out = Float16::FromFloat(temp_out);
   }
diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc
index a67f1d97f17..3525a010b63 100644
--- a/cpp/src/arrow/util/value_parsing_test.cc
+++ b/cpp/src/arrow/util/value_parsing_test.cc
@@ -141,6 +141,10 @@ TEST(StringConversion, ToFloat) {
   AssertConversion("0", 0.0f);
   AssertConversion("-0.0", -0.0f);
   AssertConversion("-1e20", -1e20f);
+  AssertConversion("4e38", std::numeric_limits<float>::infinity());
+  AssertConversion("-4e38", -std::numeric_limits<float>::infinity());
+  AssertConversion("1e-46", 0.0f);
+  AssertConversion("-1e-46", -0.0f);
   AssertConversion("+Infinity", std::numeric_limits<float>::infinity());
   AssertConversion("-Infinity", -std::numeric_limits<float>::infinity());
   AssertConversion("Infinity", std::numeric_limits<float>::infinity());
@@ -166,6 +170,10 @@ TEST(StringConversion, ToDouble) {
   AssertConversion("0", 0);
   AssertConversion("-0.0", -0.0);
   AssertConversion("-1e100", -1e100);
+  AssertConversion("2e308", std::numeric_limits<double>::infinity());
+  AssertConversion("-2e308", -std::numeric_limits<double>::infinity());
+  AssertConversion("1e-325", 0.0);
+  AssertConversion("-1e-325", -0.0);
   AssertConversion("+Infinity", std::numeric_limits<double>::infinity());
   AssertConversion("-Infinity", -std::numeric_limits<double>::infinity());
   AssertConversion("Infinity", std::numeric_limits<double>::infinity());
@@ -185,6 +193,10 @@ TEST(StringConversion, ToHalfFloat) {
   AssertConversion("0", Float16(0.0f));
   AssertConversion("-0.0", Float16(-0.0f));
   AssertConversion("-1e15", Float16(-1e15));
+  AssertConversion("7e4", Float16::FromBits(0x7c00));
+  AssertConversion("-7e4", Float16::FromBits(0xfc00));
+  AssertConversion("1e-9", Float16(0.0f));
+  AssertConversion("-1e-9", Float16(-0.0f));
   AssertConversion("+Infinity", Float16::FromBits(0x7c00));
   AssertConversion("-Infinity", Float16::FromBits(0xfc00));
   AssertConversion("Infinity", Float16::FromBits(0x7c00));
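The behavior tested above mirrors C++17 std::from_chars, which also reports result_out_of_range while consuming the whole token; a sketch of the same acceptance logic (note that std::from_chars leaves the output untouched in that case, whereas the vendored fast_float saturates to infinity or zero):

    #include <charconv>
    #include <system_error>

    bool parse_float(const char* s, const char* end, float* out) {
      auto res = std::from_chars(s, end, *out);
      const bool is_valid_number =
          res.ec == std::errc() || res.ec == std::errc::result_out_of_range;
      return is_valid_number && res.ptr == end;  // must consume the entire string
    }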
diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt
index e5760243b39..31a86d5da9d 100644
--- a/cpp/src/gandiva/CMakeLists.txt
+++ b/cpp/src/gandiva/CMakeLists.txt
@@ -274,6 +274,7 @@ add_gandiva_test(internals-test
                  hash_utils_test.cc
                  gdv_function_stubs_test.cc
                  interval_holder_test.cc
+                 target_datalayout_test.cc
                  tests/test_util.cc
                  EXTRA_LINK_LIBS
                  re2::re2
diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc
index 64ed433a686..a718a800605 100644
--- a/cpp/src/gandiva/engine.cc
+++ b/cpp/src/gandiva/engine.cc
@@ -330,6 +330,7 @@ Engine::Engine(const std::shared_ptr<Configuration>& conf,
   // LLVM 10 doesn't like the expr function name to be the same as the module name
   auto module_id = "gdv_module_" + std::to_string(reinterpret_cast<uintptr_t>(this));
   module_ = std::make_unique<llvm::Module>(module_id, *context_);
+  module_->setDataLayout(target_machine_->createDataLayout());
 }

 Engine::~Engine() {}
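Setting the module's data layout makes LLVM agree with the host ABI about type alignment, which matters because Arrow only guarantees 8-byte alignment for decimal128 storage. A portable C++ illustration of the alignment-safe access pattern that the IR changes below encode (load_decimal128 is an illustrative name, not Gandiva API):

    #include <cstdint>
    #include <cstring>

    struct Decimal128Value {
      uint64_t low_bits;
      int64_t high_bits;
    };

    // memcpy is the portable unaligned access: dereferencing the storage as a
    // 16-byte integer may be compiled to an aligned load and crash, which is
    // what CreateAlignedLoad(..., MaybeAlign(8)) avoids in the generated IR.
    Decimal128Value load_decimal128(const uint8_t* p) {
      Decimal128Value v;
      std::memcpy(&v, p, sizeof(v));
      return v;
    }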
diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc
index 4e6480fa167..0f0918b3a1c 100644
--- a/cpp/src/gandiva/llvm_generator.cc
+++ b/cpp/src/gandiva/llvm_generator.cc
@@ -399,8 +399,13 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count,
   if (output_type_id == arrow::Type::BOOL) {
     SetPackedBitValue(output_ref, loop_var, output_value->data());
-  } else if (arrow::is_primitive(output_type_id) ||
-             output_type_id == arrow::Type::DECIMAL) {
+  } else if (output_type_id == arrow::Type::DECIMAL) {
+    // Arrow decimal128 data is only 8-byte aligned, not 16-byte aligned.
+    // Use CreateAlignedStore with 8-byte alignment to match Arrow's actual alignment.
+    auto slot_offset =
+        builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var);
+    builder->CreateAlignedStore(output_value->data(), slot_offset, llvm::MaybeAlign(8));
+  } else if (arrow::is_primitive(output_type_id)) {
     auto slot_offset =
         builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var);
     builder->CreateStore(output_value->data(), slot_offset);
@@ -602,7 +607,12 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) {
     case arrow::Type::DECIMAL: {
       auto slot_offset = builder->CreateGEP(types->i128_type(), slot_ref, slot_index);
-      slot_value = builder->CreateLoad(types->i128_type(), slot_offset, dex.FieldName());
+      // Arrow decimal128 data is only 8-byte aligned, not 16-byte aligned.
+      // Using CreateLoad with default alignment (16 for i128) causes crashes on
+      // misaligned data. Use CreateAlignedLoad with 8-byte alignment to match Arrow's
+      // actual alignment.
+      slot_value = builder->CreateAlignedLoad(
+          types->i128_type(), slot_offset, llvm::MaybeAlign(8), false, dex.FieldName());
       lvalue = generator_->BuildDecimalLValue(slot_value, dex.FieldType());
       break;
     }
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index 0b31c769c99..3e786d1b112 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -841,7 +841,12 @@ const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len,
     *out_len = 0;
     return "";
   }
-  *out_len = repeat_number * in_len;
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::MultiplyWithOverflow(repeat_number, in_len, out_len))) {
+    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
+    *out_len = 0;
+    return "";
+  }
   char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
   if (ret == nullptr) {
     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index aaa25db0a9f..c418f9077a7 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -387,6 +387,13 @@ TEST(TestStringOps, TestRepeat) {
   EXPECT_EQ(std::string(out_str, out_len), "");
   EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Repeat number can't be negative"));
   ctx.Reset();
+
+  out_str = repeat_utf8_int32(ctx_ptr, "aa", 2,
+                              std::numeric_limits<int32_t>::max() / 2 + 1, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_THAT(ctx.get_error(),
+              ::testing::HasSubstr("Would overflow maximum output size"));
+  ctx.Reset();
 }

 TEST(TestStringOps, TestCastBoolToVarchar) {
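A worked instance of the overflow that the new repeat_utf8_int32 check rejects, using the GCC/Clang builtin that Arrow's MultiplyWithOverflow wraps on those compilers:

    #include <cstdint>
    #include <iostream>
    #include <limits>

    int main() {
      // Matches the new test: 2 * (INT32_MAX / 2 + 1) = 2^31 does not fit in
      // int32, so the unchecked multiply used to wrap to a negative length.
      int32_t in_len = 2;
      int32_t repeat_number = std::numeric_limits<int32_t>::max() / 2 + 1;
      int32_t out_len;
      if (__builtin_mul_overflow(repeat_number, in_len, &out_len)) {
        std::cout << "Would overflow maximum output size\n";
      }
      return 0;
    }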
diff --git a/cpp/src/gandiva/target_datalayout_test.cc b/cpp/src/gandiva/target_datalayout_test.cc
new file mode 100644
index 00000000000..0b32c6caf96
--- /dev/null
+++ b/cpp/src/gandiva/target_datalayout_test.cc
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <string>
+
+#include "gandiva/llvm_generator.h"
+#include "gandiva/tests/test_util.h"
+
+namespace gandiva {
+
+// Test that verifies the target data layout string representation
+// is populated.
+TEST(TestTargetDataLayout, VerifyDataLayoutForArchitecture) {
+  ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(TestConfiguration(), false));
+
+  llvm::Module* module = generator->module();
+  ASSERT_NE(module, nullptr);
+
+  const llvm::DataLayout& data_layout = module->getDataLayout();
+  std::string data_layout_str = data_layout.getStringRepresentation();
+
+  ASSERT_FALSE(data_layout_str.empty());
+}
+}  // namespace gandiva
diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt
index 68138f50d81..356b976e005 100644
--- a/cpp/src/gandiva/tests/CMakeLists.txt
+++ b/cpp/src/gandiva/tests/CMakeLists.txt
@@ -20,6 +20,7 @@ add_gandiva_test(projector-test
                  binary_test.cc
                  boolean_expr_test.cc
                  date_time_test.cc
+                 decimal_alignment_test.cc
                  decimal_single_test.cc
                  decimal_test.cc
                  filter_project_test.cc
diff --git a/cpp/src/gandiva/tests/decimal_alignment_test.cc b/cpp/src/gandiva/tests/decimal_alignment_test.cc
new file mode 100644
index 00000000000..3028ec81f31
--- /dev/null
+++ b/cpp/src/gandiva/tests/decimal_alignment_test.cc
@@ -0,0 +1,252 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Test for decimal128 alignment issue fix.
+// Arrow decimal128 data may be 8-byte aligned but not 16-byte aligned.
+// This test verifies that Gandiva handles such data correctly.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array/array_decimal.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/decimal.h"
+
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+using arrow::Decimal128;
+
+namespace gandiva {
+
+class TestDecimalAlignment : public ::testing::Test {
+ public:
+  void SetUp() { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+// Create a decimal128 array with data at a specific alignment offset
+// This simulates the real-world scenario where Arrow data from external sources
+// (like JNI/Java) may not be 16-byte aligned.
+std::shared_ptr<arrow::Array> MakeMisalignedDecimalArray(
+    const std::shared_ptr<arrow::DataType>& type,
+    const std::vector<Decimal128>& values, int alignment_offset) {
+  // Allocate buffer with extra space for misalignment
+  int64_t data_size = values.size() * 16;  // 16 bytes per Decimal128
+  int64_t buffer_size = data_size + 16;    // Extra space for offset
+
+  std::shared_ptr<arrow::Buffer> buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer(buffer_size).Value(&buffer));
+
+  // Calculate the starting offset to achieve desired alignment
+  // We want the data to be 8-byte aligned but NOT 16-byte aligned
+  uint8_t* raw_data = buffer->mutable_data();
+  uintptr_t addr = reinterpret_cast<uintptr_t>(raw_data);
+
+  // Find offset to get to 8-byte aligned but not 16-byte aligned address
+  int offset_to_8 = (8 - (addr % 8)) % 8;
+  int current_16_alignment = (addr + offset_to_8) % 16;
+
+  int final_offset;
+  if (alignment_offset == 8) {
+    // Want 8-byte aligned but NOT 16-byte aligned
+    if (current_16_alignment == 0) {
+      final_offset = offset_to_8 + 8;  // Add 8 to break 16-byte alignment
+    } else {
+      final_offset = offset_to_8;
+    }
+  } else {
+    // Want 16-byte aligned
+    final_offset = (16 - (addr % 16)) % 16;
+  }
+
+  // Copy decimal values to the offset location
+  uint8_t* data_start = raw_data + final_offset;
+  for (size_t i = 0; i < values.size(); i++) {
+    memcpy(data_start + i * 16, values[i].ToBytes().data(), 16);
+  }
+
+  // Verify alignment
+  uintptr_t data_addr = reinterpret_cast<uintptr_t>(data_start);
+  EXPECT_EQ(data_addr % 8, 0) << "Data should be 8-byte aligned";
+  if (alignment_offset == 8) {
+    EXPECT_NE(data_addr % 16, 0) << "Data should NOT be 16-byte aligned";
+  }
+
+  // Create a sliced buffer starting at our offset
+  auto sliced_buffer = arrow::SliceBuffer(buffer, final_offset, data_size);
+
+  // Create validity buffer (all valid)
+  std::shared_ptr<arrow::Buffer> validity_buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer((values.size() + 7) / 8).Value(&validity_buffer));
+  memset(validity_buffer->mutable_data(), 0xFF, validity_buffer->size());
+
+  // Create the array with our misaligned data buffer
+  auto array_data = arrow::ArrayData::Make(type, static_cast<int64_t>(values.size()),
+                                           {validity_buffer, sliced_buffer});
+
+  return std::make_shared<arrow::Decimal128Array>(array_data);
+}
+
+// Test that decimal operations work correctly with 8-byte aligned (but not 16-byte
+// aligned) data
+TEST_F(TestDecimalAlignment, TestMisalignedDecimalSubtract) {
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 17;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_a = arrow::field("a", decimal_type);
+  auto field_b = arrow::field("b", decimal_type);
+  auto schema = arrow::schema({field_a, field_b});
+
+  Decimal128TypePtr output_type;
+  auto status = DecimalTypeUtil::GetResultType(
+      DecimalTypeUtil::kOpSubtract, {decimal_type, decimal_type}, &output_type);
+  ASSERT_OK(status);
+
+  auto res = arrow::field("res", output_type);
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto subtract =
+      TreeExprBuilder::MakeFunction("subtract", {node_a, node_b}, output_type);
+  auto expr = TreeExprBuilder::MakeExpression(subtract, res);
+
+  std::shared_ptr<Projector> projector;
+  status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create test data
+  std::vector<Decimal128> values_a = {Decimal128(100), Decimal128(200), Decimal128(300)};
+  std::vector<Decimal128> values_b = {Decimal128(10), Decimal128(20), Decimal128(30)};
+
+  // Create arrays with 8-byte alignment (but NOT 16-byte aligned)
+  auto array_a =
      MakeMisalignedDecimalArray(decimal_type, values_a, 8);
+  auto array_b = MakeMisalignedDecimalArray(decimal_type, values_b, 8);
+
+  auto in_batch = arrow::RecordBatch::Make(schema, 3, {array_a, array_b});
+
+  // This should NOT crash even with misaligned data
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Verify results: 100-10=90, 200-20=180, 300-30=270
+  auto result = std::dynamic_pointer_cast<arrow::Decimal128Array>(outputs[0]);
+  ASSERT_NE(result, nullptr);
+  EXPECT_EQ(result->length(), 3);
+}
+
+// Create a misaligned output buffer for decimal128
+std::shared_ptr<arrow::ArrayData> MakeMisalignedDecimalOutput(
+    const std::shared_ptr<arrow::DataType>& type, int64_t num_records,
+    int alignment_offset) {
+  // Allocate data buffer with extra space for misalignment
+  int64_t data_size = num_records * 16;  // 16 bytes per Decimal128
+  int64_t buffer_size = data_size + 16;  // Extra space for offset
+
+  std::shared_ptr<arrow::Buffer> buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer(buffer_size).Value(&buffer));
+
+  uint8_t* raw_data = const_cast<uint8_t*>(buffer->data());
+  uintptr_t addr = reinterpret_cast<uintptr_t>(raw_data);
+
+  // Find offset to get to 8-byte aligned but not 16-byte aligned address
+  int offset_to_8 = (8 - (addr % 8)) % 8;
+  int current_16_alignment = (addr + offset_to_8) % 16;
+
+  int final_offset;
+  if (alignment_offset == 8) {
+    if (current_16_alignment == 0) {
+      final_offset = offset_to_8 + 8;
+    } else {
+      final_offset = offset_to_8;
+    }
+  } else {
+    final_offset = (16 - (addr % 16)) % 16;
+  }
+
+  // Verify alignment
+  uintptr_t data_addr = reinterpret_cast<uintptr_t>(raw_data + final_offset);
+  EXPECT_EQ(data_addr % 8, 0) << "Data should be 8-byte aligned";
+  if (alignment_offset == 8) {
+    EXPECT_NE(data_addr % 16, 0) << "Data should NOT be 16-byte aligned";
+  }
+
+  auto sliced_buffer = arrow::SliceBuffer(buffer, final_offset, data_size);
+
+  // Create validity buffer
+  int64_t bitmap_size = (num_records + 7) / 8;
+  std::shared_ptr<arrow::Buffer> validity_buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer(bitmap_size).Value(&validity_buffer));
+  memset(const_cast<uint8_t*>(validity_buffer->data()), 0xFF, validity_buffer->size());
+
+  return arrow::ArrayData::Make(type, num_records, {validity_buffer, sliced_buffer});
+}
+
+// Test that decimal STORES work correctly with 8-byte aligned (but not 16-byte aligned)
+// output
+TEST_F(TestDecimalAlignment, TestMisalignedDecimalStore) {
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 17;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_a = arrow::field("a", decimal_type);
+  auto field_b = arrow::field("b", decimal_type);
+  auto schema = arrow::schema({field_a, field_b});
+
+  Decimal128TypePtr output_type;
+  auto status = DecimalTypeUtil::GetResultType(
+      DecimalTypeUtil::kOpSubtract, {decimal_type, decimal_type}, &output_type);
+  ASSERT_OK(status);
+
+  auto res = arrow::field("res", output_type);
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto subtract =
+      TreeExprBuilder::MakeFunction("subtract", {node_a, node_b}, output_type);
+  auto expr = TreeExprBuilder::MakeExpression(subtract, res);
+
+  std::shared_ptr<Projector> projector;
+  status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create ALIGNED input arrays (using standard Arrow allocation)
+  auto array_a = MakeArrowArrayDecimal(
+      decimal_type, {Decimal128(100), Decimal128(200), Decimal128(300)},
+      {true, true, true});
+  auto array_b = MakeArrowArrayDecimal(
+      decimal_type, {Decimal128(10),
                     Decimal128(20), Decimal128(30)}, {true, true, true});
+
+  auto in_batch = arrow::RecordBatch::Make(schema, 3, {array_a, array_b});
+
+  // Create MISALIGNED output buffer (8-byte aligned but NOT 16-byte aligned)
+  auto output_data = MakeMisalignedDecimalOutput(output_type, 3, 8);
+
+  // This should NOT crash even with misaligned output buffer
+  status = projector->Evaluate(*in_batch, {output_data});
+  ASSERT_OK(status);
+
+  // Verify the output was written correctly
+  auto result = std::make_shared<arrow::Decimal128Array>(output_data);
+  EXPECT_EQ(result->length(), 3);
+}
+
+}  // namespace gandiva
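The two predicates these tests set up can be sanity-checked with plain modular arithmetic; a compact standalone version (illustrative only):

    #include <cassert>
    #include <cstdint>

    // 8-byte aligned but deliberately not 16-byte aligned: the layout that used
    // to crash Gandiva's generated 16-byte loads and stores.
    bool misaligned_for_i128(const void* p) {
      auto addr = reinterpret_cast<uintptr_t>(p);
      return addr % 8 == 0 && addr % 16 != 0;
    }

    int main() {
      alignas(16) static uint8_t storage[32];
      assert(!misaligned_for_i128(storage));     // 16-byte aligned
      assert(misaligned_for_i128(storage + 8));  // 8 mod 16
      return 0;
    }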
diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index b246feaf732..af7ccfd7ad7 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -55,6 +55,10 @@ using arrow::internal::AddWithOverflow;

 namespace parquet {

+using ::arrow::Future;
+using ::arrow::Result;
+using ::arrow::Status;
+
 namespace {
 bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) {
   // Check the encoding_stats to see if all data pages are dictionary encoded.
@@ -398,7 +402,7 @@ class SerializedFile : public ParquetFileReader::Contents {
     PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges));
   }

-  ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
+  Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
      const std::vector<int>& row_groups, const std::vector<int>& column_indices,
      int64_t hole_size_limit, int64_t range_size_limit) {
    std::vector<::arrow::io::ReadRange> ranges;
@@ -413,10 +417,10 @@
                                  range_size_limit);
  }

-  ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
-                                 const std::vector<int>& column_indices) const {
+  Future<> WhenBuffered(const std::vector<int>& row_groups,
+                        const std::vector<int>& column_indices) const {
    if (!cached_source_) {
-      return ::arrow::Status::Invalid("Must call PreBuffer before WhenBuffered");
+      return Status::Invalid("Must call PreBuffer before WhenBuffered");
    }
    std::vector<::arrow::io::ReadRange> ranges;
    for (int row : row_groups) {
@@ -465,23 +469,8 @@
      // Fall through
    }

-    const uint32_t read_metadata_len = ParseUnencryptedFileMetadata(
-        metadata_buffer, metadata_len, std::move(file_decryptor));
-    auto file_decryption_properties = properties_.file_decryption_properties();
-    if (is_encrypted_footer) {
-      // Nothing else to do here.
-      return;
-    } else if (!file_metadata_->is_encryption_algorithm_set()) {  // Non encrypted file.
-      if (file_decryption_properties != nullptr) {
-        if (!file_decryption_properties->plaintext_files_allowed()) {
-          throw ParquetException("Applying decryption properties on plaintext file");
-        }
-      }
-    } else {
-      // Encrypted file with plaintext footer mode.
-      ParseMetaDataOfEncryptedFileWithPlaintextFooter(
-          file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
-    }
+    ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, is_encrypted_footer,
+                       std::move(file_decryptor));
  }

  // Validate the source size and get the initial read size.
@@ -522,16 +511,15 @@
  }

  // Does not throw.
-  ::arrow::Future<> ParseMetaDataAsync() {
+  Future<> ParseMetaDataAsync() {
    int64_t footer_read_size;
    BEGIN_PARQUET_CATCH_EXCEPTIONS
    footer_read_size = GetFooterReadSize();
    END_PARQUET_CATCH_EXCEPTIONS
    // Assumes this is kept alive externally
    return source_->ReadAsync(source_size_ - footer_read_size, footer_read_size)
-        .Then([this,
-               footer_read_size](const std::shared_ptr<::arrow::Buffer>& footer_buffer)
-                  -> ::arrow::Future<> {
+        .Then([this, footer_read_size](
+                  const std::shared_ptr<::arrow::Buffer>& footer_buffer) -> Future<> {
          uint32_t metadata_len;
          BEGIN_PARQUET_CATCH_EXCEPTIONS
          metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
@@ -557,7 +545,7 @@
  }

  // Continuation
-  ::arrow::Future<> ParseMaybeEncryptedMetaDataAsync(
+  Future<> ParseMaybeEncryptedMetaDataAsync(
      std::shared_ptr<::arrow::Buffer> footer_buffer,
      std::shared_ptr<::arrow::Buffer> metadata_buffer, int64_t footer_read_size,
      uint32_t metadata_len) {
@@ -580,26 +568,30 @@
               file_decryptor = std::move(file_decryptor)](
                  const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
            // Continue and read the file footer
-            return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer,
-                                      file_decryptor);
+            BEGIN_PARQUET_CATCH_EXCEPTIONS
+            ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer,
+                               file_decryptor);
+            END_PARQUET_CATCH_EXCEPTIONS
+            return Status::OK();
          });
    }
-    return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len,
-                              is_encrypted_footer, std::move(file_decryptor));
+    BEGIN_PARQUET_CATCH_EXCEPTIONS
+    ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, is_encrypted_footer,
+                       std::move(file_decryptor));
+    END_PARQUET_CATCH_EXCEPTIONS
+    return Status::OK();
  }

  // Continuation
-  ::arrow::Status ParseMetaDataFinal(
-      std::shared_ptr<::arrow::Buffer> metadata_buffer, uint32_t metadata_len,
-      const bool is_encrypted_footer,
-      std::shared_ptr<InternalFileDecryptor> file_decryptor) {
-    BEGIN_PARQUET_CATCH_EXCEPTIONS
+  void ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer,
+                          uint32_t metadata_len, const bool is_encrypted_footer,
+                          std::shared_ptr<InternalFileDecryptor> file_decryptor) {
    const uint32_t read_metadata_len = ParseUnencryptedFileMetadata(
        metadata_buffer, metadata_len, std::move(file_decryptor));
    auto file_decryption_properties = properties_.file_decryption_properties();
    if (is_encrypted_footer) {
      // Nothing else to do here.
-      return ::arrow::Status::OK();
+      return;
    } else if (!file_metadata_->is_encryption_algorithm_set()) {  // Non encrypted file.
      if (file_decryption_properties != nullptr) {
        if (!file_decryption_properties->plaintext_files_allowed()) {
@@ -611,8 +603,6 @@
      ParseMetaDataOfEncryptedFileWithPlaintextFooter(
          file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
    }
-    END_PARQUET_CATCH_EXCEPTIONS
-    return ::arrow::Status::OK();
  }

 private:
@@ -707,20 +697,16 @@
 void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter(
  auto file_decryptor = std::make_shared<InternalFileDecryptor>(
      file_decryption_properties, file_aad, algo.algorithm,
      file_metadata_->footer_signing_key_metadata(), properties_.memory_pool());
-  // set the InternalFileDecryptor in the metadata as well, as it's used
-  // for signature verification and for ColumnChunkMetaData creation.
-  file_metadata_->set_file_decryptor(std::move(file_decryptor));
+  // Set the InternalFileDecryptor in the metadata as well, as it's used
+  // for ColumnChunkMetaData creation.
+  file_metadata_->set_file_decryptor(file_decryptor);

   if (file_decryption_properties->check_plaintext_footer_integrity()) {
-    if (metadata_len - read_metadata_len !=
-        (parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) {
-      throw ParquetInvalidOrCorruptedFileException(
-          "Failed reading metadata for encryption signature (requested ",
-          parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength,
-          " bytes but have ", metadata_len - read_metadata_len, " bytes)");
-    }
-
-    if (!file_metadata_->VerifySignature(metadata_buffer->data() + read_metadata_len)) {
+    auto serialized_metadata =
+        metadata_buffer->span_as<const uint8_t>().subspan(0, read_metadata_len);
+    auto signature = metadata_buffer->span_as<const uint8_t>().subspan(read_metadata_len);
+    if (!FileMetaData::VerifySignature(serialized_metadata, signature,
+                                       file_decryptor.get())) {
       throw ParquetInvalidOrCorruptedFileException(
           "Parquet crypto signature verification failed");
     }
@@ -804,7 +790,7 @@
   return result;
 }

-::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>
+Future<std::unique_ptr<ParquetFileReader::Contents>>
 ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
                                        const ReaderProperties& props,
                                        std::shared_ptr<FileMetaData> metadata) {
@@ -815,7 +801,7 @@
   if (metadata == nullptr) {
     // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
     struct {
-      ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
+      Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
         return std::move(result);
       }
@@ -825,7 +811,7 @@
     return file->ParseMetaDataAsync().Then(std::move(Continuation));
   } else {
     file->set_metadata(std::move(metadata));
-    return ::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
+    return Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
         std::move(result));
   }
   END_PARQUET_CATCH_EXCEPTIONS
@@ -855,24 +841,24 @@
   return Open(std::move(source), props, std::move(metadata));
 }

-::arrow::Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
+Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
     std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
     std::shared_ptr<FileMetaData> metadata) {
   BEGIN_PARQUET_CATCH_EXCEPTIONS
   auto fut = SerializedFile::OpenAsync(std::move(source), props, std::move(metadata));
   // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
-  auto completed = ::arrow::Future<std::unique_ptr<ParquetFileReader>>::Make();
-  fut.AddCallback([fut, completed](
-                      const ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>>&
-                          contents) mutable {
-    if (!contents.ok()) {
-      completed.MarkFinished(contents.status());
-      return;
-    }
-    std::unique_ptr<ParquetFileReader> result = std::make_unique<ParquetFileReader>();
-    result->Open(fut.MoveResult().MoveValueUnsafe());
-    completed.MarkFinished(std::move(result));
-  });
+  auto completed = Future<std::unique_ptr<ParquetFileReader>>::Make();
+  fut.AddCallback(
+      [fut, completed](
+          const Result<std::unique_ptr<ParquetFileReader::Contents>>& contents) mutable {
+        if (!contents.ok()) {
+          completed.MarkFinished(contents.status());
+          return;
+        }
+        std::unique_ptr<ParquetFileReader> result = std::make_unique<ParquetFileReader>();
+        result->Open(fut.MoveResult().MoveValueUnsafe());
+        completed.MarkFinished(std::move(result));
+      });
   return completed;
   END_PARQUET_CATCH_EXCEPTIONS
 }
@@ -919,7 +905,7 @@
   file->PreBuffer(row_groups, column_indices, ctx, options);
 }

-::arrow::Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadRanges(
+Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadRanges(
     const std::vector<int>& row_groups, const std::vector<int>& column_indices, int64_t
     hole_size_limit, int64_t range_size_limit) {
   // Access private methods here
@@ -929,8 +915,8 @@ ::arrow::Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadRanges(
                                range_size_limit);
 }

-::arrow::Future<> ParquetFileReader::WhenBuffered(
-    const std::vector<int>& row_groups, const std::vector<int>& column_indices) const {
+Future<> ParquetFileReader::WhenBuffered(const std::vector<int>& row_groups,
+                                         const std::vector<int>& column_indices) const {
   // Access private methods here
   SerializedFile* file = ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 42dd8e52ee9..03a8a4c4604 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -1169,6 +1169,42 @@ void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
   return impl_->WriteTo(dst, encryptor);
 }

+bool FileMetaData::VerifySignature(std::span<const uint8_t> serialized_metadata,
+                                   std::span<const uint8_t> signature,
+                                   InternalFileDecryptor* file_decryptor) {
+  DCHECK_NE(file_decryptor, nullptr);
+
+  // In plaintext footer, the "signature" is the concatenation of the nonce used
+  // for GCM encryption, and the authentication tag obtained after GCM encryption.
+  if (signature.size() != encryption::kGcmTagLength + encryption::kNonceLength) {
+    throw ParquetInvalidOrCorruptedFileException(
+        "Invalid footer encryption signature (expected ",
+        encryption::kGcmTagLength + encryption::kNonceLength, " bytes, got ",
+        signature.size(), ")");
+  }
+
+  // Encrypt plaintext serialized metadata so as to compute its signature
+  auto nonce = signature.subspan(0, encryption::kNonceLength);
+  auto tag = signature.subspan(encryption::kNonceLength);
+  const SecureString& key = file_decryptor->GetFooterKey();
+  const std::string& aad = encryption::CreateFooterAad(file_decryptor->file_aad());
+
+  auto aes_encryptor = encryption::AesEncryptor::Make(
+      file_decryptor->algorithm(), static_cast<int32_t>(key.size()), /*metadata=*/true,
+      /*write_length=*/false);
+
+  std::shared_ptr<ResizableBuffer> encrypted_buffer =
+      AllocateBuffer(file_decryptor->pool(),
+                     aes_encryptor->CiphertextLength(serialized_metadata.size()));
+  int32_t encrypted_len = aes_encryptor->SignedFooterEncrypt(
+      serialized_metadata, key.as_span(), str2span(aad), nonce,
+      encrypted_buffer->mutable_span_as<uint8_t>());
+  DCHECK_EQ(encrypted_len, encrypted_buffer->size());
+  // Check computed signature against expected
+  return 0 == memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength,
+                     tag.data(), encryption::kGcmTagLength);
+}
+
 class FileCryptoMetaData::FileCryptoMetaDataImpl {
  public:
  FileCryptoMetaDataImpl() = default;
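VerifySignature relies on a fixed layout: the plaintext footer is followed by a 12-byte nonce and a 16-byte GCM tag (the kNonceLength and kGcmTagLength constants referenced above). A small arithmetic sketch of how the two spans are carved out of the metadata buffer:

    #include <cassert>
    #include <cstddef>

    constexpr size_t kNonceLength = 12;   // AES-GCM nonce, as in parquet::encryption
    constexpr size_t kGcmTagLength = 16;  // AES-GCM authentication tag

    int main() {
      size_t metadata_len = 1024;  // whole footer payload read from the file
      size_t read_metadata_len = metadata_len - (kNonceLength + kGcmTagLength);
      size_t nonce_offset = read_metadata_len;  // signature = nonce || tag
      size_t tag_offset = nonce_offset + kNonceLength;
      assert(tag_offset + kGcmTagLength == metadata_len);
      return 0;
    }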
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index 3380adbf56a..1235aae9ad7 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -21,6 +21,7 @@
 #include <map>
 #include <memory>
 #include <optional>
+#include <span>
 #include <string>
 #include <vector>
@@ -331,8 +332,8 @@ class PARQUET_EXPORT FileMetaData {
   EncryptionAlgorithm encryption_algorithm() const;
   const std::string& footer_signing_key_metadata() const;

-  /// \brief Verify signature of FileMetaData when file is encrypted but footer
-  /// is not encrypted (plaintext footer).
+  PARQUET_DEPRECATED(
+      "Deprecated in 24.0.0. If you need this functionality, please report an issue.")
   bool VerifySignature(const void* signature);

   void WriteTo(::arrow::io::OutputStream* dst,
@@ -392,6 +393,11 @@
   void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
   const std::shared_ptr<InternalFileDecryptor>& file_decryptor() const;

+  // Verify the signature of a plaintext footer.
+  static bool VerifySignature(std::span<const uint8_t> serialized_metadata,
+                              std::span<const uint8_t> signature,
+                              InternalFileDecryptor* file_decryptor);
+
   // PIMPL Idiom
   FileMetaData();
   class FileMetaDataImpl;
diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json
index 41c40fcc85f..3d636798234 100644
--- a/cpp/vcpkg.json
+++ b/cpp/vcpkg.json
@@ -1,6 +1,6 @@
 {
   "name": "arrow",
-  "version-string": "23.0.0-SNAPSHOT",
+  "version-string": "23.0.1",
   "dependencies": [
     "abseil",
     {
diff --git a/dev/archery/archery/ci/cli.py b/dev/archery/archery/ci/cli.py
index bf7b68d5327..5597dff733e 100644
--- a/dev/archery/archery/ci/cli.py
+++ b/dev/archery/archery/ci/cli.py
@@ -73,6 +73,22 @@ def report_chat(obj, workflow_id, send, repository, ignore, webhook,
         output.write(report_chat.render("workflow_report"))


+class WorkflowEmailReport(EmailReport):
+    def __init__(self, **kwargs):
+        super().__init__('workflow_report', **kwargs)
+
+    def date(self):
+        return self.report.datetime
+
+    def subject(self):
+        workflow = self.report
+        date = self.date().strftime('%Y-%m-%d')
+        return (
+            f'[{date}] Arrow Build Report for Job {workflow.name}: '
+            f'{len(workflow.failed_jobs())} failed'
+        )
+
+
 @ci.command()
 @click.argument('workflow_id', required=True)
 @click.option('--sender-name', '-n',
@@ -105,9 +121,10 @@ def report_email(obj, workflow_id, sender_name, sender_email, recipient_email,
     """
     output = obj['output']

-    email_report = EmailReport(
-        report=Workflow(workflow_id, repository,
-                        ignore_job=ignore, gh_token=obj['github_token']),
+    workflow = Workflow(workflow_id, repository,
+                        ignore_job=ignore, gh_token=obj['github_token'])
+    email_report = WorkflowEmailReport(
+        report=workflow,
         sender_name=sender_name,
         sender_email=sender_email,
         recipient_email=recipient_email
@@ -119,8 +136,7 @@ def report_email(obj, workflow_id, sender_name, sender_email, recipient_email,
             smtp_password=smtp_password,
             smtp_server=smtp_server,
             smtp_port=smtp_port,
-            recipient_email=recipient_email,
-            message=email_report.render("workflow_report")
+            report=email_report
         )
     else:
-        output.write(email_report.render("workflow_report"))
+        output.write(str(email_report.render()))
diff --git a/dev/archery/archery/crossbow/cli.py b/dev/archery/archery/crossbow/cli.py
index c73c4d1ff7e..10aa3dedf44 100644
--- a/dev/archery/archery/crossbow/cli.py
+++ b/dev/archery/archery/crossbow/cli.py
@@ -343,6 +343,22 @@ def latest_prefix(obj, prefix, fetch):
     click.echo(latest.branch)


+class NightlyEmailReport(EmailReport):
+    def __init__(self, **kwargs):
+        super().__init__('nightly_report', **kwargs)
+
+    def subject(self):
+        report = self.report
+        n_errors = len(report.tasks_by_state['error'])
+        n_failures = len(report.tasks_by_state['failure'])
+        n_pendings = len(report.tasks_by_state['pending'])
+        return (
+            f'[NIGHTLY] Arrow Build Report for Job {report.job.branch}: '
+            f'{n_errors + n_failures} failed, '
+            f'{n_pendings} pending'
+        )
+
+
 @crossbow.command()
 @click.argument('job-name', required=True)
 @click.option('--sender-name', '-n',
@@ -382,8 +398,9 @@ def report(obj, job_name, sender_name, sender_email, recipient_email,
     queue.fetch()
     job = queue.get(job_name)

-    email_report = EmailReport(
-        report=Report(job),
+    report = Report(job)
+    email_report = NightlyEmailReport(
+        report=report,
         sender_name=sender_name,
         sender_email=sender_email,
         recipient_email=recipient_email
@@ -401,11 +418,10 @@ def report(obj, job_name, sender_name, sender_email, recipient_email,
             smtp_password=smtp_password,
             smtp_server=smtp_server,
             smtp_port=smtp_port,
-            recipient_email=recipient_email,
message=email_report.render("nightly_report") + report=email_report ) else: - output.write(email_report.render("nightly_report")) + output.write(str(email_report.render())) @crossbow.command() @@ -601,6 +617,17 @@ def batch_gen(iterable, step): print(batch) +class TokenExpirationEmailReport(EmailReport): + def __init__(self, **kwargs): + super().__init__('token_expiration', **kwargs) + + def subject(self): + token_expiration_date = self.report.token_expiration_date + return ( + f'[CI] Arrow Crossbow Token Expiration in {token_expiration_date}' + ) + + @crossbow.command() @click.option('--days', default=30, help='Notification will be sent if expiration date is ' @@ -645,23 +672,18 @@ def __init__(self, token_expiration_date, days_left): self.token_expiration_date = token_expiration_date self.days_left = days_left - email_report = EmailReport( - report=TokenExpirationReport( - token_expiration_date or "ALREADY_EXPIRED", days_left), - sender_name=sender_name, - sender_email=sender_email, - recipient_email=recipient_email - ) + if not token_expiration_date: + token_expiration_date = 'ALREADY_EXPIRED' + report = TokenExpirationReport(token_expiration_date, days_left) + email_report = TokenExpirationEmailReport(report) - message = email_report.render("token_expiration").strip() if send: ReportUtils.send_email( smtp_user=smtp_user, smtp_password=smtp_password, smtp_server=smtp_server, smtp_port=smtp_port, - recipient_email=recipient_email, - message=message + report=email_report ) else: - output.write(message) + output.write(str(email_report.render())) diff --git a/dev/archery/archery/crossbow/reports.py b/dev/archery/archery/crossbow/reports.py index 32962410d6e..1c6510ea4f3 100644 --- a/dev/archery/archery/crossbow/reports.py +++ b/dev/archery/archery/crossbow/reports.py @@ -17,6 +17,10 @@ import collections import csv +import datetime +import email.headerregistry +import email.message +import email.utils import operator import fnmatch import functools @@ -246,7 +250,7 @@ def send_message(cls, webhook, message): @classmethod def send_email(cls, smtp_user, smtp_password, smtp_server, smtp_port, - recipient_email, message): + report): from smtplib import SMTP, SMTP_SSL if smtp_port == 465: @@ -259,7 +263,8 @@ def send_email(cls, smtp_user, smtp_password, smtp_server, smtp_port, else: smtp.starttls() smtp.login(smtp_user, smtp_password) - smtp.sendmail(smtp_user, recipient_email, message) + message = report.render() + smtp.send_message(message) @classmethod def write_csv(cls, report, add_headers=True): @@ -271,11 +276,6 @@ def write_csv(cls, report, add_headers=True): class EmailReport(JinjaReport): - templates = { - 'nightly_report': 'email_nightly_report.txt.j2', - 'token_expiration': 'email_token_expiration.txt.j2', - 'workflow_report': 'email_workflow_report.txt.j2', - } fields = [ 'report', 'sender_name', @@ -283,6 +283,35 @@ class EmailReport(JinjaReport): 'recipient_email', ] + def __init__(self, template_name, **kwargs): + self._template_name = template_name + super().__init__(**kwargs) + + @property + def templates(self): + return { + self._template_name: f'email_{self._template_name}.txt.j2', + } + + def date(self): + return None + + def render(self): + message = email.message.EmailMessage() + message.set_charset('utf-8') + message['Message-Id'] = email.utils.make_msgid() + date = self.date() + if isinstance(date, datetime.datetime): + message['Date'] = date + else: + message['Date'] = email.utils.formatdate(date) + message['From'] = email.headerregistry.Address( + self.sender_name, 
addr_spec=self.sender_email) + message['To'] = email.headerregistry.Address(addr_spec=self.recipient_email) + message['Subject'] = self.subject() + message.set_content(super().render(self._template_name)) + return message + class CommentReport(Report): diff --git a/dev/archery/archery/crossbow/tests/fixtures/email-report.txt b/dev/archery/archery/crossbow/tests/fixtures/nightly-email-report.txt similarity index 83% rename from dev/archery/archery/crossbow/tests/fixtures/email-report.txt rename to dev/archery/archery/crossbow/tests/fixtures/nightly-email-report.txt index c29cafd3938..5e7b8e9c67d 100644 --- a/dev/archery/archery/crossbow/tests/fixtures/email-report.txt +++ b/dev/archery/archery/crossbow/tests/fixtures/nightly-email-report.txt @@ -1,6 +1,11 @@ +MIME-Version: 1.0 +Message-Id: +Date: date From: Sender Reporter To: recipient@arrow.com Subject: [NIGHTLY] Arrow Build Report for Job ursabot-1: 2 failed, 1 pending +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 7bit Arrow Build Report for Job ursabot-1 diff --git a/dev/archery/archery/crossbow/tests/fixtures/token-expiration-email-report.txt b/dev/archery/archery/crossbow/tests/fixtures/token-expiration-email-report.txt new file mode 100644 index 00000000000..1f8ccbf30c6 --- /dev/null +++ b/dev/archery/archery/crossbow/tests/fixtures/token-expiration-email-report.txt @@ -0,0 +1,14 @@ +MIME-Version: 1.0 +Message-Id: +Date: date +From: Sender Reporter +To: recipient@arrow.com +Subject: [CI] Arrow Crossbow Token Expiration in 2026-01-17 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 7bit + +The Arrow Crossbow Token will expire in 7 days. + +Please generate a new Token. Send it to Apache INFRA to update the +CROSSBOW_GITHUB_TOKEN. Update it on the crossbow repository and in +the Azure pipelines. diff --git a/dev/archery/archery/crossbow/tests/test_reports.py b/dev/archery/archery/crossbow/tests/test_reports.py index 620b4c78bbc..02012d2f1be 100644 --- a/dev/archery/archery/crossbow/tests/test_reports.py +++ b/dev/archery/archery/crossbow/tests/test_reports.py @@ -15,11 +15,12 @@ # specific language governing permissions and limitations # under the License. 
+import re import textwrap +from archery.crossbow.cli import (NightlyEmailReport, TokenExpirationEmailReport) from archery.crossbow.core import yaml -from archery.crossbow.reports import (ChatReport, CommentReport, EmailReport, - Report) +from archery.crossbow.reports import (ChatReport, CommentReport, Report) def test_crossbow_comment_formatter(load_fixture): @@ -71,19 +72,55 @@ def test_crossbow_chat_report_extra_message_success(load_fixture): assert report_chat.render("text") == textwrap.dedent(expected_msg) -def test_crossbow_email_report(load_fixture): - expected_msg = load_fixture('email-report.txt') +def test_crossbow_nightly_email_report(load_fixture): + expected_msg = load_fixture('nightly-email-report.txt') job = load_fixture('crossbow-job.yaml', decoder=yaml.load) report = Report(job) assert report.tasks_by_state is not None - email_report = EmailReport(report=report, sender_name="Sender Reporter", - sender_email="sender@arrow.com", - recipient_email="recipient@arrow.com") + email_report = NightlyEmailReport( + report=report, + sender_name='Sender Reporter', + sender_email='sender@arrow.com', + recipient_email='recipient@arrow.com' + ) - assert ( - email_report.render("nightly_report") == textwrap.dedent(expected_msg) + actual = str(email_report.render()) + # Normalize dynamic headers + actual = re.sub(r'(?m)^Message-Id: <.+?>', + 'Message-Id: ', + actual) + actual = re.sub(r'(?m)^Date: [^\n]+ -0000$', + 'Date: date', + actual) + assert actual == textwrap.dedent(expected_msg) + + +def test_crossbow_token_expiration_email_report(load_fixture): + expected_msg = load_fixture('token-expiration-email-report.txt') + + class TokenExpirationReport: + def __init__(self, token_expiration_date, days_left): + self.token_expiration_date = token_expiration_date + self.days_left = days_left + + report = TokenExpirationReport('2026-01-17', 7) + email_report = TokenExpirationEmailReport( + report=report, + sender_name='Sender Reporter', + sender_email='sender@arrow.com', + recipient_email='recipient@arrow.com' ) + actual = str(email_report.render()) + # Normalize dynamic headers + actual = re.sub(r'(?m)^Message-Id: <.+?>', + 'Message-Id: ', + actual) + actual = re.sub(r'(?m)^Date: [^\n]+ -0000$', + 'Date: date', + actual) + assert actual == textwrap.dedent(expected_msg) + def test_crossbow_export_report(load_fixture): job = load_fixture('crossbow-job.yaml', decoder=yaml.load) diff --git a/dev/archery/archery/templates/email_nightly_report.txt.j2 b/dev/archery/archery/templates/email_nightly_report.txt.j2 index bc040734b03..7b43d7c867e 100644 --- a/dev/archery/archery/templates/email_nightly_report.txt.j2 +++ b/dev/archery/archery/templates/email_nightly_report.txt.j2 @@ -15,13 +15,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#} -{%- if True -%} -{%- endif -%} -From: {{ sender_name }} <{{ sender_email }}> -To: {{ recipient_email }} -Subject: [NIGHTLY] Arrow Build Report for Job {{report.job.branch}}: {{ (report.tasks_by_state["error"] | length) + (report.tasks_by_state["failure"] | length) }} failed, {{ report.tasks_by_state["pending"] | length }} pending - +-#} Arrow Build Report for Job {{ report.job.branch }} See https://s3.amazonaws.com/arrow-data/index.html for more information. 
@@ -58,4 +52,4 @@ Succeeded Tasks: - {{ task_name }} {{ report.task_url(task) }} {% endfor %} -{%- endif -%} \ No newline at end of file +{%- endif -%} diff --git a/dev/archery/archery/templates/email_token_expiration.txt.j2 b/dev/archery/archery/templates/email_token_expiration.txt.j2 index 54c2005e57e..340cb4a5353 100644 --- a/dev/archery/archery/templates/email_token_expiration.txt.j2 +++ b/dev/archery/archery/templates/email_token_expiration.txt.j2 @@ -15,12 +15,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#} -From: {{ sender_name }} <{{ sender_email }}> -To: {{ recipient_email }} -Subject: [CI] Arrow Crossbow Token Expiration in {{ report.token_expiration_date }} - +-#} The Arrow Crossbow Token will expire in {{ report.days_left }} days. -Please generate a new Token. Send it to Apache INFRA to update the CROSSBOW_GITHUB_TOKEN. -Update it on the crossbow repository and in the Azure pipelines. +Please generate a new Token. Send it to Apache INFRA to update the +CROSSBOW_GITHUB_TOKEN. Update it on the crossbow repository and in +the Azure pipelines. diff --git a/dev/archery/archery/templates/email_workflow_report.txt.j2 b/dev/archery/archery/templates/email_workflow_report.txt.j2 index 193856c1806..6668d6c67ee 100644 --- a/dev/archery/archery/templates/email_workflow_report.txt.j2 +++ b/dev/archery/archery/templates/email_workflow_report.txt.j2 @@ -15,13 +15,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#} -{%- if True -%} -{%- endif -%} -From: {{ sender_name }} <{{ sender_email }}> -To: {{ recipient_email }} -Subject: [{{ report.datetime.strftime('%Y-%m-%d') }}] Arrow Build Report for {{ report.name }}: {{ report.failed_jobs() | length }} failed - +-#} Arrow Build Report for {{ report.name }} Workflow URL: {{ report.url }} @@ -42,4 +36,4 @@ Succeeded Jobs: - {{ job.name }} {{ job.url }} {% endfor %} -{%- endif -%} \ No newline at end of file +{%- endif -%} diff --git a/dev/release/08-publish-gh-release.sh b/dev/release/07-publish-gh-release.sh similarity index 100% rename from dev/release/08-publish-gh-release.sh rename to dev/release/07-publish-gh-release.sh diff --git a/dev/release/07-binary-verify.sh b/dev/release/08-binary-verify.sh similarity index 100% rename from dev/release/07-binary-verify.sh rename to dev/release/08-binary-verify.sh diff --git a/dev/release/verify-apt.sh b/dev/release/verify-apt.sh index 73a0b156075..ea8be3da15e 100755 --- a/dev/release/verify-apt.sh +++ b/dev/release/verify-apt.sh @@ -162,11 +162,19 @@ if [ "${cmake_version_major}" -gt "3" ] || \ [ "${cmake_version_major}" -eq "3" -a "${cmake_version_minor}" -ge "25" ]; then cp -a "${TOP_SOURCE_DIR}/cpp/examples/minimal_build" build/ pushd build/minimal_build - cmake . - make -j$(nproc) - ./arrow-example - c++ -o arrow-example example.cc $(pkg-config --cflags --libs arrow) -std=c++20 - ./arrow-example + cmake -S . -B build_shared + make -C build_shared -j$(nproc) + build_shared/arrow-example + cmake -S . 
-B build_static -DARROW_LINK_SHARED=OFF + make -C build_static -j$(nproc) + build_static/arrow-example + mkdir -p build_pkg_config + c++ \ + example.cc \ + -o build_pkg_config/arrow-example \ + $(pkg-config --cflags --libs arrow) \ + -std=c++20 + build_pkg_config/arrow-example popd fi echo "::endgroup::" diff --git a/dev/release/verify-yum.sh b/dev/release/verify-yum.sh index d642f806295..684b2166934 100755 --- a/dev/release/verify-yum.sh +++ b/dev/release/verify-yum.sh @@ -44,7 +44,7 @@ repository_version="${distribution_version}" cmake_package=cmake cmake_command=cmake -devtoolset= +gcc_toolset= scl_package= have_arrow_libs=no have_flight=yes @@ -65,11 +65,17 @@ echo "::group::Prepare repository" case "${distribution}-${distribution_version}" in almalinux-8) distribution_prefix="almalinux" + gcc_toolset=14 have_arrow_libs=yes ruby_devel_packages+=(redhat-rpm-config) install_command="dnf install -y --enablerepo=powertools" info_command="dnf info --enablerepo=powertools" ;; + almalinux-9) + distribution_prefix="almalinux" + gcc_toolset=12 + ruby_devel_packages+=(redhat-rpm-config) + ;; almalinux-*) distribution_prefix="almalinux" ruby_devel_packages+=(redhat-rpm-config) @@ -169,11 +175,11 @@ ${install_command} \ git \ libarchive \ pkg-config -if [ -n "${devtoolset}" ]; then +if [ -n "${gcc_toolset}" ]; then ${install_command} \ - devtoolset-${devtoolset}-gcc-c++ \ - devtoolset-${devtoolset}-make - . /opt/rh/devtoolset-${devtoolset}/enable + gcc-toolset-${gcc_toolset} \ + make + . /opt/rh/gcc-toolset-${gcc_toolset}/enable else ${install_command} \ gcc-c++ \ @@ -191,13 +197,25 @@ if [ "${cmake_version_major}" -gt "3" ] || \ [ "${cmake_version_major}" -eq "3" -a "${cmake_version_minor}" -ge "25" ]; then cp -a "${TOP_SOURCE_DIR}/cpp/examples/minimal_build" build/ pushd build/minimal_build - ${cmake_command} . - make -j$(nproc) - ./arrow-example - c++ -o arrow-example example.cc $(pkg-config --cflags --libs arrow) -std=c++2a - ./arrow-example + cmake -S . -B build_shared + make -C build_shared -j$(nproc) + build_shared/arrow-example + cmake -S . -B build_static -DARROW_LINK_SHARED=OFF + make -C build_static -j$(nproc) + build_static/arrow-example + mkdir -p build_pkg_config + c++ \ + example.cc \ + -o build_pkg_config/arrow-example \ + $(pkg-config --cflags --libs arrow) \ + -std=c++2a + build_pkg_config/arrow-example popd fi +if [ -n "${gcc_toolset}" ]; then + dnf remove -y "gcc-toolset-${gcc_toolset}-*" + ${install_command} gcc-c++ +fi echo "::endgroup::" if [ "${have_glib}" = "yes" ]; then diff --git a/dev/tasks/docker-tests/github.cuda.yml b/dev/tasks/docker-tests/github.cuda.yml deleted file mode 100644 index e65ac457b2e..00000000000 --- a/dev/tasks/docker-tests/github.cuda.yml +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -jobs: - test: - name: | - Docker Test {{ flags|default("") }} {{ image }} {{ command|default("") }} - runs-on: ['self-hosted', 'cuda'] -{{ macros.github_set_env(env) }} - timeout-minutes: {{ timeout|default(60) }} - steps: - {{ macros.github_checkout_arrow(fetch_depth=fetch_depth|default(1))|indent }} - # python 3.10 is installed on the runner, no need to install - - name: Install pip - run: sudo apt update && sudo apt install python3-pip -y - - name: Install archery - run: python3 -m pip install -e arrow/dev/archery[docker] - - name: Execute Docker Build - shell: bash - env: - {{ macros.github_set_sccache_envvars()|indent(8) }} - run: | - source arrow/ci/scripts/util_enable_core_dumps.sh - archery docker run \ - -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \ - {{ flags|default("") }} \ - {{ image }} \ - {{ command|default("") }} - {% if arrow.is_default_branch() %} - {{ macros.github_login_dockerhub()|indent }} - - name: Push Docker Image - shell: bash - run: archery docker push {{ image }} - {% endif %} diff --git a/dev/tasks/homebrew-formulae/apache-arrow-glib.rb b/dev/tasks/homebrew-formulae/apache-arrow-glib.rb index 5993f696566..71737d86453 100644 --- a/dev/tasks/homebrew-formulae/apache-arrow-glib.rb +++ b/dev/tasks/homebrew-formulae/apache-arrow-glib.rb @@ -29,7 +29,7 @@ class ApacheArrowGlib < Formula desc "GLib bindings for Apache Arrow" homepage "https://arrow.apache.org/" - url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.0-SNAPSHOT/apache-arrow-23.0.0-SNAPSHOT.tar.gz" + url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.1/apache-arrow-23.0.1.tar.gz" sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" license "Apache-2.0" head "https://github.com/apache/arrow.git", branch: "main" diff --git a/dev/tasks/homebrew-formulae/apache-arrow.rb b/dev/tasks/homebrew-formulae/apache-arrow.rb index f93a56f7f23..e14c0473a6c 100644 --- a/dev/tasks/homebrew-formulae/apache-arrow.rb +++ b/dev/tasks/homebrew-formulae/apache-arrow.rb @@ -29,7 +29,7 @@ class ApacheArrow < Formula desc "Columnar in-memory analytics layer designed to accelerate big data" homepage "https://arrow.apache.org/" - url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.0-SNAPSHOT/apache-arrow-23.0.0-SNAPSHOT.tar.gz" + url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-23.0.1/apache-arrow-23.0.1.tar.gz" sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" license "Apache-2.0" head "https://github.com/apache/arrow.git", branch: "main" diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index f65042f2875..23155047455 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -1,3 +1,15 @@ +apache-arrow-apt-source (23.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 10 Feb 2026 10:45:01 -0000 + +apache-arrow-apt-source (23.0.0-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 13 Jan 2026 13:05:28 -0000 + apache-arrow-apt-source (22.0.0-1) unstable; urgency=low * New upstream release. 
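These changelog entries, and the RPM spec changelogs below, follow the standard Debian and RPM stanza formats. As a small illustration of the Debian stanza (the real entries are written by the release scripts, and the maintainer name and address below are placeholders), a Python sketch:

    # Sketch of the Debian changelog stanza format used above; illustrative only.
    from datetime import datetime, timezone
    from email.utils import format_datetime

    def changelog_stanza(package, version, maintainer, email, when):
        # Debian changelogs use an RFC 2822 date, e.g. "Tue, 10 Feb 2026 10:45:01 +0000".
        return (f"{package} ({version}-1) unstable; urgency=low\n\n"
                f"  * New upstream release.\n\n"
                f" -- {maintainer} <{email}>  {format_datetime(when)}\n")

    print(changelog_stanza("apache-arrow", "23.0.1", "Jane Doe", "jane@example.com",
                           datetime(2026, 2, 10, 10, 45, 1, tzinfo=timezone.utc)))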
diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index b5796afa5e4..50f67825367 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -85,6 +85,12 @@ else fi %changelog +* Tue Feb 10 2026 Raúl Cumplido - 23.0.1-1 +- New upstream release. + +* Tue Jan 13 2026 Raúl Cumplido - 23.0.0-1 +- New upstream release. + * Mon Oct 20 2025 Raúl Cumplido - 22.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/Rakefile b/dev/tasks/linux-packages/apache-arrow/Rakefile index 7644d2d23fb..cdb77108452 100644 --- a/dev/tasks/linux-packages/apache-arrow/Rakefile +++ b/dev/tasks/linux-packages/apache-arrow/Rakefile @@ -59,11 +59,15 @@ class ApacheArrowPackageTask < PackageTask end def download_rc_archive - base_url = "https://github.com/#{github_repository}" - base_url += "/releases/download/apache-arrow-#{@version}" archive_name_no_rc = @archive_name.gsub(/-rc\d+(\.tar\.gz)\z/, "\\1") - url = "#{base_url}/#{archive_name_no_rc}" - download(url, @archive_name) + sh("gh", + "release", + "download", + "apache-arrow-#{@version}", + "--clobber", + "--repo", github_repository, + "--pattern", archive_name_no_rc) + mv(archive_name_no_rc, @archive_name) end def download_released_archive diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index 3239216a63e..8fae632bbc9 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,15 @@ +apache-arrow (23.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 10 Feb 2026 10:45:01 -0000 + +apache-arrow (23.0.0-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 13 Jan 2026 13:05:28 -0000 + apache-arrow (22.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 8cc272c35ae..894b56d5244 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -877,6 +877,12 @@ Documentation for Apache Parquet GLib. %endif %changelog +* Tue Feb 10 2026 Raúl Cumplido - 23.0.1-1 +- New upstream release. + +* Tue Jan 13 2026 Raúl Cumplido - 23.0.0-1 +- New upstream release. + * Mon Oct 20 2025 Raúl Cumplido - 22.0.0-1 - New upstream release. 
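The Rakefile change above swaps a raw HTTPS download for the GitHub CLI, which, being authenticated, can also fetch assets from releases that have not been published yet. A rough Python equivalent of the new download_rc_archive, using the same gh flags shown in the diff (paths and repository names are illustrative):

    # Fetch the RC source archive from the GitHub release via the gh CLI
    # instead of a raw HTTPS download. Mirrors the Rakefile logic above.
    import re
    import shutil
    import subprocess

    def download_rc_archive(repository: str, version: str, archive_name: str) -> None:
        # The release asset is named without the -rcN suffix.
        archive_name_no_rc = re.sub(r"-rc\d+(\.tar\.gz)$", r"\1", archive_name)
        subprocess.run(
            ["gh", "release", "download", f"apache-arrow-{version}",
             "--clobber", "--repo", repository, "--pattern", archive_name_no_rc],
            check=True,
        )
        shutil.move(archive_name_no_rc, archive_name)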
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 266073daff6..97843d2ef0c 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -41,9 +41,6 @@ groups: {############################# Testing tasks #################################} - cuda: - - test-cuda-* - test: - test-* @@ -454,7 +451,7 @@ tasks: flags: -e CC=gcc-14 -e CXX=g++-14 -e RapidJSON_SOURCE=BUNDLED image: ubuntu-cpp -{% for debian_version in ["12"] %} +{% for debian_version in ["13"] %} test-debian-{{ debian_version }}-cpp-amd64: ci: github template: docker-tests/github.linux.yml @@ -592,23 +589,25 @@ tasks: UBUNTU: 22.04 image: ubuntu-python-313-freethreading - test-debian-12-python-3-amd64: +{% for debian_version in ["13"] %} + test-debian-{{ debian_version }}-python-3-amd64: ci: github template: docker-tests/github.linux.yml params: env: - DEBIAN: 12 + DEBIAN: "{{ debian_version }}" image: debian-python - test-debian-12-python-3-i386: + test-debian-{{ debian_version }}-python-3-i386: ci: github template: docker-tests/github.linux.yml params: env: ARCH: i386 - DEBIAN: 12 + DEBIAN: "{{ debian_version }}" flags: "-e ARROW_S3=OFF -e ARROW_GANDIVA=OFF" image: debian-python +{% endfor %} test-ubuntu-22.04-python-3: ci: github @@ -661,6 +660,15 @@ tasks: params: image: alpine-linux-r + test-r-fedora-clang: + ci: github + template: docker-tests/github.linux.yml + params: + image: fedora-r-clang + # R-devel built from source with Clang, simulating CRAN's + # r-devel-linux-x86_64-fedora-clang environment + timeout: 180 # 3 hours - R-devel build from source takes time + test-r-macos-as-cran: ci: github template: r/github.macos.cran.yml @@ -750,7 +758,7 @@ tasks: template: r/github.macos.m1san.yml # be sure to update binary-task.rb when upgrading Debian - test-debian-12-docs: + test-debian-13-docs: ci: github template: docs/github.linux.yml params: @@ -762,35 +770,12 @@ tasks: artifacts: - docs.tar.gz - ############################## CUDA tests ################################# - -{% for ubuntu, cuda in [("22.04", "11.7.1"), ("24.04", "13.0.2")] %} - test-cuda-cpp-ubuntu-{{ ubuntu }}-cuda-{{ cuda }}: - ci: github - template: docker-tests/github.cuda.yml - params: - env: - CUDA: {{ cuda }} - UBUNTU: {{ ubuntu }} - image: ubuntu-cuda-cpp - - test-cuda-python-ubuntu-{{ ubuntu }}-cuda-{{ cuda }}: - ci: github - template: docker-tests/github.cuda.yml - params: - env: - CUDA: {{ cuda }} - UBUNTU: {{ ubuntu }} - image: ubuntu-cuda-python -{% endfor %} - ############################## Fuzz tests ################################# test-build-cpp-fuzz: ci: github template: fuzz-tests/github.oss-fuzz.yml - ############################## vcpkg tests ################################## test-build-vcpkg-win: diff --git a/docs/source/_static/versions.json b/docs/source/_static/versions.json index 6feaa86e1a7..4a4d2c948c0 100644 --- a/docs/source/_static/versions.json +++ b/docs/source/_static/versions.json @@ -1,15 +1,20 @@ [ { - "name": "23.0 (dev)", + "name": "24.0 (dev)", "version": "dev/", "url": "https://arrow.apache.org/docs/dev/" }, { - "name": "22.0 (stable)", + "name": "23.0 (stable)", "version": "", "url": "https://arrow.apache.org/docs/", "preferred": true }, + { + "name": "22.0", + "version": "22.0/", + "url": "https://arrow.apache.org/docs/22.0/" + }, { "name": "21.0", "version": "21.0/", diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index c5efc5f30fc..0ec81c1e6c8 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -246,7 +246,8 @@ Build source 
and binaries and submit them archery crossbow status # Download the produced binaries - # This will download packages to a directory called packages/release--rc + # This will download packages generated from the archery tasks + # to a directory called packages/release--rc dev/release/04-binary-download.sh # Sign and upload the binaries @@ -263,11 +264,14 @@ Build source and binaries and submit them # NOTE: You need to have GitHub CLI installed to run this script. dev/release/06-matlab-upload.sh + # Move the Release Candidate GitHub Release from draft to published state + # This will update the artifacts download URL which will be available for the + # verification step. + dev/release/07-publish-gh-release.sh + # Start verifications for binaries and wheels - dev/release/07-binary-verify.sh + dev/release/08-binary-verify.sh - # Move the Release Candidate GitHub Release from draft to published state - dev/release/08-publish-gh-release.sh Verify the Release ------------------ diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index 9fa737f687a..7c9cd3b5017 100644 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -100,7 +100,7 @@ endfunction() set(CMAKE_CXX_STANDARD 20) -set(MLARROW_VERSION "23.0.0-SNAPSHOT") +set(MLARROW_VERSION "23.0.1") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" MLARROW_BASE_VERSION "${MLARROW_VERSION}") project(mlarrow VERSION "${MLARROW_BASE_VERSION}") diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b1c8e324942..bf71387bcd1 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -28,7 +28,7 @@ project(pyarrow) # which in turn meant that Py_GIL_DISABLED was not set. set(CMAKE_NO_SYSTEM_FROM_IMPORTED ON) -set(PYARROW_VERSION "23.0.0-SNAPSHOT") +set(PYARROW_VERSION "23.0.1") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" PYARROW_BASE_VERSION "${PYARROW_VERSION}") # Generate SO version and full SO version diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx index b7e7af260c2..f447129cf40 100644 --- a/python/pyarrow/_flight.pyx +++ b/python/pyarrow/_flight.pyx @@ -1666,7 +1666,7 @@ cdef class FlightClient(_Weakrefable): result = Result.__new__(Result) with nogil: check_flight_status(results.get().Next().Value(&result.result)) - if result.result == NULL: + if result.result == nullptr: break yield result return _do_action_response() @@ -1695,7 +1695,7 @@ cdef class FlightClient(_Weakrefable): result = FlightInfo.__new__(FlightInfo) with nogil: check_flight_status(listing.get().Next().Value(&result.info)) - if result.info == NULL: + if result.info == nullptr: break yield result diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 9136f252980..d8bdea76413 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -6316,8 +6316,8 @@ def concat_tables(tables, MemoryPool memory_pool=None, str promote_options="none "default" if promote_options == "none" else promote_options ) + options.unify_schemas = promote_options != "none" with nogil: - options.unify_schemas = promote_options != "none" c_result_table = GetResultValue( ConcatenateTables(c_tables, options, pool)) diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py index 5878d1f9026..a95826e1c00 100644 --- a/python/pyarrow/util.py +++ b/python/pyarrow/util.py @@ -231,8 +231,9 @@ def _break_traceback_cycle_from_frame(frame): def _download_urllib(url, out_path): - from urllib.request import urlopen - with urlopen(url) as response: + from urllib.request import urlopen, Request + req = Request(url, headers={'User-Agent': 'pyarrow'}) + 
with urlopen(req) as response: with open(out_path, 'wb') as f: f.write(response.read()) @@ -264,11 +265,13 @@ def download_tzdata_on_windows(): # Try to download the files with requests and then fall back to urllib. This # works around possible issues in certain older environment (GH-45295) try: - _download_requests(tzdata_url, tzdata_compressed_path) - _download_requests(windows_zones_url, windows_zones_path) + import requests # noqa: F401 + download_fn = _download_requests except ImportError: - _download_urllib(tzdata_url, tzdata_compressed_path) - _download_urllib(windows_zones_url, windows_zones_path) + download_fn = _download_urllib + + download_fn(tzdata_url, tzdata_compressed_path) + download_fn(windows_zones_url, windows_zones_path) assert os.path.exists(tzdata_compressed_path) assert os.path.exists(windows_zones_path) diff --git a/python/pyproject.toml b/python/pyproject.toml index 0a730fd4f78..45c52cc0c4d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -95,4 +95,4 @@ root = '..' version_file = 'pyarrow/_generated_version.py' version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' -fallback_version = '23.0.0a0' +fallback_version = '23.0.1' diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index ac6388762b4..769435f4dd8 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,3 +1,4 @@ +build cython>=3.1 numpy>=2.0.0 setuptools_scm diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 86ca441263e..a21d2daacd1 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: Integration to 'Apache' 'Arrow' -Version: 22.0.0.9000 +Version: 23.0.1 Authors@R: c( person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")), person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")), @@ -28,7 +28,7 @@ URL: https://github.com/apache/arrow/, https://arrow.apache.org/docs/r/ BugReports: https://github.com/apache/arrow/issues Encoding: UTF-8 Language: en-US -SystemRequirements: C++17; for AWS S3 support on Linux, libcurl and openssl (optional); +SystemRequirements: C++20; for AWS S3 support on Linux, libcurl and openssl (optional); cmake >= 3.26 (build-time only, and only for full source build) Biarch: true Imports: diff --git a/r/NEWS.md b/r/NEWS.md index e9f7a591ced..abfafffb2e2 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -17,7 +17,7 @@ under the License. --> -# arrow 22.0.0.9000 +# arrow 23.0.1 # arrow 22.0.0.1 ## Minor improvements and fixes diff --git a/r/README.md b/r/README.md index 1ab9206f119..bb5d137dc88 100644 --- a/r/README.md +++ b/r/README.md @@ -44,7 +44,7 @@ There are some special cases to note: - On Linux the installation process can sometimes be more involved because CRAN does not host binaries for Linux. For more information please see the [installation guide](https://arrow.apache.org/docs/r/articles/install.html). -- If you are compiling arrow from source, please note that as of version 10.0.0, arrow requires C++17 to build. This has implications on Windows and CentOS 7. For Windows users it means you need to be running an R version of 4.0 or later. On CentOS 7, it means you need to install a newer compiler than the default system compiler gcc. See the [installation details article](https://arrow.apache.org/docs/r/articles/developers/install_details.html) for guidance. 
+- If you are compiling arrow from source, please note that as of version 23.0.0, arrow requires C++20 to build. This has implications on Windows and CentOS 7. For Windows users it means you need to be running an R version of 4.3 or later (though R 4.2 has incomplete support and might work with special configuration). See the [installation details article](https://arrow.apache.org/docs/r/articles/developers/install_details.html) for guidance. - Development versions of arrow are released nightly. For information on how to install nightly builds please see the [installing nightly builds](https://arrow.apache.org/docs/r/articles/install_nightly.html) article. diff --git a/r/configure b/r/configure index f64a3673f97..9e92eb6b47f 100755 --- a/r/configure +++ b/r/configure @@ -86,10 +86,10 @@ if [ "$ARROW_R_DEV" = "true" ] && [ -f "data-raw/codegen.R" ]; then ${R_HOME}/bin/Rscript data-raw/codegen.R fi -# Arrow requires C++17, so check for it -if [ ! "`${R_HOME}/bin/R CMD config CXX17`" ]; then +# Arrow requires C++20, so check for it +if [ ! "`${R_HOME}/bin/R CMD config CXX20`" ]; then echo "------------------------- NOTE ---------------------------" - echo "Cannot install arrow: a C++17 compiler is required." + echo "Cannot install arrow: a C++20 compiler is required." echo "See https://arrow.apache.org/docs/r/articles/install.html" echo "---------------------------------------------------------" exit 1 @@ -260,14 +260,6 @@ set_pkg_vars () { if [ "$ARROW_R_CXXFLAGS" ]; then PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS" fi - - # We use expr because the product version returns more than just 10.13 and we want to - # match the substring. However, expr always outputs the number of matched characters - # to stdout, to avoid noise in the log we redirect the output to /dev/null - if [ "$UNAME" = "Darwin" ] && expr $(sw_vers -productVersion) : '10\.13' >/dev/null 2>&1; then - # avoid C++17 availability warnings on macOS < 11 - PKG_CFLAGS="$PKG_CFLAGS -D_LIBCPP_DISABLE_AVAILABILITY" - fi } # If we have pkg-config, it will tell us what libarrow needs @@ -408,11 +400,11 @@ else fi # Test that we can compile something with those flags -CXX17="`${R_HOME}/bin/R CMD config CXX17` -E" -CXX17FLAGS=`"${R_HOME}"/bin/R CMD config CXX17FLAGS` -CXX17STD=`"${R_HOME}"/bin/R CMD config CXX17STD` +CXX20="`${R_HOME}/bin/R CMD config CXX20` -E" +CXX20FLAGS=`"${R_HOME}"/bin/R CMD config CXX20FLAGS` +CXX20STD=`"${R_HOME}"/bin/R CMD config CXX20STD` CPPFLAGS=`"${R_HOME}"/bin/R CMD config CPPFLAGS` -TEST_CMD="${CXX17} ${CPPFLAGS} ${PKG_CFLAGS} ${CXX17FLAGS} ${CXX17STD} -xc++ -" +TEST_CMD="${CXX20} ${CPPFLAGS} ${PKG_CFLAGS} ${CXX20FLAGS} ${CXX20STD} -xc++ -" TEST_ERROR=$(echo "#include $PKG_TEST_HEADER" | ${TEST_CMD} -o /dev/null 2>&1) if [ $? -eq 0 ]; then diff --git a/r/configure.win b/r/configure.win index 433ef28439a..16c5ec1bee8 100755 --- a/r/configure.win +++ b/r/configure.win @@ -117,14 +117,6 @@ set_pkg_vars () { if [ "$ARROW_R_CXXFLAGS" ]; then PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS" fi - - # We use expr because the product version returns more than just 10.13 and we want to - # match the substring. 
However, expr always outputs the number of matched characters - # to stdout, to avoid noise in the log we redirect the output to /dev/null - if [ "$UNAME" = "Darwin" ] && expr $(sw_vers -productVersion) : '10\.13' >/dev/null 2>&1; then - # avoid C++17 availability warnings on macOS < 11 - PKG_CFLAGS="$PKG_CFLAGS -D_LIBCPP_DISABLE_AVAILABILITY" - fi } # If we have pkg-config, it will tell us what libarrow needs diff --git a/r/pkgdown/assets/versions.html b/r/pkgdown/assets/versions.html index c90d4ae2164..e9fdd50a347 100644 --- a/r/pkgdown/assets/versions.html +++ b/r/pkgdown/assets/versions.html @@ -1,7 +1,8 @@ -

-  22.0.0.9000 (dev)
-  22.0.0 (release)
+  23.0.1.9000 (dev)
+  23.0.1 (release)
+  22.0.0
   21.0.0
   20.0.0
   19.0.1
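Both R version-switcher files (the HTML fragment above and the versions.json below) follow the same rolling pattern: the dev and release entries are retitled for the new version and the previous stable is archived under its two-component series path. A sketch of that roll for the JSON file, illustrative only:

    # Roll the pkgdown version switcher: new dev/release entries, archive the
    # previous stable under "MAJOR.MINOR/". Not the actual release tooling.
    import json

    def roll_versions(entries, new_release, old_release):
        old_series = ".".join(old_release.split(".")[:2])  # "22.0.0" -> "22.0"
        rolled = [
            {"name": f"{new_release}.9000 (dev)", "version": "dev/"},
            {"name": f"{new_release} (release)", "version": ""},
            {"name": old_release, "version": f"{old_series}/"},
        ]
        return rolled + entries[2:]  # keep the already-archived versions

    entries = [
        {"name": "22.0.0.9000 (dev)", "version": "dev/"},
        {"name": "22.0.0 (release)", "version": ""},
        {"name": "21.0.0", "version": "21.0/"},
    ]
    print(json.dumps(roll_versions(entries, "23.0.1", "22.0.0"), indent=2))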
diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index 0d783995062..7d22213ef3b 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -1,12 +1,16 @@ [ { - "name": "22.0.0.9000 (dev)", + "name": "23.0.1.9000 (dev)", "version": "dev/" }, { - "name": "22.0.0 (release)", + "name": "23.0.1 (release)", "version": "" }, + { + "name": "22.0.0", + "version": "22.0/" + }, { "name": "21.0.0", "version": "21.0/" diff --git a/r/src/Makevars.in b/r/src/Makevars.in index af0826faacb..1b7ad08e1cb 100644 --- a/r/src/Makevars.in +++ b/r/src/Makevars.in @@ -25,7 +25,7 @@ PKG_CPPFLAGS=@cflags@ # https://bugs.llvm.org/show_bug.cgi?id=39191 # https://www.mail-archive.com/gcc-bugs@gcc.gnu.org/msg534862.html # PKG_CXXFLAGS=$(CXX_VISIBILITY) -CXX_STD=CXX17 +CXX_STD=CXX20 PKG_LIBS=@libs@ all: $(SHLIB) purify diff --git a/r/src/Makevars.ucrt b/r/src/Makevars.ucrt index a91dedc2d55..b72ed64d98e 100644 --- a/r/src/Makevars.ucrt +++ b/r/src/Makevars.ucrt @@ -19,4 +19,4 @@ CRT=-ucrt include Makevars.win # XXX for some reason, this variable doesn't seem propagated from Makevars.win -CXX_STD=CXX17 +CXX_STD=CXX20 diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 0777ca8bc72..c8aa903bf06 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -162,12 +162,13 @@ std::shared_ptr make_compute_options( // false means descending, true means ascending // cpp11 does not support bool here so use int auto orders = cpp11::as_cpp>(options["orders"]); - std::vector keys; + // Use resize + assignment to avoid vector growth operations that trigger + // false positive -Wmaybe-uninitialized warnings in GCC 14 with std::variant + std::vector keys(names.size(), Key("", Order::Ascending)); for (size_t i = 0; i < names.size(); i++) { - keys.push_back( - Key(names[i], (orders[i] > 0) ? Order::Descending : Order::Ascending)); + keys[i] = Key(names[i], (orders[i] > 0) ? Order::Descending : Order::Ascending); } - auto out = std::make_shared(Options(keys)); + auto out = std::make_shared(std::move(keys)); return out; } diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 9d0a2604682..f4ccb4956a8 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -310,11 +310,11 @@ compile_test_program <- function(code) { openssl_dir <- paste0("-I", openssl_root_dir, "/include") } runner <- paste( - R_CMD_config("CXX17"), + R_CMD_config("CXX20"), openssl_dir, R_CMD_config("CPPFLAGS"), - R_CMD_config("CXX17FLAGS"), - R_CMD_config("CXX17STD"), + R_CMD_config("CXX20FLAGS"), + R_CMD_config("CXX20STD"), "-E", "-xc++" ) @@ -565,8 +565,11 @@ build_libarrow <- function(src_dir, dst_dir) { # is found, it will be used by the libarrow build, and this does # not affect how R compiles the arrow bindings. CC = sub("^.*ccache", "", R_CMD_config("CC")), - CXX = paste(sub("^.*ccache", "", R_CMD_config("CXX17")), R_CMD_config("CXX17STD")), - # CXXFLAGS = R_CMD_config("CXX17FLAGS"), # We don't want the same debug symbols + CXX = paste( + sub("^.*ccache", "", R_CMD_config("CXX20")), + R_CMD_config("CXX20STD") + ), + # CXXFLAGS = R_CMD_config("CXX20FLAGS"), # We don't want the same debug symbols LDFLAGS = R_CMD_config("LDFLAGS"), N_JOBS = ncores ) diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 69780bd64df..d9cdcc3885c 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -23,8 +23,8 @@ but there are a few things to note. ### Compilers -As of version 10.0.0, arrow requires a C++17 compiler to build. -For `gcc`, this generally means version 7 or newer. 
Most contemporary Linux +As of version 22.0.0, arrow requires a C++20 compiler to build. +For `gcc`, this generally means version 10 or newer. Most contemporary Linux distributions have a new enough compiler; however, CentOS 7 is a notable exception, as it ships with gcc 4.8. diff --git a/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb index 6cd19389f46..c18ab9ac467 100644 --- a/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb +++ b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowCUDA - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb index 46d7339fb33..f2333c67c63 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb +++ b/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowDataset - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb b/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb index 25ed5f2bb35..891db90d732 100644 --- a/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb +++ b/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowFlightSQL - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow-flight/lib/arrow-flight/version.rb b/ruby/red-arrow-flight/lib/arrow-flight/version.rb index 6961134c6c8..25063a59d99 100644 --- a/ruby/red-arrow-flight/lib/arrow-flight/version.rb +++ b/ruby/red-arrow-flight/lib/arrow-flight/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowFlight - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow-format/NOTICE.txt b/ruby/red-arrow-format/NOTICE.txt index 1b218efe168..8916e188af1 100644 --- a/ruby/red-arrow-format/NOTICE.txt +++ b/ruby/red-arrow-format/NOTICE.txt @@ -1,2 +1,2 @@ Apache Arrow -Copyright 2025 The Apache Software Foundation +Copyright 2025-2026 The Apache Software Foundation diff --git a/ruby/red-arrow-format/lib/arrow-format/version.rb b/ruby/red-arrow-format/lib/arrow-format/version.rb index 389bd4dc5ea..6fccd13e71b 100644 --- a/ruby/red-arrow-format/lib/arrow-format/version.rb +++ b/ruby/red-arrow-format/lib/arrow-format/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowFormat - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow/ext/arrow/arrow.cpp b/ruby/red-arrow/ext/arrow/arrow.cpp index 404ec8996f2..0c582d07077 100644 --- a/ruby/red-arrow/ext/arrow/arrow.cpp +++ b/ruby/red-arrow/ext/arrow/arrow.cpp @@ -59,7 +59,7 @@ namespace red_arrow { { auto plan = GARROW_EXECUTE_PLAN(object); auto nodes = garrow_execute_plan_get_nodes(plan); - for (auto node = nodes; nodes; nodes = g_list_next(nodes)) { + for (auto node = nodes; node; node = g_list_next(node)) { rbgobj_gc_mark_instance(node->data); } } diff --git a/ruby/red-arrow/lib/arrow/version.rb b/ruby/red-arrow/lib/arrow/version.rb index 4e8bf057f52..9a94c971989 100644 --- a/ruby/red-arrow/lib/arrow/version.rb +++ b/ruby/red-arrow/lib/arrow/version.rb @@ -16,7 +16,7 @@ # under the License. 
module Arrow - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-gandiva/lib/gandiva/version.rb b/ruby/red-gandiva/lib/gandiva/version.rb index 5b409db58fe..f958522a08f 100644 --- a/ruby/red-gandiva/lib/gandiva/version.rb +++ b/ruby/red-gandiva/lib/gandiva/version.rb @@ -16,7 +16,7 @@ # under the License. module Gandiva - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-parquet/lib/parquet/version.rb b/ruby/red-parquet/lib/parquet/version.rb index 6e4c1cd95ab..325c045a8f7 100644 --- a/ruby/red-parquet/lib/parquet/version.rb +++ b/ruby/red-parquet/lib/parquet/version.rb @@ -16,7 +16,7 @@ # under the License. module Parquet - VERSION = "23.0.0-SNAPSHOT" + VERSION = "23.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/testing b/testing index 19dda67f485..df428ddaa22 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 19dda67f485ffb3ffa92f4c6fa083576ef052d58 +Subproject commit df428ddaa22d94dfb525af4c0951f3dafb463795
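All of the Ruby binding bumps above share one convention: a version string is either plain ("23.0.1") or carries a pre-release tag ("23.0.0-SNAPSHOT"), and each version.rb splits it into numbers and TAG. The same convention in a short Python sketch, for reference:

    # Parse Arrow-style version strings as the Ruby bindings do with
    # VERSION.split("-"): numeric components plus an optional tag.
    def parse_version(version: str):
        numbers, _, tag = version.partition("-")
        major, minor, micro = (int(n) for n in numbers.split("."))
        return (major, minor, micro), (tag or None)

    assert parse_version("23.0.0-SNAPSHOT") == ((23, 0, 0), "SNAPSHOT")
    assert parse_version("23.0.1") == ((23, 0, 1), None)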