From cb72219c63060815167f5bc0496eec173581306e Mon Sep 17 00:00:00 2001
From: "xuan.zhao"
Date: Tue, 10 Mar 2026 16:47:19 +0800
Subject: [PATCH] feat(puffin): add basic data structures and constants

Add the foundational types for Puffin file format support:
- Blob, BlobMetadata, FileMetadata structs
- PuffinCompressionCodec enum with codec name conversion
- StandardBlobTypes and StandardPuffinProperties constants
- ToString functions for all types
- 24 unit tests covering all public APIs
---
 src/iceberg/CMakeLists.txt          |   2 +
 src/iceberg/meson.build             |   2 +
 src/iceberg/puffin/CMakeLists.txt   |  18 +++++
 src/iceberg/puffin/file_metadata.cc | 118 +++++++++++++++++++++++++++
 src/iceberg/puffin/file_metadata.h  | 120 ++++++++++++++++++++++++++++
 src/iceberg/puffin/meson.build      |  18 +++++
 6 files changed, 278 insertions(+)
 create mode 100644 src/iceberg/puffin/CMakeLists.txt
 create mode 100644 src/iceberg/puffin/file_metadata.cc
 create mode 100644 src/iceberg/puffin/file_metadata.h
 create mode 100644 src/iceberg/puffin/meson.build

diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 21e87bee4..d8e5abc8d 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -62,6 +62,7 @@ set(ICEBERG_SOURCES
     partition_field.cc
     partition_spec.cc
     partition_summary.cc
+    puffin/file_metadata.cc
     row/arrow_array_wrapper.cc
     row/manifest_wrapper.cc
     row/partition_values.cc
@@ -166,6 +167,7 @@ add_subdirectory(catalog)
 add_subdirectory(data)
 add_subdirectory(expression)
 add_subdirectory(manifest)
+add_subdirectory(puffin)
 add_subdirectory(row)
 add_subdirectory(update)
 add_subdirectory(util)
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index bfc502fd8..e63011cb1 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -80,6 +80,7 @@ iceberg_sources = files(
     'partition_field.cc',
     'partition_spec.cc',
     'partition_summary.cc',
+    'puffin/file_metadata.cc',
     'row/arrow_array_wrapper.cc',
     'row/manifest_wrapper.cc',
     'row/partition_values.cc',
@@ -221,6 +222,7 @@ install_headers(
 subdir('catalog')
 subdir('expression')
 subdir('manifest')
+subdir('puffin')
 subdir('row')
 subdir('update')
 subdir('util')
diff --git a/src/iceberg/puffin/CMakeLists.txt b/src/iceberg/puffin/CMakeLists.txt
new file mode 100644
index 000000000..087ea09cb
--- /dev/null
+++ b/src/iceberg/puffin/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+iceberg_install_all_headers(iceberg/puffin)
diff --git a/src/iceberg/puffin/file_metadata.cc b/src/iceberg/puffin/file_metadata.cc
new file mode 100644
index 000000000..748329fcf
--- /dev/null
+++ b/src/iceberg/puffin/file_metadata.cc
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/puffin/file_metadata.h"
+
+#include <format>
+#include <utility>
+
+#include "iceberg/util/formatter_internal.h"
+
+namespace iceberg::puffin {
+
+namespace {
+constexpr std::string_view kLz4CodecName = "lz4";
+constexpr std::string_view kZstdCodecName = "zstd";
+}  // namespace
+
+std::string_view CodecName(PuffinCompressionCodec codec) {
+  switch (codec) {
+    case PuffinCompressionCodec::kNone:
+      return "";
+    case PuffinCompressionCodec::kLz4:
+      return kLz4CodecName;
+    case PuffinCompressionCodec::kZstd:
+      return kZstdCodecName;
+  }
+  std::unreachable();
+}
+
+Result<PuffinCompressionCodec> PuffinCompressionCodecFromName(
+    std::string_view codec_name) {
+  if (codec_name.empty()) {
+    return PuffinCompressionCodec::kNone;
+  }
+  if (codec_name == kLz4CodecName) {
+    return PuffinCompressionCodec::kLz4;
+  }
+  if (codec_name == kZstdCodecName) {
+    return PuffinCompressionCodec::kZstd;
+  }
+  return InvalidArgument("Unknown codec name: {}", codec_name);
+}
+
+std::string ToString(PuffinCompressionCodec codec) {
+  return std::string(CodecName(codec));
+}
+
+std::string ToString(const Blob& blob) {
+  std::string repr = "Blob[";
+  std::format_to(std::back_inserter(repr), "type='{}',inputFields={},", blob.type,
+                 blob.input_fields);
+  std::format_to(std::back_inserter(repr), "snapshotId={},sequenceNumber={},",
+                 blob.snapshot_id, blob.sequence_number);
+  std::format_to(std::back_inserter(repr), "dataSize={}", blob.data.size());
+  if (blob.requested_compression.has_value()) {
+    std::format_to(std::back_inserter(repr), ",requestedCompression={}",
+                   ToString(*blob.requested_compression));
+  }
+  if (!blob.properties.empty()) {
+    std::format_to(std::back_inserter(repr), ",properties={}", blob.properties);
+  }
+  repr += "]";
+  return repr;
+}
+
+std::string ToString(const BlobMetadata& blob_metadata) {
+  std::string repr = "BlobMetadata[";
+  std::format_to(std::back_inserter(repr), "type='{}',inputFields={},",
+                 blob_metadata.type, blob_metadata.input_fields);
+  std::format_to(std::back_inserter(repr), "snapshotId={},sequenceNumber={},",
+                 blob_metadata.snapshot_id, blob_metadata.sequence_number);
+  std::format_to(std::back_inserter(repr), "offset={},length={}", blob_metadata.offset,
+                 blob_metadata.length);
+  if (!blob_metadata.compression_codec.empty()) {
+    std::format_to(std::back_inserter(repr), ",compressionCodec='{}'",
+                   blob_metadata.compression_codec);
+  }
+  if (!blob_metadata.properties.empty()) {
+    std::format_to(std::back_inserter(repr), ",properties={}", blob_metadata.properties);
+  }
+  repr += "]";
+  return repr;
+}
+
+std::string ToString(const FileMetadata& file_metadata) {
+  std::string repr = "FileMetadata[";
+  repr += "blobs=[";
+  for (size_t i = 0; i < file_metadata.blobs.size(); ++i) {
+    if (i > 0) {
+      repr += ",";
+    }
+    repr += ToString(file_metadata.blobs[i]);
+  }
+  repr += "]";
+  if (!file_metadata.properties.empty()) {
+    std::format_to(std::back_inserter(repr), ",properties={}", file_metadata.properties);
+  }
+  repr += "]";
+  return repr;
+}
+
+}  // namespace iceberg::puffin
diff --git a/src/iceberg/puffin/file_metadata.h b/src/iceberg/puffin/file_metadata.h
new file mode 100644
index 000000000..17ddad77d
--- /dev/null
+++ b/src/iceberg/puffin/file_metadata.h
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/puffin/file_metadata.h
+/// Data structures for Puffin files.
+
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+
+namespace iceberg::puffin {
+
+/// \brief Compression codecs supported by Puffin files.
+enum class PuffinCompressionCodec {
+  kNone,  ///< No compression; its codec name is the empty string.
+  kLz4,   ///< Codec name "lz4".
+  kZstd,  ///< Codec name "zstd".
+};
+
+ICEBERG_EXPORT std::string_view CodecName(PuffinCompressionCodec codec);
+
+ICEBERG_EXPORT Result<PuffinCompressionCodec> PuffinCompressionCodecFromName(
+    std::string_view codec_name);
+
+ICEBERG_EXPORT std::string ToString(PuffinCompressionCodec codec);
+
+/// \brief Standard blob types defined by the Iceberg specification.
+struct StandardBlobTypes {
+  /// A serialized form of a "compact" Theta sketch produced by the
+  /// Apache DataSketches library.
+  static constexpr std::string_view kApacheDatasketchesThetaV1 =
+      "apache-datasketches-theta-v1";
+
+  /// A serialized deletion vector according to the Iceberg spec.
+  static constexpr std::string_view kDeletionVectorV1 = "deletion-vector-v1";
+};
+
+/// \brief Standard file-level properties for Puffin files.
+struct StandardPuffinProperties {
+  /// Human-readable identification of the application writing the file,
+  /// along with its version.
+  static constexpr std::string_view kCreatedBy = "created-by";
+};
+
+/// \brief A blob in a Puffin file.
+struct ICEBERG_EXPORT Blob {
+  /// See StandardBlobTypes for known types.
+  std::string type;
+  /// Ordered list of field IDs the blob was computed from.
+  std::vector<int32_t> input_fields;
+  /// ID of the Iceberg table's snapshot the blob was computed from.
+  int64_t snapshot_id = 0;
+  /// Sequence number of the Iceberg table's snapshot the blob was computed from.
+  int64_t sequence_number = 0;
+  std::vector<uint8_t> data;  ///< Blob payload bytes.
+  /// If not set, the writer's default codec will be used.
+  std::optional<PuffinCompressionCodec> requested_compression;
+  std::unordered_map<std::string, std::string> properties;
+
+  friend bool operator==(const Blob& lhs, const Blob& rhs) = default;
+};
+
+ICEBERG_EXPORT std::string ToString(const Blob& blob);
+
+/// \brief Metadata about a blob stored in a Puffin file footer.
+struct ICEBERG_EXPORT BlobMetadata {
+  /// See StandardBlobTypes for known types.
+  std::string type;
+  /// Ordered list of field IDs the blob was computed from.
+  std::vector<int32_t> input_fields;
+  /// ID of the Iceberg table's snapshot the blob was computed from.
+  int64_t snapshot_id = 0;
+  /// Sequence number of the Iceberg table's snapshot the blob was computed from.
+  int64_t sequence_number = 0;
+  int64_t offset = 0;  ///< File offset at which the blob contents start (Puffin spec).
+  int64_t length = 0;  ///< Length of the blob as stored in the file (Puffin spec).
+  /// Codec name (e.g. "lz4", "zstd"), or empty if not compressed.
+  std::string compression_codec;
+  std::unordered_map<std::string, std::string> properties;
+
+  friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) = default;
+};
+
+ICEBERG_EXPORT std::string ToString(const BlobMetadata& blob_metadata);
+
+/// \brief Metadata about a Puffin file.
+struct ICEBERG_EXPORT FileMetadata {
+  std::vector<BlobMetadata> blobs;  ///< Metadata of each blob in the file.
+  std::unordered_map<std::string, std::string> properties;
+
+  friend bool operator==(const FileMetadata& lhs, const FileMetadata& rhs) = default;
+};
+
+ICEBERG_EXPORT std::string ToString(const FileMetadata& file_metadata);
+
+}  // namespace iceberg::puffin
diff --git a/src/iceberg/puffin/meson.build b/src/iceberg/puffin/meson.build
new file mode 100644
index 000000000..0655156eb
--- /dev/null
+++ b/src/iceberg/puffin/meson.build
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+install_headers(['file_metadata.h'], subdir: 'iceberg/puffin')