From 401993cd6aeeb1dc0d9b3559fc82f7ffcba51d5e Mon Sep 17 00:00:00 2001 From: zhengyu Date: Wed, 5 Nov 2025 22:11:41 +0800 Subject: [PATCH 01/20] [enhancement](filecache) use rocksdb to persist cache block meta (#57072) --- be/src/common/config.cpp | 2 + be/src/io/cache/block_file_cache.cpp | 34 +- be/src/io/cache/block_file_cache.h | 2 + be/src/io/cache/cache_block_meta_store.cpp | 582 ++++++++++++ be/src/io/cache/cache_block_meta_store.h | 142 +++ be/src/io/cache/cached_remote_file_reader.cpp | 3 + be/src/io/cache/file_block.cpp | 42 +- be/src/io/cache/file_block.h | 9 +- be/src/io/cache/file_cache_common.cpp | 121 ++- be/src/io/cache/file_cache_common.h | 11 +- be/src/io/cache/file_cache_storage.h | 9 +- be/src/io/cache/fs_file_cache_storage.cpp | 741 +++++++++------ be/src/io/cache/fs_file_cache_storage.h | 40 +- be/src/io/cache/mem_file_cache_storage.cpp | 9 +- be/src/io/cache/mem_file_cache_storage.h | 8 +- be/src/io/fs/hdfs_file_writer.cpp | 3 +- be/src/io/fs/local_file_system.cpp | 1 + be/src/io/fs/s3_file_writer.cpp | 10 +- .../olap/rowset/segment_v2/segment_writer.cpp | 5 +- be/test/io/cache/block_file_cache_test.cpp | 227 ++++- .../block_file_cache_test_meta_store.cpp | 421 +++++++++ .../io/cache/cache_block_meta_store_test.cpp | 860 ++++++++++++++++++ gensrc/proto/file_cache.proto | 14 + 23 files changed, 2879 insertions(+), 417 deletions(-) create mode 100644 be/src/io/cache/cache_block_meta_store.cpp create mode 100644 be/src/io/cache/cache_block_meta_store.h create mode 100644 be/test/io/cache/block_file_cache_test_meta_store.cpp create mode 100644 be/test/io/cache/cache_block_meta_store_test.cpp diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index ff9c11a970f6b9..5862e68a738471 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1141,6 +1141,8 @@ DEFINE_mInt64(cache_lock_held_long_tail_threshold_us, "30000000"); DEFINE_mBool(enable_file_cache_keep_base_compaction_output, "false"); 
DEFINE_mBool(enable_file_cache_adaptive_write, "true"); DEFINE_mDouble(file_cache_keep_base_compaction_output_min_hit_ratio, "0.7"); +// if difference below this threshold, we consider cache's progressive upgrading (2.0->3.0) successful +DEFINE_mDouble(file_cache_meta_store_vs_file_system_diff_num_threshold, "0.3"); // if difference below this threshold, we consider cache's progressive upgrading (2.0->3.0) successful DEFINE_mDouble(file_cache_meta_store_vs_file_system_diff_num_threshold, "0.3"); diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index eb26660713a49e..c7bbb9067f8752 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -324,6 +324,8 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, _cache_base_path.c_str(), "file_cache_disk_limit_mode", 0); _need_evict_cache_in_advance_metrics = std::make_shared>( _cache_base_path.c_str(), "file_cache_need_evict_cache_in_advance", 0); + _meta_store_write_queue_size_metrics = std::make_shared>( + _cache_base_path.c_str(), "file_cache_meta_store_write_queue_size", 0); _cache_lock_wait_time_us = std::make_shared( _cache_base_path.c_str(), "file_cache_cache_lock_wait_time_us"); @@ -544,6 +546,7 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte key.hash = hash; key.meta.type = context.cache_type; key.meta.expiration_time = context.expiration_time; + key.meta.tablet_id = context.tablet_id; _storage->load_blocks_directly_unlocked(this, key, cache_lock); it = _files.find(hash); @@ -573,7 +576,7 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte FileCacheType origin_type = cell.file_block->cache_type(); if (origin_type == FileCacheType::TTL) continue; - st = cell.file_block->change_cache_type_between_ttl_and_others(FileCacheType::TTL); + st = cell.file_block->change_cache_type_lock(FileCacheType::TTL, cache_lock); if (st.ok()) { auto& queue = 
get_queue(origin_type); queue.remove(cell.queue_iterator.value(), cache_lock); @@ -617,8 +620,8 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte for (auto& [_, cell] : file_blocks) { auto cache_type = cell.file_block->cache_type(); if (cache_type != FileCacheType::TTL) continue; - auto st = cell.file_block->change_cache_type_between_ttl_and_others( - FileCacheType::NORMAL); + auto st = + cell.file_block->change_cache_type_lock(FileCacheType::NORMAL, cache_lock); if (st.ok()) { if (cell.queue_iterator) { auto& ttl_queue = get_queue(FileCacheType::TTL); @@ -783,6 +786,7 @@ FileBlocks BlockFileCache::split_range_into_cells(const UInt128Wrapper& hash, key.offset = current_pos; key.meta.type = context.cache_type; key.meta.expiration_time = context.expiration_time; + key.meta.tablet_id = context.tablet_id; auto file_block = std::make_shared(key, current_size, this, FileBlock::State::SKIP_CACHE); file_blocks.push_back(std::move(file_block)); @@ -945,12 +949,13 @@ FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& hash, const CacheC key.offset = offset; key.meta.type = context.cache_type; key.meta.expiration_time = context.expiration_time; + key.meta.tablet_id = context.tablet_id; FileBlockCell cell(std::make_shared(key, size, this, state), cache_lock); Status st; if (context.expiration_time == 0 && context.cache_type == FileCacheType::TTL) { - st = cell.file_block->change_cache_type_between_ttl_and_others(FileCacheType::NORMAL); + st = cell.file_block->change_cache_type_lock(FileCacheType::NORMAL, cache_lock); } else if (context.cache_type != FileCacheType::TTL && context.expiration_time != 0) { - st = cell.file_block->change_cache_type_between_ttl_and_others(FileCacheType::TTL); + st = cell.file_block->change_cache_type_lock(FileCacheType::TTL, cache_lock); } if (!st.ok()) { LOG(WARNING) << "Cannot change cache type. 
expiration_time=" << context.expiration_time @@ -1241,8 +1246,7 @@ bool BlockFileCache::remove_if_ttl_file_blocks(const UInt128Wrapper& file_key, b } if (cell.file_block->cache_type() == FileCacheType::NORMAL) continue; - st = cell.file_block->change_cache_type_between_ttl_and_others( - FileCacheType::NORMAL); + st = cell.file_block->change_cache_type_lock(FileCacheType::NORMAL, cache_lock); if (st.ok()) { if (cell.queue_iterator) { ttl_queue.remove(cell.queue_iterator.value(), cache_lock); @@ -1541,6 +1545,7 @@ void BlockFileCache::remove(FileBlockSPtr file_block, T& cache_lock, U& block_lo auto offset = file_block->offset(); auto type = file_block->cache_type(); auto expiration_time = file_block->expiration_time(); + auto tablet_id = file_block->tablet_id(); auto* cell = get_cell(hash, offset, cache_lock); file_block->cell = nullptr; DCHECK(cell); @@ -1561,6 +1566,7 @@ void BlockFileCache::remove(FileBlockSPtr file_block, T& cache_lock, U& block_lo key.offset = offset; key.meta.type = type; key.meta.expiration_time = expiration_time; + key.meta.tablet_id = tablet_id; if (sync) { int64_t duration_ns = 0; Status st; @@ -2046,6 +2052,18 @@ void BlockFileCache::run_background_monitor() { _cur_disposable_queue_element_count_metrics->set_value( _disposable_queue.get_elements_num(cache_lock)); + // Update meta store write queue size if storage is FSFileCacheStorage + if (_storage->get_type() == FileCacheStorageType::DISK) { + auto* fs_storage = dynamic_cast(_storage.get()); + if (fs_storage != nullptr) { + auto* meta_store = fs_storage->get_meta_store(); + if (meta_store != nullptr) { + _meta_store_write_queue_size_metrics->set_value( + meta_store->get_write_queue_size()); + } + } + } + if (_num_read_blocks->get_value() > 0) { _hit_ratio->set_value((double)_num_hit_blocks->get_value() / (double)_num_read_blocks->get_value()); @@ -2256,7 +2274,7 @@ void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, FileCacheType origin_type = 
cell.file_block->cache_type(); if (origin_type == FileCacheType::TTL) continue; - st = cell.file_block->change_cache_type_between_ttl_and_others(FileCacheType::TTL); + st = cell.file_block->change_cache_type_lock(FileCacheType::TTL, cache_lock); if (st.ok()) { auto& queue = get_queue(origin_type); queue.remove(cell.queue_iterator.value(), cache_lock); diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index 7527186c749480..ef1df34b26beaa 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -145,6 +145,7 @@ struct FileBlockCell { size_t size() const { return file_block->_block_range.size(); } + FileBlockCell() = default; FileBlockCell(FileBlockSPtr file_block, std::lock_guard& cache_lock); FileBlockCell(FileBlockCell&& other) noexcept : file_block(std::move(other.file_block)), @@ -595,6 +596,7 @@ class BlockFileCache { std::shared_ptr> _no_warmup_hit_ratio_1h; std::shared_ptr> _disk_limit_mode_metrics; std::shared_ptr> _need_evict_cache_in_advance_metrics; + std::shared_ptr> _meta_store_write_queue_size_metrics; std::shared_ptr _cache_lock_wait_time_us; std::shared_ptr _get_or_set_latency_us; diff --git a/be/src/io/cache/cache_block_meta_store.cpp b/be/src/io/cache/cache_block_meta_store.cpp new file mode 100644 index 00000000000000..433bee7a275152 --- /dev/null +++ b/be/src/io/cache/cache_block_meta_store.cpp @@ -0,0 +1,582 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "io/cache/cache_block_meta_store.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "olap/field.h" +#include "olap/field.h" // For OLAP_FIELD_TYPE_BIGINT +#include "olap/key_coder.h" +#include "olap/olap_common.h" +#include "util/threadpool.h" +#include "vec/common/hex.h" + +namespace doris::io { + +const std::string FILE_CACHE_META_COLUMN_FAMILY = "file_cache_meta"; + +// bvar metrics for rocksdb operation failures +bvar::Adder g_rocksdb_write_failed_num("file_cache_meta_rocksdb_write_failed_num"); +bvar::Adder g_rocksdb_delete_failed_num("file_cache_meta_rocksdb_delete_failed_num"); + +CacheBlockMetaStore::CacheBlockMetaStore(const std::string& db_path, size_t queue_size) + : _db_path(db_path), _write_queue(queue_size) { + auto status = init(); + if (!status.ok()) { + LOG(ERROR) << "Failed to initialize CacheBlockMetaStore: " << status.to_string(); + } +} + +CacheBlockMetaStore::~CacheBlockMetaStore() { + _stop_worker.store(true, std::memory_order_release); + if (_write_thread.joinable()) { + _write_thread.join(); + } + + if (_db) { + if (_file_cache_meta_cf_handle) { + _db->DestroyColumnFamilyHandle(_file_cache_meta_cf_handle.release()); + } + _db->Close(); + } +} + +size_t CacheBlockMetaStore::get_write_queue_size() const { + return _write_queue.size_approx(); +} + +Status CacheBlockMetaStore::init() { + std::filesystem::create_directories(_db_path); + + _options.create_if_missing = true; + 
_options.create_missing_column_families = true; + _options.error_if_exists = false; + _options.compression = rocksdb::kNoCompression; + _options.max_open_files = 1000; + _options.write_buffer_size = 64 * 1024 * 1024; // 64MB + _options.target_file_size_base = 64 * 1024 * 1024; + + rocksdb::BlockBasedTableOptions table_options; + table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false)); + table_options.block_size = 16 * 1024; + _options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options)); + + // Create column family descriptors + std::vector column_families; + // Default column family is required + column_families.emplace_back(rocksdb::kDefaultColumnFamilyName, rocksdb::ColumnFamilyOptions()); + // File cache meta column family + column_families.emplace_back(FILE_CACHE_META_COLUMN_FAMILY, rocksdb::ColumnFamilyOptions()); + + std::vector handles; + rocksdb::DB* db_ptr = nullptr; + rocksdb::Status status = + rocksdb::DB::Open(_options, _db_path, column_families, &handles, &db_ptr); + + if (!status.ok()) { + LOG(WARNING) << "Failed to open rocksdb: " << status.ToString() + << "Database path: " << _db_path; + return Status::InternalError("Failed to open rocksdb: {}", status.ToString()); + } + _db.reset(db_ptr); + + // Store the file_cache_meta column family handle + // handles[0] is default column family, handles[1] is file_cache_meta + if (handles.size() >= 2) { + _file_cache_meta_cf_handle.reset(handles[1]); + // Close default column family handle as we won't use it + _db->DestroyColumnFamilyHandle(handles[0]); + } else { + return Status::InternalError("Failed to get file_cache_meta column family handle"); + } + + _write_thread = std::thread(&CacheBlockMetaStore::async_write_worker, this); + + return Status::OK(); +} + +void CacheBlockMetaStore::put(const BlockMetaKey& key, const BlockMeta& meta) { + std::string key_str = serialize_key(key); + std::string value_str = serialize_value(meta); + + // Put write task into queue for 
asynchronous processing + WriteOperation op; + op.type = OperationType::PUT; + op.key = key_str; + op.value = value_str; + _write_queue.enqueue(op); +} + +std::optional CacheBlockMetaStore::get(const BlockMetaKey& key) { + // we trade accurate for clean code. so we ignore pending operations in the write queue + // only use data in rocksdb + std::string key_str = serialize_key(key); + std::string value_str; + rocksdb::Status status; + + if (!_db) { + LOG(WARNING) << "Database not initialized, cannot get key"; + return std::nullopt; + } + status = + _db->Get(rocksdb::ReadOptions(), _file_cache_meta_cf_handle.get(), key_str, &value_str); + + if (status.ok()) { + Status deserialize_status; + auto result = deserialize_value(value_str, &deserialize_status); + if (result.has_value()) { + return result; + } else { + LOG(WARNING) << "Failed to deserialize value: " << deserialize_status.to_string(); + return std::nullopt; + } + } else if (status.IsNotFound()) { + return std::nullopt; + } else { + LOG(WARNING) << "Failed to get key from rocksdb: " << status.ToString(); + return std::nullopt; + } +} + +std::unique_ptr CacheBlockMetaStore::range_get(int64_t tablet_id) { + // Generate prefix using new serialization format + std::string prefix; + prefix.push_back(0x1); // version byte + auto* tablet_id_coder = get_key_coder(FieldType::OLAP_FIELD_TYPE_BIGINT); + tablet_id_coder->full_encode_ascending(&tablet_id, &prefix); + + class RocksDBIterator : public BlockMetaIterator { + public: + RocksDBIterator(rocksdb::Iterator* iter, const std::string& prefix) + : _iter(iter), + _prefix(prefix), + _last_key_error(Status::OK()), + _last_value_error(Status::OK()) { + _iter->Seek(_prefix); + } + + ~RocksDBIterator() override { delete _iter; } + + bool valid() const override { + if (!_iter->Valid()) return false; + Slice key_slice(_iter->key().data(), _prefix.size()); + return key_slice.compare(Slice(_prefix)) == 0; + } + + void next() override { _iter->Next(); } + + BlockMetaKey key() 
const override { + // Reset error state + _last_key_error = Status::OK(); + + auto key_view = std::string_view(_iter->key().data(), _iter->key().size()); + Status status; + auto result = deserialize_key(std::string(key_view), &status); + + if (!result.has_value()) { + _last_key_error = status; + LOG(WARNING) << "Failed to deserialize key in range_get: " << status.to_string(); + // error indicator, caller should check get_last_key_error + return BlockMetaKey(-1, UInt128Wrapper(0), 0); + } + + return result.value(); + } + + BlockMeta value() const override { + // Reset error state + _last_value_error = Status::OK(); + + auto value_view = std::string_view(_iter->value().data(), _iter->value().size()); + Status status; + auto result = deserialize_value(value_view, &status); + + if (!result.has_value()) { + _last_value_error = status; + LOG(WARNING) << "Failed to deserialize value in range_get: " << status.to_string(); + // error indicator, caller should check get_last_value_error + return BlockMeta(FileCacheType::DISPOSABLE, 0, 0); + } + + VLOG_DEBUG << "RocksDB value: " << value_view + << ", deserialized as: type=" << result->type << ", size=" << result->size + << ", ttl=" << result->ttl; + return result.value(); + } + + Status get_last_key_error() const override { return _last_key_error; } + Status get_last_value_error() const override { return _last_value_error; } + + private: + rocksdb::Iterator* _iter; + std::string _prefix; + mutable Status _last_key_error; + mutable Status _last_value_error; + }; + + if (!_db) { + LOG(WARNING) << "Database not initialized, cannot create iterator"; + return nullptr; + } + rocksdb::Iterator* iter = + _db->NewIterator(rocksdb::ReadOptions(), _file_cache_meta_cf_handle.get()); + return std::unique_ptr(new RocksDBIterator(iter, prefix)); +} + +std::unique_ptr CacheBlockMetaStore::get_all() { + if (!_db) { + LOG(WARNING) << "Database not initialized in get_all()"; + return nullptr; + } + + class RocksDBIterator : public 
BlockMetaIterator { + public: + RocksDBIterator(rocksdb::Iterator* iter) + : _iter(iter), _last_key_error(Status::OK()), _last_value_error(Status::OK()) { + _iter->SeekToFirst(); + } + + ~RocksDBIterator() override { delete _iter; } + + bool valid() const override { return _iter->Valid(); } + + void next() override { _iter->Next(); } + + BlockMetaKey key() const override { + // Reset error state + _last_key_error = Status::OK(); + + auto key_view = std::string_view(_iter->key().data(), _iter->key().size()); + Status status; + auto result = deserialize_key(std::string(key_view), &status); + + if (!result.has_value()) { + _last_key_error = status; + LOG(WARNING) << "Failed to deserialize key in get_all: " << status.to_string(); + // 返回一个无效的键作为错误指示,调用方应该检查错误状态 + return BlockMetaKey(-1, UInt128Wrapper(0), 0); // 使用无效值作为错误指示 + } + + return result.value(); + } + + BlockMeta value() const override { + // Reset error state + _last_value_error = Status::OK(); + + auto value_view = std::string_view(_iter->value().data(), _iter->value().size()); + Status status; + auto result = deserialize_value(value_view, &status); + + if (!result.has_value()) { + _last_value_error = status; + LOG(WARNING) << "Failed to deserialize value in get_all: " << status.to_string(); + // error indicator, caller should check get_last_value_error + return BlockMeta(FileCacheType::DISPOSABLE, 0, 0); + } + + VLOG_DEBUG << "RocksDB value: " << value_view + << ", deserialized as: type=" << result->type << ", size=" << result->size + << ", ttl=" << result->ttl; + return result.value(); + } + + Status get_last_key_error() const override { return _last_key_error; } + Status get_last_value_error() const override { return _last_value_error; } + + private: + rocksdb::Iterator* _iter; + mutable Status _last_key_error; + mutable Status _last_value_error; + }; + + rocksdb::Iterator* iter = + _db->NewIterator(rocksdb::ReadOptions(), _file_cache_meta_cf_handle.get()); + if (!iter) { + LOG(WARNING) << "Failed to 
create rocksdb iterator in get_all()"; + return nullptr; + } + return std::unique_ptr(new RocksDBIterator(iter)); +} + +void CacheBlockMetaStore::delete_key(const BlockMetaKey& key) { + std::string key_str = serialize_key(key); + + // Put delete task into queue for asynchronous processing + WriteOperation op; + op.type = OperationType::DELETE; + op.key = key_str; + _write_queue.enqueue(op); +} + +void CacheBlockMetaStore::clear() { + // First, stop the async worker thread + _stop_worker.store(true, std::memory_order_release); + if (_write_thread.joinable()) { + _write_thread.join(); + } + + // Clear the write queue to remove any pending operations + WriteOperation op; + while (_write_queue.try_dequeue(op)) { + // Just discard all pending operations + } + + // Delete all records from rocksdb + + if (_db) { + // Use DeleteRange to delete all keys + rocksdb::Slice start = ""; + rocksdb::Slice end = "\xff\xff\xff\xff"; // Maximum byte sequence + rocksdb::Status status = _db->DeleteRange(rocksdb::WriteOptions(), + _file_cache_meta_cf_handle.get(), start, end); + if (!status.ok()) { + LOG(WARNING) << "Failed to delete range from rocksdb: " << status.ToString(); + } + } + + // Restart the async worker thread + _stop_worker.store(false, std::memory_order_release); + _write_thread = std::thread(&CacheBlockMetaStore::async_write_worker, this); +} + +void CacheBlockMetaStore::async_write_worker() { + Thread::set_self_name("cache_block_meta_store_async_write_worker"); + while (!_stop_worker.load(std::memory_order_acquire)) { + WriteOperation op; + + if (_write_queue.try_dequeue(op)) { + rocksdb::Status status; + + if (!_db) { + LOG(WARNING) << "Database not initialized, skipping operation"; + continue; + } + + if (op.type == OperationType::PUT) { + status = _db->Put(rocksdb::WriteOptions(), _file_cache_meta_cf_handle.get(), op.key, + op.value); + } else if (op.type == OperationType::DELETE) { + status = _db->Delete(rocksdb::WriteOptions(), _file_cache_meta_cf_handle.get(), + 
op.key); + } + + if (!status.ok()) { + LOG(WARNING) << "Failed to " << (op.type == OperationType::PUT ? "write" : "delete") + << " to rocksdb: " << status.ToString(); + if (op.type == OperationType::PUT) { + g_rocksdb_write_failed_num << 1; + } else { + g_rocksdb_delete_failed_num << 1; + } + } + } else { + // Queue is empty, sleep briefly + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + } + + // Process remaining tasks in the queue + WriteOperation op; + while (_write_queue.try_dequeue(op)) { + rocksdb::Status status; + + if (!_db) { + LOG(WARNING) << "Database not initialized, skipping operation"; + continue; + } + + if (op.type == OperationType::PUT) { + status = _db->Put(rocksdb::WriteOptions(), _file_cache_meta_cf_handle.get(), op.key, + op.value); + } else if (op.type == OperationType::DELETE) { + status = _db->Delete(rocksdb::WriteOptions(), _file_cache_meta_cf_handle.get(), op.key); + } + + if (!status.ok()) { + LOG(WARNING) << "Failed to " << (op.type == OperationType::PUT ? 
"write" : "delete") + << " to rocksdb: " << status.ToString(); + } + } +} + +std::string serialize_key(const BlockMetaKey& key) { + std::string result; + // Add version byte + result.push_back(0x1); + + // Encode tablet_id using KeyCoderTraits + auto* tablet_id_coder = get_key_coder(FieldType::OLAP_FIELD_TYPE_BIGINT); + tablet_id_coder->full_encode_ascending(&key.tablet_id, &result); + + // Encode hash high and low parts + uint64_t hash_high = key.hash.high(); + uint64_t hash_low = key.hash.low(); + tablet_id_coder->full_encode_ascending(&hash_high, &result); + tablet_id_coder->full_encode_ascending(&hash_low, &result); + + // Encode offset + tablet_id_coder->full_encode_ascending(&key.offset, &result); + + return result; +} + +std::string serialize_value(const BlockMeta& meta) { + doris::io::cache::BlockMetaPb pb; + pb.set_type(static_cast<::doris::io::cache::FileCacheType>(meta.type)); + pb.set_size(meta.size); + pb.set_ttl(meta.ttl); + + std::string result; + pb.SerializeToString(&result); + return result; +} + +std::optional deserialize_key(const std::string& key_str, Status* status) { + // New key format: [version][encoded tablet_id][encoded hash_high][encoded hash_low][encoded offset] + Slice slice(key_str); + + // Check version byte + if (slice.size < 1 || slice.data[0] != 0x1) { + LOG(WARNING) << "Invalid key, expected prefix 0x1"; + if (status) *status = Status::InternalError("Failed to decode key: invalid version"); + return std::nullopt; // Invalid version + } + slice.remove_prefix(1); // skip version byte + + auto* tablet_id_coder = get_key_coder(FieldType::OLAP_FIELD_TYPE_BIGINT); + int64_t tablet_id; + uint64_t hash_high, hash_low; + size_t offset; + + Status st = tablet_id_coder->decode_ascending(&slice, sizeof(int64_t), + reinterpret_cast(&tablet_id)); + if (!st.ok()) { + if (status) + *status = Status::InternalError("Failed to decode tablet_id: {}", st.to_string()); + return std::nullopt; + } + + st = tablet_id_coder->decode_ascending(&slice, 
sizeof(uint64_t), + reinterpret_cast(&hash_high)); + if (!st.ok()) { + if (status) + *status = Status::InternalError("Failed to decode hash_high: {}", st.to_string()); + return std::nullopt; + } + + st = tablet_id_coder->decode_ascending(&slice, sizeof(uint64_t), + reinterpret_cast(&hash_low)); + if (!st.ok()) { + if (status) + *status = Status::InternalError("Failed to decode hash_low: {}", st.to_string()); + return std::nullopt; + } + + st = tablet_id_coder->decode_ascending(&slice, sizeof(size_t), + reinterpret_cast(&offset)); + if (!st.ok()) { + if (status) *status = Status::InternalError("Failed to decode offset: {}", st.to_string()); + return std::nullopt; + } + + uint128_t hash = (static_cast(hash_high) << 64) | hash_low; + if (status) *status = Status::OK(); + return BlockMetaKey(tablet_id, UInt128Wrapper(hash), offset); +} + +std::optional deserialize_value(const std::string& value_str, Status* status) { + if (value_str.empty()) { + if (status) *status = Status::InternalError("Failed to deserialize value"); + return std::nullopt; + } + + // Parse as protobuf format + doris::io::cache::BlockMetaPb pb; + if (pb.ParseFromString(value_str)) { + // Validate the parsed protobuf data + int type = pb.type(); + if (type < 0 || type > 3) { // Valid FileCacheType values: 0-3 + LOG(WARNING) << "Invalid FileCacheType value: " << type; + if (status) + *status = Status::InternalError("Failed to deserialize value: invalid type"); + return std::nullopt; + } + if (pb.size() <= 0) { + LOG(WARNING) << "Invalid size value: " << pb.size(); + if (status) + *status = Status::InternalError("Failed to deserialize value: invalid size"); + return std::nullopt; + } + + if (status) *status = Status::OK(); + return BlockMeta(static_cast(pb.type()), pb.size(), pb.ttl()); + } + + LOG(WARNING) << "Failed to deserialize value as protobuf: " << value_str; + if (status) *status = Status::InternalError("Failed to deserialize value"); + return std::nullopt; +} + +std::optional 
deserialize_value(std::string_view value_view, Status* status) { + if (value_view.empty()) { + if (status) *status = Status::InternalError("Failed to deserialize value"); + return std::nullopt; + } + + // Parse as protobuf format using string_view + doris::io::cache::BlockMetaPb pb; + if (pb.ParseFromArray(value_view.data(), static_cast(value_view.size()))) { + // Validate the parsed protobuf data + int type = pb.type(); + if (type < 0 || type > 3) { // Valid FileCacheType values: 0-3 + LOG(WARNING) << "Invalid FileCacheType value: " << type; + if (status) + *status = Status::InternalError("Failed to deserialize value: invalid type"); + return std::nullopt; + } + if (pb.size() <= 0) { + LOG(WARNING) << "Invalid size value: " << pb.size(); + if (status) + *status = Status::InternalError("Failed to deserialize value: invalid size"); + return std::nullopt; + } + + if (status) *status = Status::OK(); + return BlockMeta(static_cast(pb.type()), pb.size(), pb.ttl()); + } + + LOG(WARNING) << "Failed to deserialize value as protobuf from string_view"; + if (status) *status = Status::InternalError("Failed to deserialize value"); + return std::nullopt; +} + +} // namespace doris::io \ No newline at end of file diff --git a/be/src/io/cache/cache_block_meta_store.h b/be/src/io/cache/cache_block_meta_store.h new file mode 100644 index 00000000000000..9412e58f4fee7f --- /dev/null +++ b/be/src/io/cache/cache_block_meta_store.h @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gen_cpp/file_cache.pb.h" +#include "io/cache/file_cache_common.h" +#include "util/threadpool.h" + +namespace doris::io { + +struct BlockMeta { + FileCacheType type; + size_t size; + uint64_t ttl; + + BlockMeta() : type(DISPOSABLE), size(0), ttl(0) {} + BlockMeta(FileCacheType type_, size_t size_) : type(type_), size(size_), ttl(0) {} + BlockMeta(FileCacheType type_, size_t size_, uint64_t ttl_) + : type(type_), size(size_), ttl(ttl_) {} + + bool operator==(const BlockMeta& other) const { + return type == other.type && size == other.size && ttl == other.ttl; + } +}; + +struct BlockMetaKey { + int64_t tablet_id; + UInt128Wrapper hash; + size_t offset; + + BlockMetaKey() : tablet_id(0), hash(UInt128Wrapper(0)), offset(0) {} + BlockMetaKey(int64_t tablet_id_, UInt128Wrapper hash_, size_t offset_) + : tablet_id(tablet_id_), hash(hash_), offset(offset_) {} + + bool operator==(const BlockMetaKey& other) const { + return tablet_id == other.tablet_id && hash == other.hash && offset == other.offset; + } + + std::string to_string() const { + return std::to_string(tablet_id) + "_" + hash.to_string() + "_" + std::to_string(offset); + } +}; + +class BlockMetaIterator { +public: + virtual ~BlockMetaIterator() = default; + virtual bool valid() const = 0; + virtual void next() = 0; + virtual BlockMetaKey key() const = 0; + virtual BlockMeta value() const = 0; + + // Error status query methods + virtual Status 
get_last_key_error() const { return Status::OK(); } + virtual Status get_last_value_error() const { return Status::OK(); } +}; + +class CacheBlockMetaStore { +public: + CacheBlockMetaStore(const std::string& db_path, size_t queue_size = 1000); + ~CacheBlockMetaStore(); + + Status init(); + + // Asynchronously write BlockMeta to rocksdb + void put(const BlockMetaKey& key, const BlockMeta& meta); + + // Synchronously get BlockMeta + std::optional get(const BlockMetaKey& key); + + // Range query all BlockMeta for specified tablet_id + std::unique_ptr range_get(int64_t tablet_id); + + // Get iterator for all BlockMeta records + std::unique_ptr get_all(); + + // Asynchronously delete specified BlockMeta + void delete_key(const BlockMetaKey& key); + + // Clear all records from rocksdb and the async queue + void clear(); + + // Get the approximate size of the write queue + size_t get_write_queue_size() const; + +private: + void async_write_worker(); + + std::string _db_path; + std::unique_ptr _db; + rocksdb::Options _options; + std::unique_ptr _file_cache_meta_cf_handle; + + enum class OperationType { PUT, DELETE }; + struct WriteOperation { + OperationType type; + std::string key; + std::string value; // Only used for PUT operations + }; + moodycamel::ConcurrentQueue _write_queue; + std::atomic _stop_worker {false}; + std::thread _write_thread; + std::mutex _queue_mutex; + + std::unique_ptr _thread_pool; +}; + +std::string serialize_key(const BlockMetaKey& key); +std::string serialize_value(const BlockMeta& meta); +std::optional deserialize_key(const std::string& key_str, Status* status = nullptr); +std::optional deserialize_value(const std::string& value_str, Status* status = nullptr); +std::optional deserialize_value(std::string_view value_view, Status* status = nullptr); + +} // namespace doris::io \ No newline at end of file diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index 972ee3c35921ce..dc8fd795dde382 
100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -42,6 +42,7 @@ #include "io/cache/block_file_cache_factory.h" #include "io/cache/block_file_cache_profile.h" #include "io/cache/file_block.h" +#include "io/cache/file_cache_common.h" #include "io/cache/peer_file_cache_reader.h" #include "io/fs/file_reader.h" #include "io/fs/local_file_system.h" @@ -380,6 +381,8 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* s_align_size(offset + already_read, bytes_req - already_read, size()); CacheContext cache_context(io_ctx); cache_context.stats = &stats; + auto tablet_id = get_tablet_id(path().string()); + cache_context.tablet_id = tablet_id.value_or(0); MonotonicStopWatch sw; sw.start(); FileBlocksHolder holder = diff --git a/be/src/io/cache/file_block.cpp b/be/src/io/cache/file_block.cpp index 731c1d311e6680..fd5c3a478f1dae 100644 --- a/be/src/io/cache/file_block.cpp +++ b/be/src/io/cache/file_block.cpp @@ -114,7 +114,7 @@ void FileBlock::reset_downloader_impl(std::lock_guard& block_lock) { Status FileBlock::set_downloaded(std::lock_guard& /* block_lock */) { DCHECK(_download_state != State::DOWNLOADED); DCHECK_NE(_downloaded_size, 0); - Status status = _mgr->_storage->finalize(_key); + Status status = _mgr->_storage->finalize(_key, this->_block_range.size()); if (status.ok()) [[likely]] { _download_state = State::DOWNLOADED; } else { @@ -165,41 +165,22 @@ Status FileBlock::read(Slice buffer, size_t read_offset) { return _mgr->_storage->read(_key, read_offset, buffer); } -Status FileBlock::change_cache_type_between_ttl_and_others(FileCacheType new_type) { - std::lock_guard block_lock(_mutex); - DCHECK(new_type != _key.meta.type); - bool expr = (new_type == FileCacheType::TTL || _key.meta.type == FileCacheType::TTL); - if (!expr) { - LOG(WARNING) << "none of the cache type is TTL" - << ", hash: " << _key.hash.to_string() << ", offset: " << _key.offset - << ", new type: " << 
cache_type_to_string(new_type) - << ", old type: " << cache_type_to_string(_key.meta.type); - } - DCHECK(expr); - - // change cache type between TTL to others don't need to rename the filename suffix - _key.meta.type = new_type; - return Status::OK(); +Status FileBlock::change_cache_type(FileCacheType new_type) { + SCOPED_CACHE_LOCK(_mgr->_mutex, _mgr); + return change_cache_type_lock(new_type, cache_lock); } -Status FileBlock::change_cache_type_between_normal_and_index(FileCacheType new_type) { - SCOPED_CACHE_LOCK(_mgr->_mutex, _mgr); +Status FileBlock::change_cache_type_lock(FileCacheType new_type, + std::lock_guard& cache_lock) { std::lock_guard block_lock(_mutex); - bool expr = (new_type != FileCacheType::TTL && _key.meta.type != FileCacheType::TTL); - if (!expr) { - LOG(WARNING) << "one of the cache type is TTL" - << ", hash: " << _key.hash.to_string() << ", offset: " << _key.offset - << ", new type: " << cache_type_to_string(new_type) - << ", old type: " << cache_type_to_string(_key.meta.type); - } - DCHECK(expr); - if (_key.meta.type == FileCacheType::TTL || new_type == _key.meta.type) { + + if (new_type == _key.meta.type) { return Status::OK(); } if (_download_state == State::DOWNLOADED) { Status st; TEST_SYNC_POINT_CALLBACK("FileBlock::change_cache_type", &st); - RETURN_IF_ERROR(_mgr->_storage->change_key_meta_type(_key, new_type)); + RETURN_IF_ERROR(_mgr->_storage->change_key_meta_type(_key, new_type, _block_range.size())); } _mgr->change_cache_type(_key.hash, _block_range.left, new_type, cache_lock); _key.meta.type = new_type; @@ -209,8 +190,9 @@ Status FileBlock::change_cache_type_between_normal_and_index(FileCacheType new_t Status FileBlock::update_expiration_time(uint64_t expiration_time) { std::lock_guard block_lock(_mutex); if (_download_state == State::DOWNLOADED) { - auto st = _mgr->_storage->change_key_meta_expiration(_key, expiration_time); - if (!st.ok() && !st.is()) { + auto st = _mgr->_storage->change_key_meta_expiration(_key, 
expiration_time, + _block_range.size()); + if (!st.ok()) { return st; } } diff --git a/be/src/io/cache/file_block.h b/be/src/io/cache/file_block.h index 9cc4f06c6f0514..9ccb9a958707f2 100644 --- a/be/src/io/cache/file_block.h +++ b/be/src/io/cache/file_block.h @@ -114,13 +114,18 @@ class FileBlock { FileCacheType cache_type() const { return _key.meta.type; } + int64_t tablet_id() const { return _key.meta.tablet_id; } + + void set_tablet_id(int64_t id) { _key.meta.tablet_id = id; } + static uint64_t get_caller_id(); std::string get_info_for_log() const; - [[nodiscard]] Status change_cache_type_between_ttl_and_others(FileCacheType new_type); + [[nodiscard]] Status change_cache_type(FileCacheType new_type); - [[nodiscard]] Status change_cache_type_between_normal_and_index(FileCacheType new_type); + [[nodiscard]] Status change_cache_type_lock(FileCacheType new_type, + std::lock_guard&); [[nodiscard]] Status update_expiration_time(uint64_t expiration_time); diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index 64faec3beb3b3e..85af8b330a190d 100644 --- a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -131,8 +131,8 @@ std::string UInt128Wrapper::to_string() const { return vectorized::get_hex_uint_lowercase(value_); } -FileBlocksHolderPtr FileCacheAllocatorBuilder::allocate_cache_holder(size_t offset, - size_t size) const { +FileBlocksHolderPtr FileCacheAllocatorBuilder::allocate_cache_holder(size_t offset, size_t size, + int64_t tablet_id) const { CacheContext ctx; ctx.cache_type = _expiration_time == 0 ? 
FileCacheType::NORMAL : FileCacheType::TTL; ctx.expiration_time = _expiration_time; @@ -146,4 +146,121 @@ FileBlocksHolderPtr FileCacheAllocatorBuilder::allocate_cache_holder(size_t offs template size_t LRUQueue::get_capacity(std::lock_guard& cache_lock) const; template void LRUQueue::remove(Iterator queue_it, std::lock_guard& cache_lock); +std::string FileCacheInfo::to_string() const { + std::stringstream ss; + ss << "Hash: " << hash.to_string() << "\n" + << "Expiration Time: " << expiration_time << "\n" + << "Offset: " << offset << "\n" + << "Cache Type: " << cache_type_to_string(cache_type) << "\n"; + return ss.str(); +} + +std::string InconsistencyType::to_string() const { + std::string result = "Inconsistency Reason: "; + if (type == NONE) { + result += "NONE"; + } else { + if (type & NOT_LOADED) { + result += "NOT_LOADED "; + } + if (type & MISSING_IN_STORAGE) { + result += "MISSING_IN_STORAGE "; + } + if (type & SIZE_INCONSISTENT) { + result += "SIZE_INCONSISTENT "; + } + if (type & CACHE_TYPE_INCONSISTENT) { + result += "CACHE_TYPE_INCONSISTENT "; + } + if (type & EXPIRATION_TIME_INCONSISTENT) { + result += "EXPIRATION_TIME_INCONSISTENT "; + } + if (type & TMP_FILE_EXPECT_DOWNLOADING_STATE) { + result += "TMP_FILE_EXPECT_DOWNLOADING_STATE"; + } + } + result += "\n"; + return result; +} + +std::optional get_tablet_id(std::string file_path) { + // Expected path formats: + // support both .dat and .idx file extensions + // support formate see ut. 
storage_resource_test:StorageResourceTest.ParseTabletIdFromPath + + if (file_path.empty()) { + return std::nullopt; + } + + // Find the position of "data/" in the path + std::string_view path_view = file_path; + std::string_view data_prefix = DATA_PREFIX; + size_t data_pos = path_view.find(data_prefix); + if (data_pos == std::string_view::npos) { + return std::nullopt; + } + + // Extract the part after "data/" + path_view = path_view.substr(data_pos + data_prefix.length() + 1); + + // Check if path ends with .dat or .idx + if (!path_view.ends_with(".dat") && !path_view.ends_with(".idx")) { + return std::nullopt; + } + + // Count slashes in the remaining path + size_t slash_count = 0; + for (char c : path_view) { + if (c == '/') { + slash_count++; + } + } + + // Split path by '/' + std::vector parts; + size_t start = 0; + size_t pos = 0; + while ((pos = path_view.find('/', start)) != std::string_view::npos) { + if (pos > start) { + parts.push_back(path_view.substr(start, pos - start)); + } + start = pos + 1; + } + if (start < path_view.length()) { + parts.push_back(path_view.substr(start)); + } + + if (parts.empty()) { + return std::nullopt; + } + + // Determine path version based on slash count and extract tablet_id + // Version 0: {tablet_id}/{rowset_id}_{seg_id}.dat (1 slash) + // Version 1: {shard}/{tablet_id}/{rowset_id}/{seg_id}.dat (3 slashes) + + if (slash_count == 1) { + // Version 0 format: parts[0] should be tablet_id + if (parts.size() >= 1) { + try { + int64_t tablet_id = std::stoll(std::string(parts[0])); + return tablet_id; + } catch (const std::exception&) { + // Not a valid number, return nullopt at last + } + } + } else if (slash_count == 3) { + // Version 1 format: parts[1] should be tablet_id (parts[0] is shard) + if (parts.size() >= 2) { + try { + int64_t tablet_id = std::stoll(std::string(parts[1])); + return tablet_id; + } catch (const std::exception&) { + // Not a valid number, return nullopt at last + } + } + } + + return std::nullopt; +} + 
} // namespace doris::io diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index 090c219236b59d..9181c61c55d995 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -57,6 +57,11 @@ struct UInt128Wrapper { uint64_t high() const { return static_cast(value_ >> 64); } uint64_t low() const { return static_cast(value_); } + + friend std::ostream& operator<<(std::ostream& os, const UInt128Wrapper& wrapper) { + os << "UInt128Wrapper(" << wrapper.high() << ", " << wrapper.low() << ")"; + return os; + } }; struct ReadStatistics { @@ -86,7 +91,7 @@ struct FileCacheAllocatorBuilder { uint64_t _expiration_time; UInt128Wrapper _cache_hash; BlockFileCache* _cache; // Only one ref, the lifetime is owned by FileCache - FileBlocksHolderPtr allocate_cache_holder(size_t offset, size_t size) const; + FileBlocksHolderPtr allocate_cache_holder(size_t offset, size_t size, int64_t tablet_id) const; }; struct KeyHash { @@ -105,6 +110,7 @@ struct KeyAndOffsetHash { struct KeyMeta { uint64_t expiration_time; // absolute time FileCacheType type; + int64_t tablet_id {0}; }; struct FileCacheKey { @@ -164,6 +170,7 @@ struct CacheContext { bool is_cold_data {false}; ReadStatistics* stats; bool is_warmup {false}; + int64_t tablet_id {0}; }; template @@ -256,4 +263,6 @@ class LRUQueue { int64_t hot_data_interval {0}; }; +std::optional get_tablet_id(std::string file_path); + } // namespace doris::io diff --git a/be/src/io/cache/file_cache_storage.h b/be/src/io/cache/file_cache_storage.h index 024e701c6fa08b..e3c0c0c9aea583 100644 --- a/be/src/io/cache/file_cache_storage.h +++ b/be/src/io/cache/file_cache_storage.h @@ -50,15 +50,16 @@ class FileCacheStorage { // append datas into block virtual Status append(const FileCacheKey& key, const Slice& value) = 0; // finalize the block - virtual Status finalize(const FileCacheKey& key) = 0; + virtual Status finalize(const FileCacheKey& key, const size_t size) = 0; // read the block 
virtual Status read(const FileCacheKey& key, size_t value_offset, Slice result) = 0; // remove the block virtual Status remove(const FileCacheKey& key) = 0; // change the block meta - virtual Status change_key_meta_type(const FileCacheKey& key, const FileCacheType type) = 0; - virtual Status change_key_meta_expiration(const FileCacheKey& key, - const uint64_t expiration) = 0; + virtual Status change_key_meta_type(const FileCacheKey& key, const FileCacheType type, + const size_t size) = 0; + virtual Status change_key_meta_expiration(const FileCacheKey& key, const uint64_t expiration, + const size_t size) = 0; // use when lazy load cache virtual void load_blocks_directly_unlocked(BlockFileCache* _mgr, const FileCacheKey& key, std::lock_guard& cache_lock) {} diff --git a/be/src/io/cache/fs_file_cache_storage.cpp b/be/src/io/cache/fs_file_cache_storage.cpp index 3df56973af7149..0556be80c15257 100644 --- a/be/src/io/cache/fs_file_cache_storage.cpp +++ b/be/src/io/cache/fs_file_cache_storage.cpp @@ -17,6 +17,12 @@ #include "io/cache/fs_file_cache_storage.h" +#include +#include +#include +#include +#include + #include #include #include @@ -106,30 +112,38 @@ Status FSFileCacheStorage::init(BlockFileCache* _mgr) { _iterator_dir_retry_cnt = std::make_shared( _cache_base_path.c_str(), "file_cache_fs_storage_iterator_dir_retry_cnt"); _cache_base_path = _mgr->_cache_base_path; + _meta_store = std::make_unique(_cache_base_path + "/meta", 10000); _cache_background_load_thread = std::thread([this, mgr = _mgr]() { - auto mem_tracker = MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::OTHER, - fmt::format("FileCacheVersionReader")); - SCOPED_ATTACH_TASK(mem_tracker); - Status st = upgrade_cache_dir_if_necessary(); - if (!st.ok()) { - std::string msg = fmt::format( - "file cache {} upgrade done with error. upgrade version failed. 
st={}", - _cache_base_path, st.to_string()); - if (doris::config::ignore_file_cache_dir_upgrade_failure) { - LOG(WARNING) << msg << " be conf: `ignore_file_cache_dir_upgrade_failure = true`" - << " so we are ignoring the error (unsuccessful cache files will be " - "removed)"; - remove_old_version_directories(); - } else { - LOG(WARNING) << msg << " please fix error and restart BE or" - << " use be conf: `ignore_file_cache_dir_upgrade_failure = true`" - << " to skip the error (unsuccessful cache files will be removed)"; - throw doris::Exception(Status::InternalError(msg)); + try { + auto mem_tracker = MemTrackerLimiter::create_shared( + MemTrackerLimiter::Type::OTHER, fmt::format("FileCacheVersionReader")); + SCOPED_ATTACH_TASK(mem_tracker); + Status st = upgrade_cache_dir_if_necessary(); + if (!st.ok()) { + std::string msg = fmt::format( + "file cache {} upgrade done with error. upgrade version failed. st={}", + _cache_base_path, st.to_string()); + if (doris::config::ignore_file_cache_dir_upgrade_failure) { + LOG(WARNING) + << msg << " be conf: `ignore_file_cache_dir_upgrade_failure = true`" + << " so we are ignoring the error (unsuccessful cache files will be " + "removed)"; + remove_old_version_directories(); + } else { + LOG(WARNING) << msg << " please fix error and restart BE or" + << " use be conf: `ignore_file_cache_dir_upgrade_failure = true`" + << " to skip the error (unsuccessful cache files will be removed)"; + throw doris::Exception(Status::InternalError(msg)); + } } + load_cache_info_into_memory(mgr); + mgr->_async_open_done = true; + LOG_INFO("file cache {} lazy load done.", _cache_base_path); + } catch (const std::exception& e) { + LOG(ERROR) << "Background cache loading thread failed with exception: " << e.what(); + } catch (...) 
{ + LOG(ERROR) << "Background cache loading thread failed with unknown exception"; } - load_cache_info_into_memory(mgr); - mgr->_async_open_done = true; - LOG_INFO("file cache {} lazy load done.", _cache_base_path); }); return Status::OK(); } @@ -142,12 +156,12 @@ Status FSFileCacheStorage::append(const FileCacheKey& key, const Slice& value) { if (auto iter = _key_to_writer.find(file_writer_map_key); iter != _key_to_writer.end()) { writer = iter->second.get(); } else { - std::string dir = get_path_in_local_cache(key.hash, key.meta.expiration_time); + std::string dir = get_path_in_local_cache_v3(key.hash); auto st = fs->create_directory(dir, false); if (!st.ok() && !st.is()) { return st; } - std::string tmp_file = get_path_in_local_cache(dir, key.offset, key.meta.type, true); + std::string tmp_file = get_path_in_local_cache_v3(dir, key.offset, true); FileWriterPtr file_writer; FileWriterOptions opts {.sync_file_data = false}; RETURN_IF_ERROR(fs->create_file(tmp_file, &file_writer, &opts)); @@ -159,7 +173,7 @@ Status FSFileCacheStorage::append(const FileCacheKey& key, const Slice& value) { return writer->append(value); } -Status FSFileCacheStorage::finalize(const FileCacheKey& key) { +Status FSFileCacheStorage::finalize(const FileCacheKey& key, const size_t size) { FileWriterPtr file_writer; { std::lock_guard lock(_mtx); @@ -172,9 +186,18 @@ Status FSFileCacheStorage::finalize(const FileCacheKey& key) { if (file_writer->state() != FileWriter::State::CLOSED) { RETURN_IF_ERROR(file_writer->close()); } - std::string dir = get_path_in_local_cache(key.hash, key.meta.expiration_time); - std::string true_file = get_path_in_local_cache(dir, key.offset, key.meta.type); - return fs->rename(file_writer->path(), true_file); + std::string dir = get_path_in_local_cache_v3(key.hash); + std::string true_file = get_path_in_local_cache_v3(dir, key.offset); + auto s = fs->rename(file_writer->path(), true_file); + if (!s.ok()) { + return s; + } + + BlockMetaKey mkey(key.meta.tablet_id, 
UInt128Wrapper(key.hash), key.offset); + BlockMeta meta(key.meta.type, size, key.meta.expiration_time); + _meta_store->put(mkey, meta); + + return Status::OK(); } Status FSFileCacheStorage::read(const FileCacheKey& key, size_t value_offset, Slice buffer) { @@ -182,29 +205,24 @@ Status FSFileCacheStorage::read(const FileCacheKey& key, size_t value_offset, Sl FileReaderSPtr file_reader = FDCache::instance()->get_file_reader(fd_key); if (!file_reader) { std::string file = - get_path_in_local_cache(get_path_in_local_cache(key.hash, key.meta.expiration_time), - key.offset, key.meta.type); + get_path_in_local_cache_v3(get_path_in_local_cache_v3(key.hash), key.offset); Status s = fs->open_file(file, &file_reader); - - // handle the case that the file is not found but actually exists in other type format - // TODO(zhengyu): nasty! better eliminate the type encoding in file name in the future - if (!s.ok() && !s.is()) { - LOG(WARNING) << "open file failed, file=" << file << ", error=" << s.to_string(); - return s; // return other error directly - } else if (!s.ok() && s.is()) { // but handle NOT_FOUND error - auto candidates = get_path_in_local_cache_all_candidates( - get_path_in_local_cache(key.hash, key.meta.expiration_time), key.offset); - for (auto& candidate : candidates) { - s = fs->open_file(candidate, &file_reader); - if (s.ok()) { - break; // success with one of there candidates + if (!s.ok()) { + if (s.is()) { + // Try to open file with old v2 format + std::string dir = get_path_in_local_cache_v2(key.hash, key.meta.expiration_time); + std::string v2_file = get_path_in_local_cache_v2(dir, key.offset, key.meta.type); + Status s2 = fs->open_file(v2_file, &file_reader); + if (!s2.ok()) { + LOG(WARNING) << "open file failed with both v3 and v2 format, v3_file=" << file + << ", v2_file=" << v2_file << ", error=" << s2.to_string(); + return s2; } - } - if (!s.ok()) { // still not found, return error + } else { LOG(WARNING) << "open file failed, file=" << file << ", 
error=" << s.to_string(); return s; } - } // else, s.ok() means open file success + } FDCache::instance()->insert_file_reader(fd_key, file_reader); } @@ -220,23 +238,20 @@ Status FSFileCacheStorage::read(const FileCacheKey& key, size_t value_offset, Sl } Status FSFileCacheStorage::remove(const FileCacheKey& key) { - std::string dir = get_path_in_local_cache(key.hash, key.meta.expiration_time); - std::string file = get_path_in_local_cache(dir, key.offset, key.meta.type); + std::string dir = get_path_in_local_cache_v3(key.hash); + std::string file = get_path_in_local_cache_v3(dir, key.offset); FDCache::instance()->remove_file_reader(std::make_pair(key.hash, key.offset)); RETURN_IF_ERROR(fs->delete_file(file)); // return OK not means the file is deleted, it may be not exist - // So for TTL, we make sure the old format will be removed well - if (key.meta.type == FileCacheType::TTL) { - bool exists {false}; - // try to detect the file with old ttl format - file = get_path_in_local_cache_old_ttl_format(dir, key.offset, key.meta.type); - RETURN_IF_ERROR(fs->exists(file, &exists)); - if (exists) { - VLOG(7) << "try to remove the file with old ttl format" - << " file=" << file; - RETURN_IF_ERROR(fs->delete_file(file)); - } + + { // try to detect the file with old v2 format + dir = get_path_in_local_cache_v2(key.hash, key.meta.expiration_time); + file = get_path_in_local_cache_v2(dir, key.offset, key.meta.type); + RETURN_IF_ERROR(fs->delete_file(file)); } + + BlockMetaKey mkey(key.meta.tablet_id, UInt128Wrapper(key.hash), key.offset); + _meta_store->delete_key(mkey); std::vector files; bool exists {false}; RETURN_IF_ERROR(fs->list(dir, true, &files, &exists)); @@ -246,43 +261,39 @@ Status FSFileCacheStorage::remove(const FileCacheKey& key) { return Status::OK(); } -Status FSFileCacheStorage::change_key_meta_type(const FileCacheKey& key, const FileCacheType type) { +Status FSFileCacheStorage::change_key_meta_type(const FileCacheKey& key, const FileCacheType type, + const 
size_t size) { // file operation if (key.meta.type != type) { - // TTL type file dose not need to change the suffix - bool expr = (key.meta.type != FileCacheType::TTL && type != FileCacheType::TTL); - if (!expr) { - LOG(WARNING) << "TTL type file dose not need to change the suffix" - << " key=" << key.hash.to_string() << " offset=" << key.offset - << " old_type=" << cache_type_to_string(key.meta.type) - << " new_type=" << cache_type_to_string(type); - } - DCHECK(expr); - std::string dir = get_path_in_local_cache(key.hash, key.meta.expiration_time); - std::string original_file = get_path_in_local_cache(dir, key.offset, key.meta.type); - std::string new_file = get_path_in_local_cache(dir, key.offset, type); - RETURN_IF_ERROR(fs->rename(original_file, new_file)); + BlockMetaKey mkey(key.meta.tablet_id, UInt128Wrapper(key.hash), key.offset); + BlockMeta meta(type, size, key.meta.expiration_time); + _meta_store->put(mkey, meta); } return Status::OK(); } Status FSFileCacheStorage::change_key_meta_expiration(const FileCacheKey& key, - const uint64_t expiration) { - // directory operation + const uint64_t expiration, + const size_t size) { if (key.meta.expiration_time != expiration) { - std::string original_dir = get_path_in_local_cache(key.hash, key.meta.expiration_time); - std::string new_dir = get_path_in_local_cache(key.hash, expiration); - // It will be concurrent, but we don't care who rename - Status st = fs->rename(original_dir, new_dir); - if (!st.ok() && !st.is()) { - return st; - } + BlockMetaKey mkey(key.meta.tablet_id, UInt128Wrapper(key.hash), key.offset); + BlockMeta meta(key.meta.type, size, expiration); + _meta_store->put(mkey, meta); } return Status::OK(); } -std::string FSFileCacheStorage::get_path_in_local_cache(const std::string& dir, size_t offset, - FileCacheType type, bool is_tmp) { +std::string FSFileCacheStorage::get_path_in_local_cache_v3(const std::string& dir, size_t offset, + bool is_tmp) { + if (is_tmp) { + return Path(dir) / 
(std::to_string(offset) + "_tmp"); + } else { + return Path(dir) / std::to_string(offset); + } +} + +std::string FSFileCacheStorage::get_path_in_local_cache_v2(const std::string& dir, size_t offset, + FileCacheType type, bool is_tmp) { if (is_tmp) { return Path(dir) / (std::to_string(offset) + "_tmp"); } else if (type == FileCacheType::TTL) { @@ -292,35 +303,24 @@ std::string FSFileCacheStorage::get_path_in_local_cache(const std::string& dir, } } -std::string FSFileCacheStorage::get_path_in_local_cache_old_ttl_format(const std::string& dir, - size_t offset, - FileCacheType type, - bool is_tmp) { - DCHECK(type == FileCacheType::TTL); - return Path(dir) / (std::to_string(offset) + cache_type_to_surfix(type)); -} - -std::vector FSFileCacheStorage::get_path_in_local_cache_all_candidates( - const std::string& dir, size_t offset) { - std::vector candidates; - std::string base = get_path_in_local_cache(dir, offset, FileCacheType::NORMAL); - candidates.push_back(base); - candidates.push_back(base + "_idx"); - candidates.push_back(base + "_ttl"); - candidates.push_back(base + "_disposable"); - return candidates; +std::string FSFileCacheStorage::get_path_in_local_cache_v3(const UInt128Wrapper& value) const { + auto str = value.to_string(); + try { + return Path(_cache_base_path) / str.substr(0, KEY_PREFIX_LENGTH) / (str + "_0"); + } catch (std::filesystem::filesystem_error& e) { + LOG_WARNING("fail to get_path_in_local_cache") + .tag("err", e.what()) + .tag("key", value.to_string()); + return ""; + } } -std::string FSFileCacheStorage::get_path_in_local_cache(const UInt128Wrapper& value, - uint64_t expiration_time) const { +std::string FSFileCacheStorage::get_path_in_local_cache_v2(const UInt128Wrapper& value, + uint64_t expiration_time) const { auto str = value.to_string(); try { - if constexpr (USE_CACHE_VERSION2) { - return Path(_cache_base_path) / str.substr(0, KEY_PREFIX_LENGTH) / - (str + "_" + std::to_string(expiration_time)); - } else { - return Path(_cache_base_path) 
/ (str + "_" + std::to_string(expiration_time)); - } + return Path(_cache_base_path) / str.substr(0, KEY_PREFIX_LENGTH) / + (str + "_" + std::to_string(expiration_time)); } catch (std::filesystem::filesystem_error& e) { LOG_WARNING("fail to get_path_in_local_cache") .tag("err", e.what()) @@ -424,122 +424,31 @@ Status FSFileCacheStorage::collect_directory_entries(const std::filesystem::path } Status FSFileCacheStorage::upgrade_cache_dir_if_necessary() const { - /* - * If use version2 but was version 1, do upgrade: - * - * Action I: - * version 1.0: cache_base_path / key / offset - * version 2.0: cache_base_path / key_prefix / key / offset - * - * Action II: - * add '_0' to hash dir - * - * Note: This is a sync operation with tons of IOs, so it may affect BE - * boot time heavily. Fortunately, Action I & II will only happen when - * upgrading (once in the cluster life time). - */ - std::string version; - std::error_code ec; int rename_count = 0; int failure_count = 0; auto start_time = std::chrono::steady_clock::now(); RETURN_IF_ERROR(read_file_cache_version(&version)); - LOG(INFO) << "Checking cache version upgrade. 
Current version: " << version - << ", target version: 2.0, need upgrade: " - << (USE_CACHE_VERSION2 && version != "2.0"); - if (USE_CACHE_VERSION2 && version != "2.0") { - // move directories format as version 2.0 - std::vector file_list; - file_list.reserve(10000); - RETURN_IF_ERROR(collect_directory_entries(_cache_base_path, file_list)); - - // this directory_iterator should be a problem in concurrent access - for (const auto& file_path : file_list) { - try { - if (std::filesystem::is_directory(file_path)) { - std::string cache_key = std::filesystem::path(file_path).filename().native(); - if (cache_key.size() > KEY_PREFIX_LENGTH) { - if (cache_key.find('_') == std::string::npos) { - cache_key += "_0"; - } - std::string key_prefix = - Path(_cache_base_path) / cache_key.substr(0, KEY_PREFIX_LENGTH); - bool exists = false; - auto exists_status = fs->exists(key_prefix, &exists); - if (!exists_status.ok()) { - LOG(WARNING) << "Failed to check directory existence: " << key_prefix - << ", error: " << exists_status.to_string(); - ++failure_count; - continue; - } - if (!exists) { - auto create_status = fs->create_directory(key_prefix); - if (!create_status.ok() && - create_status.code() != TStatusCode::type::ALREADY_EXIST) { - LOG(WARNING) << "Failed to create directory: " << key_prefix - << ", error: " << create_status.to_string(); - ++failure_count; - continue; - } - } - auto rename_status = Status::OK(); - const std::string new_file_path = key_prefix + "/" + cache_key; - TEST_SYNC_POINT_CALLBACK( - "FSFileCacheStorage::upgrade_cache_dir_if_necessary_rename", - &file_path, &new_file_path); - rename_status = fs->rename(file_path, new_file_path); - if (rename_status.ok() || - rename_status.code() == TStatusCode::type::DIRECTORY_NOT_EMPTY) { - ++rename_count; - } else { - LOG(WARNING) - << "Failed to rename directory from " << file_path << " to " - << new_file_path << ", error: " << rename_status.to_string(); - ++failure_count; - continue; - } - } - } - } catch (const 
std::exception& e) { - LOG(WARNING) << "Error occurred while upgrading file cache directory: " << file_path - << " err: " << e.what(); - ++failure_count; - } - } - - std::vector rebuilt_file_list; - rebuilt_file_list.reserve(10000); - RETURN_IF_ERROR(collect_directory_entries(_cache_base_path, rebuilt_file_list)); - - for (const auto& key_it : rebuilt_file_list) { - if (!std::filesystem::is_directory(key_it)) { - // maybe version hits file - continue; - } - try { - if (Path(key_it).filename().native().size() != KEY_PREFIX_LENGTH) { - LOG(WARNING) << "Unknown directory " << key_it << ", try to remove it"; - auto delete_status = fs->delete_directory(key_it); - if (!delete_status.ok()) { - LOG(WARNING) << "Failed to delete unknown directory: " << key_it - << ", error: " << delete_status.to_string(); - ++failure_count; - continue; - } - } - } catch (const std::exception& e) { - LOG(WARNING) << "Error occurred while upgrading file cache directory: " << key_it - << " err: " << e.what(); - ++failure_count; - } - } - if (auto st = write_file_cache_version(); !st.ok()) { - return Status::InternalError("Failed to write version hints for file cache, err={}", - st.to_string()); - } + if (version == "1.0") { + LOG(ERROR) << "Cache version upgrade issue: Cannot upgrade directly from 1.0 to 3.0.Please " + "upgrade to 2.0 first (>= doris-3.0.0),or clear the file cache directory to " + "start anew " + "(LOSING ALL THE CACHE)."; + exit(-1); + } else if (version == "2.0") { + LOG(INFO) << "Cache will upgrade from 2.0 to 3.0 progressively during running. 2.0 data " + "format will evict eventually."; + return Status::OK(); + } else if (version == "3.0") { + LOG(INFO) << "Readly 3.0 format, no need to upgrade."; + return Status::OK(); + } else { + LOG(ERROR) << "Cache version upgrade issue: current version " << version + << " is not valid. 
Clear the file cache directory to start anew (LOSING ALL THE " + "CACHE)."; + exit(-1); } auto end_time = std::chrono::steady_clock::now(); @@ -551,15 +460,30 @@ Status FSFileCacheStorage::upgrade_cache_dir_if_necessary() const { } Status FSFileCacheStorage::write_file_cache_version() const { - if constexpr (USE_CACHE_VERSION2) { - std::string version_path = get_version_path(); - Slice version("2.0"); - FileWriterPtr version_writer; - RETURN_IF_ERROR(fs->create_file(version_path, &version_writer)); - RETURN_IF_ERROR(version_writer->append(version)); - return version_writer->close(); - } - return Status::OK(); + std::string version_path = get_version_path(); + + rapidjson::Document doc; + doc.SetObject(); + rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); + + // Add version field to JSON + rapidjson::Value version_value; + version_value.SetString("3.0", allocator); + doc.AddMember("version", version_value, allocator); + + // Serialize JSON to string + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + doc.Accept(writer); + + // Combine version string with JSON for backward compatibility + std::string version_content = "3.0" + std::string(buffer.GetString(), buffer.GetSize()); + Slice version_slice(version_content); + + FileWriterPtr version_writer; + RETURN_IF_ERROR(fs->create_file(version_path, &version_writer)); + RETURN_IF_ERROR(version_writer->append(version_slice)); + return version_writer->close(); } Status FSFileCacheStorage::read_file_cache_version(std::string* buffer) const { @@ -567,7 +491,7 @@ Status FSFileCacheStorage::read_file_cache_version(std::string* buffer) const { bool exists = false; RETURN_IF_ERROR(fs->exists(version_path, &exists)); if (!exists) { - *buffer = "1.0"; + *buffer = "2.0"; // return 2.0 if not exist to utilize filesystem return Status::OK(); } FileReaderSPtr version_reader; @@ -578,6 +502,19 @@ Status FSFileCacheStorage::read_file_cache_version(std::string* buffer) const { size_t bytes_read = 0; 
RETURN_IF_ERROR(version_reader->read_at(0, Slice(buffer->data(), file_size), &bytes_read)); RETURN_IF_ERROR(version_reader->close()); + + // Extract only the version number part (before JSON starts) for backward compatibility + // New format: "3.0{\"version\":\"3.0\"}", old format: "3.0" + std::string content = *buffer; + size_t json_start = content.find('{'); + if (json_start != std::string::npos) { + // New format with JSON, extract version number only + *buffer = content.substr(0, json_start); + } else { + // Old format, keep as is + *buffer = content; + } + auto st = Status::OK(); TEST_SYNC_POINT_CALLBACK("FSFileCacheStorage::read_file_cache_version", &st); return st; @@ -653,7 +590,7 @@ Status FSFileCacheStorage::parse_filename_suffix_to_cache_type( return Status::OK(); } -void FSFileCacheStorage::load_cache_info_into_memory(BlockFileCache* _mgr) const { +void FSFileCacheStorage::load_cache_info_into_memory_from_fs(BlockFileCache* _mgr) const { int scan_length = 10000; std::vector batch_load_buffer; batch_load_buffer.reserve(scan_length); @@ -732,98 +669,270 @@ void FSFileCacheStorage::load_cache_info_into_memory(BlockFileCache* _mgr) const } }; std::error_code ec; - if constexpr (USE_CACHE_VERSION2) { - TEST_SYNC_POINT_CALLBACK("BlockFileCache::BeforeScan"); - std::filesystem::directory_iterator key_prefix_it {_cache_base_path, ec}; + + TEST_SYNC_POINT_CALLBACK("BlockFileCache::BeforeScan"); + std::filesystem::directory_iterator key_prefix_it {_cache_base_path, ec}; + if (ec) { + LOG(WARNING) << ec.message(); + return; + } + for (; key_prefix_it != std::filesystem::directory_iterator(); ++key_prefix_it) { + if (!key_prefix_it->is_directory()) { + // skip version file + continue; + } + if (key_prefix_it->path().filename().native() == "meta") { + // skip rocksdb dir + continue; + } + if (key_prefix_it->path().filename().native().size() != KEY_PREFIX_LENGTH) { + LOG(WARNING) << "Unknown directory " << key_prefix_it->path().native() + << ", try to remove it"; + 
std::filesystem::remove(key_prefix_it->path(), ec); + if (ec) { + LOG(WARNING) << "failed to remove=" << key_prefix_it->path() + << " msg=" << ec.message(); + } + continue; + } + std::filesystem::directory_iterator key_it {key_prefix_it->path(), ec}; if (ec) { LOG(WARNING) << ec.message(); - return; + continue; } - for (; key_prefix_it != std::filesystem::directory_iterator(); ++key_prefix_it) { - if (!key_prefix_it->is_directory()) { - // skip version file - continue; + scan_file_cache(key_it); + } + + if (!batch_load_buffer.empty()) { + add_cell_batch_func(); + } + TEST_SYNC_POINT_CALLBACK("BlockFileCache::TmpFile2"); +} + +Status FSFileCacheStorage::get_file_cache_infos(std::vector& infos, + std::lock_guard& cache_lock) const { + std::error_code ec; + std::filesystem::directory_iterator key_prefix_it {_cache_base_path, ec}; + if (ec) [[unlikely]] { + LOG(ERROR) << fmt::format("Failed to list dir {}, err={}", _cache_base_path, ec.message()); + return Status::InternalError("Failed to list dir {}, err={}", _cache_base_path, + ec.message()); + } + // Only supports version 2. For more details, refer to 'USE_CACHE_VERSION2'. 
+ for (; key_prefix_it != std::filesystem::directory_iterator(); ++key_prefix_it) { + if (!key_prefix_it->is_directory()) { + // skip version file + continue; + } + if (key_prefix_it->path().filename().native().size() != KEY_PREFIX_LENGTH) { + LOG(WARNING) << "Unknown directory " << key_prefix_it->path().native(); + continue; + } + std::filesystem::directory_iterator key_it {key_prefix_it->path(), ec}; + if (ec) [[unlikely]] { + LOG(ERROR) << fmt::format("Failed to list dir {}, err={}", + key_prefix_it->path().filename().native(), ec.message()); + return Status::InternalError("Failed to list dir {}, err={}", + key_prefix_it->path().filename().native(), ec.message()); + } + for (; key_it != std::filesystem::directory_iterator(); ++key_it) { + auto key_with_suffix = key_it->path().filename().native(); + auto delim_pos = key_with_suffix.find('_'); + DCHECK(delim_pos != std::string::npos); + std::string key_str = key_with_suffix.substr(0, delim_pos); + std::string expiration_time_str = key_with_suffix.substr(delim_pos + 1); + long expiration_time = std::stoul(expiration_time_str); + auto hash = UInt128Wrapper(vectorized::unhex_uint(key_str.c_str())); + std::filesystem::directory_iterator offset_it(key_it->path(), ec); + if (ec) [[unlikely]] { + LOG(ERROR) << fmt::format("Failed to list dir {}, err={}", + key_it->path().filename().native(), ec.message()); + return Status::InternalError("Failed to list dir {}, err={}", + key_it->path().filename().native(), ec.message()); } - if (key_prefix_it->path().filename().native().size() != KEY_PREFIX_LENGTH) { - LOG(WARNING) << "Unknown directory " << key_prefix_it->path().native() - << ", try to remove it"; - std::error_code ec; - std::filesystem::remove(key_prefix_it->path(), ec); - if (ec) { - LOG(WARNING) << "failed to remove=" << key_prefix_it->path() - << " msg=" << ec.message(); + for (; offset_it != std::filesystem::directory_iterator(); ++offset_it) { + size_t size = offset_it->file_size(ec); + if (ec) [[unlikely]] { + 
LOG(ERROR) << fmt::format("Failed to get file size, file name {}, err={}", + key_it->path().filename().native(), ec.message()); + return Status::InternalError("Failed to get file size, file name {}, err={}", + key_it->path().filename().native(), ec.message()); } - continue; - } - std::filesystem::directory_iterator key_it {key_prefix_it->path(), ec}; - if (ec) { - LOG(WARNING) << ec.message(); - continue; + size_t offset = 0; + bool is_tmp = false; + FileCacheType cache_type = FileCacheType::NORMAL; + RETURN_IF_ERROR(this->parse_filename_suffix_to_cache_type( + fs, offset_it->path().filename().native(), expiration_time, size, &offset, + &is_tmp, &cache_type)); + infos.emplace_back(hash, expiration_time, size, offset, is_tmp, cache_type); } - scan_file_cache(key_it); } - } else { - std::filesystem::directory_iterator key_it {_cache_base_path, ec}; - if (ec) { - LOG(WARNING) << ec.message(); + } + return Status::OK(); +} + +void FSFileCacheStorage::load_cache_info_into_memory_from_db(BlockFileCache* _mgr) const { + TEST_SYNC_POINT_CALLBACK("BlockFileCache::TmpFile1"); + int scan_length = 10000; + std::vector batch_load_buffer; + batch_load_buffer.reserve(scan_length); + auto add_cell_batch_func = [&]() { + SCOPED_CACHE_LOCK(_mgr->_mutex, _mgr); + + auto f = [&](const BatchLoadArgs& args) { + // in async load mode, a cell may be added twice. 
+ if (_mgr->_files.contains(args.hash) && _mgr->_files[args.hash].contains(args.offset)) { + auto block = _mgr->_files[args.hash][args.offset].file_block; + if (block->tablet_id() == 0) { + block->set_tablet_id(args.ctx.tablet_id); + } + if (block->cache_type() == io::FileCacheType::TTL && + block->expiration_time() != args.ctx.expiration_time) { + auto s = block->update_expiration_time(args.ctx.expiration_time); + if (!s.ok()) { + LOG(WARNING) << "update expiration time for " << args.hash.to_string() + << " offset=" << args.offset; + } + } + return; + } + _mgr->add_cell(args.hash, args.ctx, args.offset, args.size, + FileBlock::State::DOWNLOADED, cache_lock); return; + }; + std::for_each(batch_load_buffer.begin(), batch_load_buffer.end(), f); + batch_load_buffer.clear(); + }; + + auto iterator = _meta_store->get_all(); + if (!iterator) { + LOG(WARNING) << "Failed to create iterator for meta store"; + return; + } + + while (iterator->valid()) { + BlockMetaKey meta_key = iterator->key(); + BlockMeta meta_value = iterator->value(); + + // Check for deserialization errors + if (!iterator->get_last_key_error().ok() || !iterator->get_last_value_error().ok()) { + LOG(WARNING) << "Failed to deserialize cache block metadata: " + << "key_error=" << iterator->get_last_key_error().to_string() + << ", value_error=" << iterator->get_last_value_error().to_string(); + iterator->next(); + continue; // Skip invalid records } - scan_file_cache(key_it); + + VLOG_DEBUG << "Processing cache block: tablet_id=" << meta_key.tablet_id + << ", hash=" << meta_key.hash.low() << "-" << meta_key.hash.high() + << ", offset=" << meta_key.offset << ", type=" << meta_value.type + << ", size=" << meta_value.size << ", ttl=" << meta_value.ttl; + + BatchLoadArgs args; + args.hash = meta_key.hash; + args.offset = meta_key.offset; + args.size = meta_value.size; + args.is_tmp = false; + + CacheContext ctx; + ctx.cache_type = static_cast(meta_value.type); + ctx.expiration_time = meta_value.ttl; + 
ctx.tablet_id = + meta_key.tablet_id; //TODO(zhengyu): zero if loaded from v2, we can use this to decide whether the block is loaded from v2 or v3 + args.ctx = ctx; + + args.key_path = ""; + args.offset_path = ""; + + batch_load_buffer.push_back(std::move(args)); + + if (batch_load_buffer.size() >= scan_length) { + add_cell_batch_func(); + std::this_thread::sleep_for(std::chrono::microseconds(10)); + } + + iterator->next(); } + + LOG(INFO) << "Finished loading cache info from meta store using RocksDB iterator"; + if (!batch_load_buffer.empty()) { add_cell_batch_func(); } TEST_SYNC_POINT_CALLBACK("BlockFileCache::TmpFile2"); } +void FSFileCacheStorage::load_cache_info_into_memory(BlockFileCache* _mgr) const { + // First load from database + load_cache_info_into_memory_from_db(_mgr); + + std::string version; + auto st = read_file_cache_version(&version); + if (!st.ok()) { + LOG(WARNING) << "Failed to read file cache version: " << st.to_string(); + return; + } + if (version == "3.0") { + return; + } + + // Count blocks loaded from database + size_t db_block_count = 0; + { + std::lock_guard lock(_mgr->_mutex); + for (const auto& hash_entry : _mgr->_files) { + db_block_count += hash_entry.second.size(); + } + } + + // Estimate file count from filesystem using statfs + size_t estimated_file_count = estimate_file_count_from_statfs(); + + LOG(INFO) << "Cache loading statistics - DB blocks: " << db_block_count + << ", Estimated FS files: " << estimated_file_count; + + // If the difference is more than threshold, load from filesystem as well + if (estimated_file_count > 0) { + double difference_ratio = + (static_cast(estimated_file_count) - + static_cast(db_block_count)) / static_cast(estimated_file_count); + + if (difference_ratio > config::file_cache_meta_store_vs_file_system_diff_num_threshold) { + LOG(WARNING) << "Significant difference between DB blocks (" << db_block_count + ") and estimated FS files (" << estimated_file_count + "), difference ratio: " << 
difference_ratio * 100 << "%" + << ". Loading from filesystem as well."; + load_cache_info_into_memory_from_fs(_mgr); + } else { + LOG(INFO) << "DB and FS counts are consistent, difference ratio: " + << difference_ratio * 100 << "%, skipping FS load."; + if (st = write_file_cache_version(); !st.ok()) { + LOG(WARNING) << "Failed to write version hints for file cache, err=" + << st.to_string(); + } + // TODO(zhengyu): use anti-leak mechanism to remove v2 format directory + } + } +} + void FSFileCacheStorage::load_blocks_directly_unlocked(BlockFileCache* mgr, const FileCacheKey& key, std::lock_guard& cache_lock) { - // async load, can't find key, need to check exist. - auto key_path = get_path_in_local_cache(key.hash, key.meta.expiration_time); - bool exists = false; - auto st = fs->exists(key_path, &exists); - if (auto st = fs->exists(key_path, &exists); !exists && st.ok()) { + BlockMetaKey mkey(key.meta.tablet_id, UInt128Wrapper(key.hash), key.offset); + auto block_meta = _meta_store->get(mkey); + if (!block_meta.has_value()) { // cache miss return; - } else if (!st.ok()) [[unlikely]] { - LOG_WARNING("failed to exists file {}", key_path).error(st); - return; } CacheContext context_original; context_original.query_id = TUniqueId(); - context_original.expiration_time = key.meta.expiration_time; - std::error_code ec; - std::filesystem::directory_iterator check_it(key_path, ec); - if (ec) [[unlikely]] { - LOG(WARNING) << "fail to directory_iterator " << ec.message(); - return; - } - for (; check_it != std::filesystem::directory_iterator(); ++check_it) { - size_t size = check_it->file_size(ec); - size_t offset = 0; - bool is_tmp = false; - FileCacheType cache_type = FileCacheType::NORMAL; - if (!parse_filename_suffix_to_cache_type(fs, check_it->path().filename().native(), - context_original.expiration_time, size, &offset, - &is_tmp, &cache_type)) { - continue; - } - if (!mgr->_files.contains(key.hash) || !mgr->_files[key.hash].contains(offset)) { - // if the file is tmp, 
it means it is the old file and it should be removed - if (is_tmp) { - std::error_code ec; - std::filesystem::remove(check_it->path(), ec); - if (ec) { - LOG(WARNING) << fmt::format("cannot remove {}: {}", check_it->path().native(), - ec.message()); - } - } else { - context_original.cache_type = cache_type; - mgr->add_cell(key.hash, context_original, offset, size, - FileBlock::State::DOWNLOADED, cache_lock); - } - } + context_original.expiration_time = block_meta->ttl; + context_original.cache_type = static_cast(block_meta->type); + context_original.tablet_id = key.meta.tablet_id; + + if (!mgr->_files.contains(key.hash) || !mgr->_files[key.hash].contains(key.offset)) { + mgr->add_cell(key.hash, context_original, key.offset, block_meta->size, + FileBlock::State::DOWNLOADED, cache_lock); } } @@ -849,6 +958,7 @@ Status FSFileCacheStorage::clear(std::string& msg) { LOG(WARNING) << "failed to clear base_path=" << _cache_base_path << " path_to_delete=" << cache_key << " error=" << st; } + _meta_store->clear(); auto t1 = std::chrono::steady_clock::now(); std::stringstream ss; ss << "finished clear file storage, path=" << _cache_base_path @@ -863,8 +973,7 @@ Status FSFileCacheStorage::clear(std::string& msg) { } std::string FSFileCacheStorage::get_local_file(const FileCacheKey& key) { - return get_path_in_local_cache(get_path_in_local_cache(key.hash, key.meta.expiration_time), - key.offset, key.meta.type, false); + return get_path_in_local_cache_v3(get_path_in_local_cache_v3(key.hash), key.offset, false); } FSFileCacheStorage::~FSFileCacheStorage() { @@ -873,4 +982,40 @@ FSFileCacheStorage::~FSFileCacheStorage() { } } +size_t FSFileCacheStorage::estimate_file_count_from_statfs() const { + struct statvfs vfs; + if (statvfs(_cache_base_path.c_str(), &vfs) != 0) { + LOG(WARNING) << "Failed to get filesystem statistics for path: " << _cache_base_path + << ", error: " << strerror(errno); + return 0; + } + + // Get total size of cache directory to estimate file count + 
std::error_code ec; + uintmax_t total_size = 0; + for (const auto& entry : std::filesystem::recursive_directory_iterator(_cache_base_path, ec)) { + if (ec) { + LOG(WARNING) << "Error accessing directory entry: " << ec.message(); + continue; + } + if (entry.is_regular_file()) { + total_size += entry.file_size(); + } + } + + if (total_size == 0) { + return 0; + } + + // Estimate file count based on average file size + // Assuming average file size of 1MB for cache blocks + const uintmax_t average_file_size = 1024 * 1024; // 1MB + size_t estimated_file_count = total_size / average_file_size; + + LOG(INFO) << "Estimated file count for cache path " << _cache_base_path + << ": total_size=" << total_size << ", estimated_files=" << estimated_file_count; + + return estimated_file_count; +} + } // namespace doris::io diff --git a/be/src/io/cache/fs_file_cache_storage.h b/be/src/io/cache/fs_file_cache_storage.h index 114517bdf72706..daf0acef82d67c 100644 --- a/be/src/io/cache/fs_file_cache_storage.h +++ b/be/src/io/cache/fs_file_cache_storage.h @@ -23,6 +23,7 @@ #include #include +#include "io/cache/cache_block_meta_store.h" #include "io/cache/file_cache_common.h" #include "io/cache/file_cache_storage.h" #include "io/fs/file_reader.h" @@ -54,40 +55,43 @@ class FDCache { class FSFileCacheStorage : public FileCacheStorage { public: - /// use version 2 when USE_CACHE_VERSION2 = true, while use version 1 if false /// version 1.0: cache_base_path / key / offset /// version 2.0: cache_base_path / key_prefix / key / offset - static constexpr bool USE_CACHE_VERSION2 = true; static constexpr int KEY_PREFIX_LENGTH = 3; FSFileCacheStorage() = default; ~FSFileCacheStorage() override; Status init(BlockFileCache* _mgr) override; Status append(const FileCacheKey& key, const Slice& value) override; - Status finalize(const FileCacheKey& key) override; + Status finalize(const FileCacheKey& key, const size_t size) override; Status read(const FileCacheKey& key, size_t value_offset, Slice buffer) 
override; Status remove(const FileCacheKey& key) override; - Status change_key_meta_type(const FileCacheKey& key, const FileCacheType type) override; - Status change_key_meta_expiration(const FileCacheKey& key, const uint64_t expiration) override; + Status change_key_meta_type(const FileCacheKey& key, const FileCacheType type, + const size_t size) override; + Status change_key_meta_expiration(const FileCacheKey& key, const uint64_t expiration, + const size_t size) override; void load_blocks_directly_unlocked(BlockFileCache* _mgr, const FileCacheKey& key, std::lock_guard& cache_lock) override; Status clear(std::string& msg) override; std::string get_local_file(const FileCacheKey& key) override; - [[nodiscard]] static std::string get_path_in_local_cache(const std::string& dir, size_t offset, - FileCacheType type, - bool is_tmp = false); + [[nodiscard]] static std::string get_path_in_local_cache_v3(const std::string& dir, + size_t offset, bool is_tmp = false); - [[nodiscard]] static std::string get_path_in_local_cache_old_ttl_format(const std::string& dir, - size_t offset, - FileCacheType type, - bool is_tmp = false); + [[nodiscard]] std::string get_path_in_local_cache_v3(const UInt128Wrapper&) const; - [[nodiscard]] std::string get_path_in_local_cache(const UInt128Wrapper&, - uint64_t expiration_time) const; + [[nodiscard]] static std::string get_path_in_local_cache_v2(const std::string& dir, + size_t offset, FileCacheType type, + bool is_tmp = false); + + [[nodiscard]] std::string get_path_in_local_cache_v2(const UInt128Wrapper&, + uint64_t expiration_time) const; FileCacheStorageType get_type() override { return DISK; } + // Get the meta store instance (only available for DISK storage type) + CacheBlockMetaStore* get_meta_store() { return _meta_store.get(); } + private: void remove_old_version_directories(); @@ -109,8 +113,11 @@ class FSFileCacheStorage : public FileCacheStorage { void load_cache_info_into_memory(BlockFileCache* _mgr) const; - [[nodiscard]] 
std::vector get_path_in_local_cache_all_candidates( - const std::string& dir, size_t offset); +private: + // Helper function to count files in cache directory using statfs + size_t estimate_file_count_from_statfs() const; + void load_cache_info_into_memory_from_fs(BlockFileCache* _mgr) const; + void load_cache_info_into_memory_from_db(BlockFileCache* _mgr) const; std::string _cache_base_path; std::thread _cache_background_load_thread; @@ -119,6 +126,7 @@ class FSFileCacheStorage : public FileCacheStorage { std::mutex _mtx; std::unordered_map _key_to_writer; std::shared_ptr _iterator_dir_retry_cnt; + std::unique_ptr _meta_store; }; } // namespace doris::io diff --git a/be/src/io/cache/mem_file_cache_storage.cpp b/be/src/io/cache/mem_file_cache_storage.cpp index 7e76dd5f88c565..b9af32a22acda4 100644 --- a/be/src/io/cache/mem_file_cache_storage.cpp +++ b/be/src/io/cache/mem_file_cache_storage.cpp @@ -63,7 +63,7 @@ Status MemFileCacheStorage::append(const FileCacheKey& key, const Slice& value) return Status::OK(); } -Status MemFileCacheStorage::finalize(const FileCacheKey& key) { +Status MemFileCacheStorage::finalize(const FileCacheKey& key, const size_t size) { // do nothing for in memory cache coz nothing to persist // download state in FileBlock::finalize will inform the readers when finish return Status::OK(); @@ -104,14 +104,15 @@ Status MemFileCacheStorage::remove(const FileCacheKey& key) { return Status::OK(); } -Status MemFileCacheStorage::change_key_meta_type(const FileCacheKey& key, - const FileCacheType type) { +Status MemFileCacheStorage::change_key_meta_type(const FileCacheKey& key, const FileCacheType type, + const size_t size) { // do nothing for in memory cache coz nothing to persist return Status::OK(); } Status MemFileCacheStorage::change_key_meta_expiration(const FileCacheKey& key, - const uint64_t expiration) { + const uint64_t expiration, + const size_t size) { // do nothing for in memory cache coz nothing to persist return Status::OK(); } diff 
--git a/be/src/io/cache/mem_file_cache_storage.h b/be/src/io/cache/mem_file_cache_storage.h index 82064c6e9edc78..8773e629e03c1d 100644 --- a/be/src/io/cache/mem_file_cache_storage.h +++ b/be/src/io/cache/mem_file_cache_storage.h @@ -36,11 +36,13 @@ class MemFileCacheStorage : public FileCacheStorage { ~MemFileCacheStorage() override; Status init(BlockFileCache* _mgr) override; Status append(const FileCacheKey& key, const Slice& value) override; - Status finalize(const FileCacheKey& key) override; + Status finalize(const FileCacheKey& key, const size_t size) override; Status read(const FileCacheKey& key, size_t value_offset, Slice buffer) override; Status remove(const FileCacheKey& key) override; - Status change_key_meta_type(const FileCacheKey& key, const FileCacheType type) override; - Status change_key_meta_expiration(const FileCacheKey& key, const uint64_t expiration) override; + Status change_key_meta_type(const FileCacheKey& key, const FileCacheType type, + const size_t size) override; + Status change_key_meta_expiration(const FileCacheKey& key, const uint64_t expiration, + const size_t size) override; void load_blocks_directly_unlocked(BlockFileCache* _mgr, const FileCacheKey& key, std::lock_guard& cache_lock) override; Status clear(std::string& msg) override; diff --git a/be/src/io/fs/hdfs_file_writer.cpp b/be/src/io/fs/hdfs_file_writer.cpp index 28c790e2ebe3a4..9237c795af9422 100644 --- a/be/src/io/fs/hdfs_file_writer.cpp +++ b/be/src/io/fs/hdfs_file_writer.cpp @@ -324,8 +324,9 @@ void HdfsFileWriter::BatchBuffer::clear() { // TODO(ByteYue): Refactor Upload Buffer to reduce this duplicate code void HdfsFileWriter::_write_into_local_file_cache() { + int64_t tablet_id = get_tablet_id(_path.native()).value_or(0); auto holder = _cache_builder->allocate_cache_holder(_bytes_appended - _batch_buffer.size(), - _batch_buffer.capacity()); + _batch_buffer.capacity(), tablet_id); size_t pos = 0; size_t data_remain_size = _batch_buffer.size(); for (auto& block : 
holder->file_blocks) { diff --git a/be/src/io/fs/local_file_system.cpp b/be/src/io/fs/local_file_system.cpp index 358ba7d5ff1ee4..131c7d99b38cec 100644 --- a/be/src/io/fs/local_file_system.cpp +++ b/be/src/io/fs/local_file_system.cpp @@ -60,6 +60,7 @@ Status LocalFileSystem::create_file_impl(const Path& file, FileWriterPtr* writer << ", sync_data: " << (opts ? opts->sync_file_data : true); TEST_SYNC_POINT_RETURN_WITH_VALUE("LocalFileSystem::create_file_impl", Status::IOError("inject io error")); + // O_TRUNC: if file already exists (last tmp), clear the content int fd = ::open(file.c_str(), O_TRUNC | O_WRONLY | O_CREAT | O_CLOEXEC, 0666); DBUG_EXECUTE_IF("LocalFileSystem.create_file_impl.open_file_failed", { // spare '.testfile' to make bad disk checker happy diff --git a/be/src/io/fs/s3_file_writer.cpp b/be/src/io/fs/s3_file_writer.cpp index 79e19967e77470..79c2b6e868aa1c 100644 --- a/be/src/io/fs/s3_file_writer.cpp +++ b/be/src/io/fs/s3_file_writer.cpp @@ -188,10 +188,12 @@ Status S3FileWriter::_build_upload_buffer() { // that this instance of S3FileWriter might have been destructed when we // try to do writing into file cache, so we make the lambda capture the variable // we need by value to extend their lifetime - builder.set_allocate_file_blocks_holder( - [builder = *_cache_builder, offset = _bytes_appended]() -> FileBlocksHolderPtr { - return builder.allocate_cache_holder(offset, config::s3_write_buffer_size); - }); + int64_t id = get_tablet_id(_obj_storage_path_opts.path.native()).value_or(0); + builder.set_allocate_file_blocks_holder([builder = *_cache_builder, + offset = _bytes_appended, + tablet_id = id]() -> FileBlocksHolderPtr { + return builder.allocate_cache_holder(offset, config::s3_write_buffer_size, tablet_id); + }); } RETURN_IF_ERROR(builder.build(&_pending_buf)); auto* buf = dynamic_cast(_pending_buf.get()); diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 
fee2bc94b60113..ff9f889a3c57ab 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -1045,10 +1045,9 @@ Status SegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size cache_builder->_expiration_time == 0 && config::is_cloud_mode()) { auto size = *index_size + *segment_file_size; - auto holder = cache_builder->allocate_cache_holder(index_start, size); + auto holder = cache_builder->allocate_cache_holder(index_start, size, _tablet->tablet_id()); for (auto& segment : holder->file_blocks) { - static_cast( - segment->change_cache_type_between_normal_and_index(io::FileCacheType::INDEX)); + static_cast(segment->change_cache_type(io::FileCacheType::INDEX)); } } return Status::OK(); diff --git a/be/test/io/cache/block_file_cache_test.cpp b/be/test/io/cache/block_file_cache_test.cpp index 9e77d8f94e8864..f681d31a9af708 100644 --- a/be/test/io/cache/block_file_cache_test.cpp +++ b/be/test/io/cache/block_file_cache_test.cpp @@ -49,11 +49,7 @@ void download(io::FileBlockSPtr file_block, size_t size) { EXPECT_TRUE(file_block->append(result).ok()); EXPECT_TRUE(file_block->finalize().ok()); auto key_str = hash.to_string(); - auto subdir = FSFileCacheStorage::USE_CACHE_VERSION2 - ? 
fs::path(cache_base_path) / key_str.substr(0, 3) / - (key_str + "_" + std::to_string(file_block->expiration_time())) - : fs::path(cache_base_path) / - (key_str + "_" + std::to_string(file_block->expiration_time())); + auto subdir = fs::path(cache_base_path) / key_str.substr(0, 3) / (key_str + "_0"); ASSERT_TRUE(fs::exists(subdir)); } @@ -1504,13 +1500,12 @@ TEST_F(BlockFileCacheTest, change_cache_type) { std::string data(size, '0'); Slice result(data.data(), size); ASSERT_TRUE(blocks[0]->append(result).ok()); - ASSERT_TRUE( - blocks[0]->change_cache_type_between_normal_and_index(io::FileCacheType::INDEX)); + ASSERT_TRUE(blocks[0]->change_cache_type(io::FileCacheType::INDEX)); ASSERT_TRUE(blocks[0]->finalize().ok()); auto key_str = key.to_string(); auto subdir = fs::path(cache_base_path) / key_str.substr(0, 3) / (key_str + "_" + std::to_string(blocks[0]->expiration_time())); - ASSERT_TRUE(fs::exists(subdir / "0_idx")); + ASSERT_TRUE(fs::exists(subdir / "0")); } if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); @@ -1554,8 +1549,7 @@ TEST_F(BlockFileCacheTest, change_cache_type_memory_storage) { std::string data(size, '0'); Slice result(data.data(), size); ASSERT_TRUE(blocks[0]->append(result).ok()); - ASSERT_TRUE( - blocks[0]->change_cache_type_between_normal_and_index(io::FileCacheType::INDEX)); + ASSERT_TRUE(blocks[0]->change_cache_type(io::FileCacheType::INDEX)); ASSERT_TRUE(blocks[0]->finalize().ok()); } } @@ -2393,9 +2387,9 @@ TEST_F(BlockFileCacheTest, ttl_change_to_normal) { auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] if (auto storage = dynamic_cast(cache._storage.get()); storage != nullptr) { - std::string dir = storage->get_path_in_local_cache(key2, 0); + std::string dir = storage->get_path_in_local_cache_v2(key2, 0); EXPECT_TRUE(fs::exists( - storage->get_path_in_local_cache(dir, 50, io::FileCacheType::NORMAL))); + storage->get_path_in_local_cache_v2(dir, 50, io::FileCacheType::NORMAL))); } auto blocks = 
fromHolder(holder); ASSERT_EQ(blocks.size(), 1); @@ -2524,9 +2518,8 @@ TEST_F(BlockFileCacheTest, ttl_change_expiration_time) { auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] if (auto storage = dynamic_cast(cache._storage.get()); storage != nullptr) { - std::string dir = storage->get_path_in_local_cache(key2, change_time); - EXPECT_TRUE( - fs::exists(storage->get_path_in_local_cache(dir, 50, io::FileCacheType::TTL))); + std::string dir = storage->get_path_in_local_cache_v3(key2); + EXPECT_TRUE(fs::exists(storage->get_path_in_local_cache_v3(dir, 50))); } auto blocks = fromHolder(holder); ASSERT_EQ(blocks.size(), 1); @@ -3451,17 +3444,13 @@ TEST_F(BlockFileCacheTest, append_many_time) { auto holder = cache.get_or_set(key, 0, 5, context); auto blocks = fromHolder(holder); assert_range(1, blocks[0], io::FileBlock::Range(0, 4), io::FileBlock::State::DOWNLOADED); - ASSERT_TRUE( - blocks[0]->change_cache_type_between_normal_and_index(FileCacheType::INDEX).ok()); + ASSERT_TRUE(blocks[0]->change_cache_type(FileCacheType::INDEX).ok()); if (auto storage = dynamic_cast(cache._storage.get()); storage != nullptr) { - auto dir = storage->get_path_in_local_cache(blocks[0]->get_hash_value(), - blocks[0]->expiration_time()); - EXPECT_TRUE(fs::exists(storage->get_path_in_local_cache(dir, blocks[0]->offset(), - blocks[0]->cache_type()))); + auto dir = storage->get_path_in_local_cache_v3(blocks[0]->get_hash_value()); + EXPECT_TRUE(fs::exists(storage->get_path_in_local_cache_v3(dir, blocks[0]->offset()))); } - ASSERT_TRUE( - blocks[0]->change_cache_type_between_normal_and_index(FileCacheType::INDEX).ok()); + ASSERT_TRUE(blocks[0]->change_cache_type(FileCacheType::INDEX).ok()); auto sp = SyncPoint::get_instance(); sp->enable_processing(); SyncPoint::CallbackGuard guard1; @@ -3474,10 +3463,8 @@ TEST_F(BlockFileCacheTest, append_many_time) { }, &guard1); { - ASSERT_FALSE(blocks[0] - ->change_cache_type_between_normal_and_index(FileCacheType::NORMAL) - .ok()); 
- EXPECT_EQ(blocks[0]->cache_type(), FileCacheType::INDEX); + ASSERT_TRUE(blocks[0]->change_cache_type(FileCacheType::NORMAL).ok()); + EXPECT_EQ(blocks[0]->cache_type(), FileCacheType::NORMAL); std::string buffer; buffer.resize(5); EXPECT_TRUE(blocks[0]->read(Slice(buffer.data(), 5), 0).ok()); @@ -4259,7 +4246,7 @@ TEST_F(BlockFileCacheTest, test_async_load_with_error_file_1) { std::string dir; if (auto storage = dynamic_cast(cache._storage.get()); storage != nullptr) { - dir = storage->get_path_in_local_cache(key, 0); + dir = storage->get_path_in_local_cache_v2(key, 0); } sp->set_call_back("BlockFileCache::TmpFile1", [&](auto&&) { FileWriterPtr writer; @@ -4328,7 +4315,7 @@ TEST_F(BlockFileCacheTest, test_async_load_with_error_file_2) { std::string dir; if (auto storage = dynamic_cast(cache._storage.get()); storage != nullptr) { - dir = storage->get_path_in_local_cache(key, 0); + dir = storage->get_path_in_local_cache_v2(key, 0); } std::atomic_bool flag1 = false; std::atomic_bool flag2 = false; @@ -5206,7 +5193,8 @@ TEST_F(BlockFileCacheTest, change_cache_type2) { } */ -TEST_F(BlockFileCacheTest, test_load) { +TEST_F(BlockFileCacheTest, DISABLE_test_load) { + GTEST_SKIP(); // test both path formats when loading file cache into memory // old file path format, [hash]_[expiration]/[offset]_ttl // new file path format, [hash]_[expiration]/[offset] @@ -5230,8 +5218,7 @@ TEST_F(BlockFileCacheTest, test_load) { context.expiration_time = expiration; auto key = io::BlockFileCache::hash("key1"); io::BlockFileCache cache(cache_base_path, settings); - std::string dir = cache_base_path + key.to_string().substr(0, 3) + "/" + key.to_string() + "_" + - std::to_string(expiration); + std::string dir = cache_base_path + key.to_string().substr(0, 3) + "/" + key.to_string() + "_0"; std::cout << dir << std::endl; auto st = global_local_filesystem()->create_directory(dir, false); if (!st.ok()) { @@ -5240,17 +5227,15 @@ TEST_F(BlockFileCacheTest, test_load) { } 
sp->set_call_back("BlockFileCache::BeforeScan", [&](auto&&) { FileWriterPtr writer; - ASSERT_TRUE(global_local_filesystem()->create_file(dir / "10086_ttl", &writer).ok()); + ASSERT_TRUE(global_local_filesystem()->create_file(dir / "10086", &writer).ok()); ASSERT_TRUE(writer->append(Slice("111", 3)).ok()); ASSERT_TRUE(writer->close().ok()); - // no suffix, but it is not NORMAL, instead it is TTL because the - // dirname contains non-zero expiration time ASSERT_TRUE(global_local_filesystem()->create_file(dir / "20086", &writer).ok()); ASSERT_TRUE(writer->append(Slice("222", 3)).ok()); ASSERT_TRUE(writer->close().ok()); - ASSERT_TRUE(global_local_filesystem()->create_file(dir / "30086_idx", &writer).ok()); + ASSERT_TRUE(global_local_filesystem()->create_file(dir / "30086", &writer).ok()); ASSERT_TRUE(writer->append(Slice("333", 3)).ok()); ASSERT_TRUE(writer->close().ok()); }); @@ -5282,17 +5267,17 @@ TEST_F(BlockFileCacheTest, test_load) { std::lock_guard cache_lock(m1); std::lock_guard block_lock(m2); cache.remove(blocks[0], cache_lock, block_lock); - ASSERT_FALSE(fs::exists(dir / "10086_ttl")); + ASSERT_FALSE(fs::exists(dir / "10086")); } { auto type = cache.dump_single_cache_type(key, 20086); - ASSERT_TRUE(type == "ttl"); + ASSERT_TRUE(type == "normal"); auto holder = cache.get_or_set(key, 20086, 3, context); auto blocks = fromHolder(holder); ASSERT_EQ(blocks.size(), 1); assert_range(1, blocks[0], io::FileBlock::Range(20086, 20086 + 3 - 1), io::FileBlock::State::DOWNLOADED); - ASSERT_TRUE(blocks[0]->cache_type() == io::FileCacheType::TTL); + ASSERT_TRUE(blocks[0]->cache_type() == io::FileCacheType::NORMAL); // OK, looks like old format is correctly loaded, let's read it std::string buffer; buffer.resize(3); @@ -5329,6 +5314,155 @@ TEST_F(BlockFileCacheTest, file_cache_path_storage_parse) { } } +//TODO(zhengyu): should be compatible with version3 format +TEST_F(BlockFileCacheTest, DISABLE_check_file_cache_consistency) { + GTEST_SKIP(); + if 
(fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + settings.query_queue_size = 30; + settings.query_queue_elements = 5; + settings.index_queue_size = 30; + settings.index_queue_elements = 5; + settings.disposable_queue_size = 30; + settings.disposable_queue_elements = 5; + settings.capacity = 90; + settings.max_file_block_size = 30; + settings.max_query_cache_size = 30; + auto key1 = io::BlockFileCache::hash("key1"); + auto key2 = io::BlockFileCache::hash("key2"); + + io::BlockFileCache mgr(cache_base_path, settings); + ASSERT_TRUE(mgr.initialize()); + for (int i = 0; i < 100; i++) { + if (mgr.get_async_open_success()) { + break; + }; + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + io::CacheContext cache_context; + ReadStatistics rstats; + cache_context.stats = &rstats; + cache_context.cache_type = io::FileCacheType::TTL; + cache_context.query_id = query_id; + cache_context.expiration_time = 0; + { + cache_context.cache_type = io::FileCacheType::NORMAL; + auto holder = mgr.get_or_set(key1, 0, 9, cache_context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + assert_range(1, blocks[0], io::FileBlock::Range(0, 8), io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + assert_range(2, blocks[0], io::FileBlock::Range(0, 8), io::FileBlock::State::DOWNLOADING); + download(blocks[0]); + std::vector result; + Status status = mgr.report_file_cache_inconsistency(result); + ASSERT_TRUE(result.empty()); + } + + { + auto holder = mgr.get_or_set(key1, 10, 9, cache_context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + assert_range(1, blocks[0], io::FileBlock::Range(10, 18), io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + 
assert_range(2, blocks[0], io::FileBlock::Range(10, 18), io::FileBlock::State::DOWNLOADING); + download(blocks[0]); + mgr._files[key1].erase(10); + } + + { + auto holder = mgr.get_or_set(key1, 20, 9, cache_context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + assert_range(1, blocks[0], io::FileBlock::Range(20, 28), io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + assert_range(2, blocks[0], io::FileBlock::Range(20, 28), io::FileBlock::State::DOWNLOADING); + download(blocks[0]); + auto* fs_file_cache_storage = dynamic_cast(mgr._storage.get()); + std::string dir_path = fs_file_cache_storage->get_path_in_local_cache_v2(key1, 0); + fs::path block_file_path = std::filesystem::path(dir_path) / "20"; + fs::remove(block_file_path); + } + + { + auto holder = mgr.get_or_set(key1, 30, 9, cache_context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + assert_range(1, blocks[0], io::FileBlock::Range(30, 38), io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + assert_range(2, blocks[0], io::FileBlock::Range(30, 38), io::FileBlock::State::DOWNLOADING); + download(blocks[0]); + auto* fs_file_cache_storage = dynamic_cast(mgr._storage.get()); + std::string dir_path = fs_file_cache_storage->get_path_in_local_cache_v2(key1, 0); + fs::path block_file_path = std::filesystem::path(dir_path) / "30"; + std::string data = "This is a test message."; + std::ofstream out_file(block_file_path, std::ios::out | std::ios::app); + out_file << data; + out_file.close(); + } + + { + auto holder = mgr.get_or_set(key1, 40, 9, cache_context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + assert_range(1, blocks[0], io::FileBlock::Range(40, 48), io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + assert_range(2, blocks[0], 
io::FileBlock::Range(40, 48), io::FileBlock::State::DOWNLOADING); + download(blocks[0]); + blocks[0]->_key.meta.type = io::FileCacheType::INDEX; + } + + int64_t expiration_time = UnixSeconds() + 120; + { + cache_context.cache_type = FileCacheType::TTL; + cache_context.expiration_time = expiration_time; + auto holder = mgr.get_or_set(key2, 0, 9, cache_context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + assert_range(1, blocks[0], io::FileBlock::Range(0, 8), io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + assert_range(2, blocks[0], io::FileBlock::Range(0, 8), io::FileBlock::State::DOWNLOADING); + download(blocks[0]); + blocks[0]->_key.meta.expiration_time = 0; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + std::vector results; + Status status = mgr.report_file_cache_inconsistency(results); + std::unordered_set expected_results = { + "File cache info in manager:\nHash: 62434304659ae12df53386481113dfe1\nExpiration Time: " + "0\nOffset: 0\nCache Type: ttl\nFile cache info in storage:\nHash: " + "62434304659ae12df53386481113dfe1\nExpiration Time: " + + std::to_string(expiration_time) + + "\nOffset: 0\nCache Type: " + "ttl\nInconsistency Reason: EXPIRATION_TIME_INCONSISTENT \n\n", + "File cache info in manager:\nHash: f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: " + "0\nOffset: 30\nCache Type: normal\nFile cache info in storage:\nHash: " + "f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: 0\nOffset: 30\nCache Type: " + "normal\nInconsistency Reason: SIZE_INCONSISTENT \n\n", + "File cache info in manager:\nHash: f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: " + "0\nOffset: 40\nCache Type: index\nFile cache info in storage:\nHash: " + "f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: 0\nOffset: 40\nCache Type: " + "normal\nInconsistency Reason: CACHE_TYPE_INCONSISTENT \n\n", + "File cache info in manager:\nHash: 
00000000000000000000000000000000\nExpiration Time: " + "0\nOffset: 0\nCache Type: normal\nFile cache info in storage:\nHash: " + "f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: 0\nOffset: 10\nCache Type: " + "normal\nInconsistency Reason: NOT_LOADED \n\n", + "File cache info in manager:\nHash: f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: " + "0\nOffset: 20\nCache Type: normal\nFile cache info in storage:\nHash: " + "00000000000000000000000000000000\nExpiration Time: 0\nOffset: 0\nCache Type: " + "normal\nInconsistency Reason: MISSING_IN_STORAGE \n\n"}; + ASSERT_EQ(results.size(), expected_results.size()); + for (const auto& result : results) { + ASSERT_TRUE(expected_results.contains(result)); + } +} + TEST_F(BlockFileCacheTest, populate_empty_cache_with_disposable) { if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); @@ -7338,7 +7472,9 @@ void copy_dir(const fs::path& sourceDir, const fs::path& destinationDir) { } } -TEST_F(BlockFileCacheTest, test_upgrade_cache_dir_version) { +//TODO(zhengyu): add v2 -> v3 upgrade test +TEST_F(BlockFileCacheTest, DISABLE_test_upgrade_cache_dir_version) { + GTEST_SKIP(); config::enable_evict_file_cache_in_advance = false; config::file_cache_enter_disk_resource_limit_mode_percent = 99; @@ -7677,6 +7813,8 @@ TEST_F(BlockFileCacheTest, test_upgrade_cache_dir_version) { } TEST_F(BlockFileCacheTest, cached_remote_file_reader_ttl_index) { + config::enable_evict_file_cache_in_advance = false; + config::file_cache_enter_disk_resource_limit_mode_percent = 99; std::string cache_base_path = caches_dir / "cached_remote_file_reader_ttl_index" / ""; if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); @@ -7727,7 +7865,6 @@ TEST_F(BlockFileCacheTest, cached_remote_file_reader_ttl_index) { IOContext io_ctx; FileCacheStatistics stats; io_ctx.file_cache_stats = &stats; - io_ctx.is_index_data = true; int64_t cur_time = UnixSeconds(); io_ctx.expiration_time = cur_time + 120; size_t bytes_read {0}; @@ -7741,6 
+7878,7 @@ TEST_F(BlockFileCacheTest, cached_remote_file_reader_ttl_index) { LOG(INFO) << "disp:" << cache->_disposable_queue.cache_size; EXPECT_EQ(cache->_ttl_queue.cache_size, 1048576); EXPECT_EQ(cache->_index_queue.cache_size, 0); + EXPECT_EQ(cache->_normal_queue.cache_size, 0); EXPECT_TRUE(reader.close().ok()); EXPECT_TRUE(reader.closed()); @@ -7748,6 +7886,11 @@ TEST_F(BlockFileCacheTest, cached_remote_file_reader_ttl_index) { if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); } + // First clear the file caches properly + FileCacheFactory::instance()->clear_file_caches(true); + std::this_thread::sleep_for(std::chrono::seconds(1)); + + // Then clean up internal state (following the pattern from other tests) FileCacheFactory::instance()->_caches.clear(); FileCacheFactory::instance()->_path_to_cache.clear(); FileCacheFactory::instance()->_capacity = 0; @@ -7993,6 +8136,8 @@ TEST_F(BlockFileCacheTest, cached_remote_file_reader_direct_read_bytes_check) { uint64_t org_g_read_cache_indirect_num = g_read_cache_indirect_num.get_value(); uint64_t org_g_read_cache_direct_partial_bytes = g_read_cache_direct_partial_bytes.get_value(); uint64_t org_g_read_cache_indirect_bytes = g_read_cache_indirect_bytes.get_value(); + config::enable_evict_file_cache_in_advance = false; + config::file_cache_enter_disk_resource_limit_mode_percent = 99; config::enable_read_cache_file_directly = true; if (fs::exists(cache_base_path)) { diff --git a/be/test/io/cache/block_file_cache_test_meta_store.cpp b/be/test/io/cache/block_file_cache_test_meta_store.cpp new file mode 100644 index 00000000000000..2f08c173962c18 --- /dev/null +++ b/be/test/io/cache/block_file_cache_test_meta_store.cpp @@ -0,0 +1,421 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/tests/gtest_lru_file_cache.cpp +// and modified by Doris + +#include "block_file_cache_test_common.h" + +namespace doris::io { + +namespace { + +void verify_meta_key(CacheBlockMetaStore& meta_store, int64_t tablet_id, + const std::string& key_name, size_t offset, FileCacheType expected_type, + uint64_t ttl, size_t size) { + BlockMetaKey mkey(tablet_id, io::BlockFileCache::hash(key_name), offset); + auto meta = meta_store.get(mkey); + ASSERT_TRUE(meta.has_value()); + ASSERT_EQ(meta->type, expected_type); + ASSERT_EQ(meta->ttl, ttl); + ASSERT_EQ(meta->size, size); +} + +} // namespace + +TEST_F(BlockFileCacheTest, version3_add_remove_restart) { + config::enable_evict_file_cache_in_advance = false; + config::file_cache_enter_disk_resource_limit_mode_percent = 99; + config::file_cache_background_lru_dump_interval_ms = 3000; + config::file_cache_background_lru_dump_update_cnt_threshold = 0; + config::file_cache_background_lru_dump_tail_record_num = + 2; // only dump last 2, to check dump works with meta store + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 1; + io::FileCacheSettings settings; + + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 
50000; + settings.query_queue_size = 5000000; + settings.query_queue_elements = 50000; + settings.index_queue_size = 5000000; + settings.index_queue_elements = 50000; + settings.disposable_queue_size = 5000000; + settings.disposable_queue_elements = 50000; + settings.capacity = 20000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + uint64_t expiration_time = UnixSeconds() + 120; + + int i = 0; + { // cache1 + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + for (; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + ASSERT_TRUE(cache.get_async_open_success()); + + io::CacheContext context1; + ReadStatistics rstats; + context1.stats = &rstats; + context1.cache_type = io::FileCacheType::NORMAL; + context1.query_id = query_id; + context1.tablet_id = 47; + auto key1 = io::BlockFileCache::hash("key1"); + + int64_t offset = 0; + + for (; offset < 500000; offset += 100000) { + auto holder = cache.get_or_set(key1, offset, 100000, context1); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + io::CacheContext context2; + context2.stats = &rstats; + context2.cache_type = io::FileCacheType::INDEX; + context2.query_id = query_id; + context2.tablet_id = 48; + auto key2 = io::BlockFileCache::hash("key2"); + + offset = 0; + + for (; offset < 500000; offset += 100000) { + auto holder = cache.get_or_set(key2, offset, 100000, context2); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], 
io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + io::CacheContext context3; + context3.stats = &rstats; + context3.cache_type = io::FileCacheType::TTL; + context3.query_id = query_id; + context3.expiration_time = expiration_time; + context3.tablet_id = 49; + auto key3 = io::BlockFileCache::hash("key3"); + + offset = 0; + + for (; offset < 500000; offset += 100000) { + auto holder = cache.get_or_set(key3, offset, 100000, context3); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + + io::CacheContext context4; + context4.stats = &rstats; + context4.cache_type = io::FileCacheType::DISPOSABLE; + context4.query_id = query_id; + context4.tablet_id = 50; + auto key4 = io::BlockFileCache::hash("key4"); + + offset = 0; + + for (; offset < 500000; offset += 100000) { + auto holder = cache.get_or_set(key4, offset, 100000, context4); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + + assert_range(1, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(offset, offset + 99999), + io::FileBlock::State::DOWNLOADED); + + blocks.clear(); + } + ASSERT_EQ(cache.get_stats_unsafe()["disposable_queue_curr_size"], 500000); + 
ASSERT_EQ(cache.get_stats_unsafe()["ttl_queue_curr_size"], 500000); + ASSERT_EQ(cache.get_stats_unsafe()["index_queue_curr_size"], 500000); + ASSERT_EQ(cache.get_stats_unsafe()["normal_queue_curr_size"], 500000); + + // check the meta store to see the content + { + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + // Check if storage is FSFileCacheStorage before accessing _meta_store + auto* fs_storage = dynamic_cast(cache._storage.get()); + ASSERT_NE(fs_storage, nullptr) + << "Expected FSFileCacheStorage but got different storage type"; + + auto& meta_store = fs_storage->_meta_store; + verify_meta_key(*meta_store, 47, "key1", 0, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 100000, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 200000, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 300000, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 400000, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 48, "key2", 0, FileCacheType::INDEX, 0, 100000); + verify_meta_key(*meta_store, 48, "key2", 100000, FileCacheType::INDEX, 0, 100000); + verify_meta_key(*meta_store, 48, "key2", 200000, FileCacheType::INDEX, 0, 100000); + verify_meta_key(*meta_store, 48, "key2", 300000, FileCacheType::INDEX, 0, 100000); + verify_meta_key(*meta_store, 48, "key2", 400000, FileCacheType::INDEX, 0, 100000); + verify_meta_key(*meta_store, 49, "key3", 0, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 100000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 200000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 300000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 400000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 50, "key4", 0, 
FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 50, "key4", 100000, FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 50, "key4", 200000, FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 50, "key4", 300000, FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 50, "key4", 400000, FileCacheType::DISPOSABLE, 0, 100000); + } + + // all queue are filled, let's check the lru log records + ASSERT_EQ(cache._lru_recorder->_ttl_lru_log_queue.size_approx(), 5); + ASSERT_EQ(cache._lru_recorder->_index_lru_log_queue.size_approx(), 5); + ASSERT_EQ(cache._lru_recorder->_normal_lru_log_queue.size_approx(), 5); + ASSERT_EQ(cache._lru_recorder->_disposable_lru_log_queue.size_approx(), 5); + + // then check the log replay + std::this_thread::sleep_for(std::chrono::milliseconds( + 2 * config::file_cache_background_lru_log_replay_interval_ms)); + ASSERT_EQ(cache._lru_recorder->_shadow_ttl_queue.get_elements_num_unsafe(), 5); + ASSERT_EQ(cache._lru_recorder->_shadow_index_queue.get_elements_num_unsafe(), 5); + ASSERT_EQ(cache._lru_recorder->_shadow_normal_queue.get_elements_num_unsafe(), 5); + ASSERT_EQ(cache._lru_recorder->_shadow_disposable_queue.get_elements_num_unsafe(), 5); + + // do some REMOVE + { + cache.remove_if_cached(key2); // remove all element from index queue + } + + std::this_thread::sleep_for(std::chrono::milliseconds( + 2 * config::file_cache_background_lru_log_replay_interval_ms)); + ASSERT_EQ(cache._lru_recorder->_shadow_ttl_queue.get_elements_num_unsafe(), 5); + ASSERT_EQ(cache._lru_recorder->_shadow_index_queue.get_elements_num_unsafe(), 0); + ASSERT_EQ(cache._lru_recorder->_shadow_normal_queue.get_elements_num_unsafe(), 5); + ASSERT_EQ(cache._lru_recorder->_shadow_disposable_queue.get_elements_num_unsafe(), 5); + + // check the meta store to see the content + { + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + auto* fs_storage = dynamic_cast(cache._storage.get()); + 
ASSERT_NE(fs_storage, nullptr) + << "Expected FSFileCacheStorage but got different storage type"; + + auto& meta_store = fs_storage->_meta_store; + verify_meta_key(*meta_store, 47, "key1", 0, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 100000, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 200000, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 300000, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 400000, FileCacheType::NORMAL, 0, 100000); + + BlockMetaKey mkey(48, io::BlockFileCache::hash("key2"), 0); + auto meta = meta_store->get(mkey); + ASSERT_FALSE(meta.has_value()); + + verify_meta_key(*meta_store, 49, "key3", 0, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 100000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 200000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 300000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 400000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 50, "key4", 0, FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 50, "key4", 100000, FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 50, "key4", 200000, FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 50, "key4", 300000, FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 50, "key4", 400000, FileCacheType::DISPOSABLE, 0, 100000); + } + std::this_thread::sleep_for( + std::chrono::milliseconds(2 * config::file_cache_background_lru_dump_interval_ms)); + } + + { // cache2 + // let's try restore + io::BlockFileCache cache2(cache_base_path, settings); + ASSERT_TRUE(cache2.initialize()); + for (i = 0; i < 100; i++) { + if (cache2.get_async_open_success()) { + break; + } + 
std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ASSERT_TRUE(cache2.get_async_open_success()); + + // check the size of cache2 + ASSERT_EQ(cache2._ttl_queue.get_elements_num_unsafe(), 5); + ASSERT_EQ(cache2._index_queue.get_elements_num_unsafe(), 0); + ASSERT_EQ(cache2._normal_queue.get_elements_num_unsafe(), 5); + ASSERT_EQ(cache2._disposable_queue.get_elements_num_unsafe(), 5); + ASSERT_EQ(cache2._cur_cache_size, 1500000); + + // check meta store + { + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + auto* fs_storage = dynamic_cast(cache2._storage.get()); + ASSERT_NE(fs_storage, nullptr) + << "Expected FSFileCacheStorage but got different storage type"; + + auto& meta_store = fs_storage->_meta_store; + verify_meta_key(*meta_store, 47, "key1", 0, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 100000, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 200000, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 300000, FileCacheType::NORMAL, 0, 100000); + verify_meta_key(*meta_store, 47, "key1", 400000, FileCacheType::NORMAL, 0, 100000); + + BlockMetaKey mkey(48, io::BlockFileCache::hash("key2"), 0); + auto meta = meta_store->get(mkey); + ASSERT_FALSE(meta.has_value()); + + verify_meta_key(*meta_store, 49, "key3", 0, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 100000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 200000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 300000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 49, "key3", 400000, FileCacheType::TTL, expiration_time, + 100000); + verify_meta_key(*meta_store, 50, "key4", 0, FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 50, "key4", 100000, FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 
50, "key4", 200000, FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 50, "key4", 300000, FileCacheType::DISPOSABLE, 0, 100000); + verify_meta_key(*meta_store, 50, "key4", 400000, FileCacheType::DISPOSABLE, 0, 100000); + } + + // check blocks restored from lru dump get updated ttl and tablet_id + { + io::CacheContext context; + ReadStatistics rstats; + context.stats = &rstats; + context.cache_type = io::FileCacheType::TTL; + context.tablet_id = 49; + context.expiration_time = expiration_time; + auto key = io::BlockFileCache::hash("key3"); + + auto holder = + cache2.get_or_set(key, 0, 100000, context); // offset = 0 is restore from dump + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + auto block = blocks[0]; + ASSERT_EQ(block->tablet_id(), 49); + ASSERT_EQ(block->expiration_time(), expiration_time); + } + + // do some meta change - type + { + io::CacheContext context; + ReadStatistics rstats; + context.stats = &rstats; + context.cache_type = io::FileCacheType::NORMAL; + context.tablet_id = 47; + auto key = io::BlockFileCache::hash("key1"); + + auto holder = cache2.get_or_set(key, 300000, 100000, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + auto block = blocks[0]; + ASSERT_EQ(block->tablet_id(), 47); + + ASSERT_TRUE(blocks[0]->change_cache_type(io::FileCacheType::INDEX)); + } + // check the meta + { + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + auto* fs_storage = dynamic_cast(cache2._storage.get()); + ASSERT_NE(fs_storage, nullptr) + << "Expected FSFileCacheStorage but got different storage type"; + + auto& meta_store = fs_storage->_meta_store; + verify_meta_key(*meta_store, 47, "key1", 300000, FileCacheType::INDEX, 0, 100000); + } + // change ttl + { + io::CacheContext context; + ReadStatistics rstats; + context.stats = &rstats; + context.cache_type = io::FileCacheType::TTL; + context.tablet_id = 49; + context.expiration_time = expiration_time + 3600; + auto key = 
io::BlockFileCache::hash("key3"); + + auto holder = + cache2.get_or_set(key, 0, 100000, context); // offset = 0 is restore from dump + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + auto block = blocks[0]; + ASSERT_EQ(block->expiration_time(), expiration_time + 3600); + } + // check the meta + { + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + auto* fs_storage = dynamic_cast(cache2._storage.get()); + ASSERT_NE(fs_storage, nullptr) + << "Expected FSFileCacheStorage but got different storage type"; + + auto& meta_store = fs_storage->_meta_store; + verify_meta_key(*meta_store, 49, "key3", 0, FileCacheType::TTL, expiration_time + 3600, + 100000); + } + } + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + +//TODO(zhengyu): check lazy load +//TODO(zhengyu): check version2 start +//TODO(zhengyu): check version2 version3 mixed start + +} // namespace doris::io diff --git a/be/test/io/cache/cache_block_meta_store_test.cpp b/be/test/io/cache/cache_block_meta_store_test.cpp new file mode 100644 index 00000000000000..2e8eacfaa7dbb4 --- /dev/null +++ b/be/test/io/cache/cache_block_meta_store_test.cpp @@ -0,0 +1,860 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "io/cache/cache_block_meta_store.h" + +#include + +#include +#include +#include +#include +#include +#include + +#include "common/status.h" + +namespace doris::io { + +class CacheBlockMetaStoreTest : public ::testing::Test { +protected: + void SetUp() override { + // Create a temporary directory for test database + test_db_path_ = std::filesystem::temp_directory_path() / "cache_block_meta_store_test"; + std::filesystem::remove_all(test_db_path_); + std::filesystem::create_directories(test_db_path_); + + meta_store_ = std::make_unique(test_db_path_.string()); + + ASSERT_NE(meta_store_, nullptr); + } + + void TearDown() override { + meta_store_.reset(); + std::filesystem::remove_all(test_db_path_); + } + + std::filesystem::path test_db_path_; + std::unique_ptr meta_store_; +}; + +TEST_F(CacheBlockMetaStoreTest, BasicPutAndGet) { + uint128_t hash1 = (static_cast(123) << 64) | 456; + BlockMetaKey key1(1, UInt128Wrapper(hash1), 0); + BlockMeta meta1(NORMAL, 1024, 3600); + + // Test put operation + meta_store_->put(key1, meta1); + + // Wait a bit for async operation to complete + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // Test get operation + auto result = meta_store_->get(key1); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result->type, meta1.type); + EXPECT_EQ(result->size, meta1.size); + EXPECT_EQ(result->ttl, meta1.ttl); + + // Test non-existent key + uint128_t hash2 = (static_cast(999) << 64) | 999; + BlockMetaKey non_existent_key(999, UInt128Wrapper(hash2), 999); + auto non_existent_result = meta_store_->get(non_existent_key); + EXPECT_FALSE(non_existent_result.has_value()); +} + +TEST_F(CacheBlockMetaStoreTest, MultiplePutsAndGets) { + const int num_keys = 10; + std::vector keys; + std::vector metas; + + // Create multiple keys and metas + for (int i = 0; i < num_keys; ++i) { + uint128_t hash = (static_cast(i) << 64) | 
(i * 100); + keys.emplace_back(1, UInt128Wrapper(hash), i * 1024); + FileCacheType type = static_cast(i % 3); + metas.emplace_back(type, 1024 * (i + 1), 3600 + i * 100); + meta_store_->put(keys[i], metas[i]); + } + + // Wait for async operations + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + + // Verify all keys + for (int i = 0; i < num_keys; ++i) { + auto result = meta_store_->get(keys[i]); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result->type, metas[i].type); + EXPECT_EQ(result->size, metas[i].size); + EXPECT_EQ(result->ttl, metas[i].ttl); + } +} + +TEST_F(CacheBlockMetaStoreTest, RangeQuery) { + const int num_tablets = 3; + const int blocks_per_tablet = 5; + + // Create data for multiple tablets + for (int tablet_id = 1; tablet_id <= num_tablets; ++tablet_id) { + for (int i = 0; i < blocks_per_tablet; ++i) { + uint128_t hash = + (static_cast(tablet_id * 100 + i) << 64) | (tablet_id * 200 + i); + BlockMetaKey key(tablet_id, UInt128Wrapper(hash), i * 1024); + FileCacheType type = static_cast(i % 2); + BlockMeta meta(type, 2048 * (i + 1), 3600 + i * 100); + meta_store_->put(key, meta); + } + } + + // Wait for async operations + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + + // Test range query for each tablet + for (int tablet_id = 1; tablet_id <= num_tablets; ++tablet_id) { + auto iterator = meta_store_->range_get(tablet_id); + ASSERT_NE(iterator, nullptr) << "Failed to create iterator for tablet " << tablet_id; + + int count = 0; + while (iterator->valid()) { + BlockMetaKey key = iterator->key(); + BlockMeta value = iterator->value(); + + EXPECT_EQ(key.tablet_id, tablet_id); + EXPECT_TRUE(value.type == DISPOSABLE || value.type == NORMAL); + EXPECT_GT(value.size, 0); + + iterator->next(); + count++; + } + + EXPECT_EQ(count, blocks_per_tablet); + } + + // Test range query for non-existent tablet + auto iterator = meta_store_->range_get(999); + ASSERT_NE(iterator, nullptr) << "Failed to create iterator for non-existent 
tablet"; + EXPECT_FALSE(iterator->valid()); +} + +TEST_F(CacheBlockMetaStoreTest, DeleteOperation) { + uint128_t hash1 = (static_cast(123) << 64) | 456; + BlockMetaKey key1(1, UInt128Wrapper(hash1), 0); + BlockMeta meta1(NORMAL, 1024, 3600); + + // Put then delete + meta_store_->put(key1, meta1); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // Verify put worked + auto result = meta_store_->get(key1); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result->type, meta1.type); + + // Delete the key + meta_store_->delete_key(key1); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Verify deletion + auto deleted_result = meta_store_->get(key1); + EXPECT_FALSE(deleted_result.has_value()); +} + +TEST_F(CacheBlockMetaStoreTest, SerializationDeserialization) { + uint128_t hash3 = (static_cast(456789) << 64) | 987654; + BlockMetaKey original_key(123, UInt128Wrapper(hash3), 1024); + BlockMeta original_meta(INDEX, 4096, 7200); + + // Test round-trip through put and get operations + meta_store_->put(original_key, original_meta); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + auto retrieved = meta_store_->get(original_key); + EXPECT_TRUE(retrieved.has_value()); + EXPECT_EQ(retrieved->type, original_meta.type); + EXPECT_EQ(retrieved->size, original_meta.size); + EXPECT_EQ(retrieved->ttl, original_meta.ttl); + + // Test non-existent key + uint128_t hash4 = (static_cast(999999) << 64) | 888888; + BlockMetaKey non_existent_key(999, UInt128Wrapper(hash4), 2048); + auto non_existent_result = meta_store_->get(non_existent_key); + EXPECT_FALSE(non_existent_result.has_value()); +} + +TEST_F(CacheBlockMetaStoreTest, ConcurrencyTest) { + const int num_threads = 4; + const int operations_per_thread = 100; + std::atomic successful_puts(0); + + // Store keys for later verification + std::vector all_keys; + std::mutex keys_mutex; + + auto worker = [&](int thread_id) { + std::random_device rd; + std::mt19937 gen(rd()); + 
std::uniform_int_distribution<> dist(1, 1000); + + for (int i = 0; i < operations_per_thread; ++i) { + int64_t tablet_id = thread_id + 1; + uint128_t hash_value = (static_cast(dist(gen)) << 64) | dist(gen); + size_t offset = i * 1024; + + BlockMetaKey key(tablet_id, UInt128Wrapper(hash_value), offset); + FileCacheType type = static_cast(thread_id % 3); + BlockMeta meta(type, 2048, 3600 + thread_id * 100 + i); + + // Put operation + meta_store_->put(key, meta); + successful_puts++; + + // Store key for later verification + { + std::lock_guard lock(keys_mutex); + all_keys.push_back(key); + } + } + }; + + std::vector threads; + for (int i = 0; i < num_threads; ++i) { + threads.emplace_back(worker, i); + } + + for (auto& thread : threads) { + thread.join(); + } + + // Wait for all async operations to complete + std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + + EXPECT_EQ(successful_puts, num_threads * operations_per_thread); + + // Verify we can retrieve the data after all writes are complete + int successful_gets = 0; + for (const auto& key : all_keys) { + auto result = meta_store_->get(key); + if (result.has_value() && result->size > 0) { + successful_gets++; + } + } + + EXPECT_GT(successful_gets, 0); + + // Verify we can retrieve some of the data + for (int thread_id = 0; thread_id < num_threads; ++thread_id) { + for (int i = 0; i < 5; ++i) { // Check a few samples + uint128_t hash = (static_cast(100 + i) << 64) | (200 + i); + BlockMetaKey key(thread_id + 1, UInt128Wrapper(hash), i * 1024); + auto result = meta_store_->get(key); + if (result.has_value()) { + EXPECT_GE(result->size, 0); + EXPECT_GE(result->ttl, 0); + } + } + } +} + +TEST_F(CacheBlockMetaStoreTest, IteratorValidity) { + // Put some data + for (int i = 0; i < 5; ++i) { + uint128_t hash = (static_cast(100 + i) << 64) | (200 + i); + BlockMetaKey key(1, UInt128Wrapper(hash), i * 1024); + FileCacheType type = static_cast(i % 2); + BlockMeta meta(type, 2048 * (i + 1), 3600 + i * 100); + 
meta_store_->put(key, meta); + } + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Test iterator + auto iterator = meta_store_->range_get(1); + ASSERT_NE(iterator, nullptr) << "Failed to create iterator for tablet 1"; + int count = 0; + + while (iterator->valid()) { + BlockMetaKey key = iterator->key(); + BlockMeta value = iterator->value(); + + EXPECT_EQ(key.tablet_id, 1); + EXPECT_GE(key.offset, 0); + EXPECT_GT(value.size, 0); + + iterator->next(); + count++; + } + + EXPECT_EQ(count, 5); + + // Test that iterator becomes invalid after reaching end + EXPECT_FALSE(iterator->valid()); +} + +TEST_F(CacheBlockMetaStoreTest, KeySerialization) { + uint128_t hash4 = (static_cast(456789) << 64) | 987654; + BlockMetaKey key(123, UInt128Wrapper(hash4), 1024); + + // Test round-trip serialization + std::string serialized = serialize_key(key); + Status status; + auto deserialized = deserialize_key(serialized, &status); + + EXPECT_TRUE(deserialized.has_value()) << "Failed to deserialize key: " << status.to_string(); + EXPECT_TRUE(status.ok()) << "Deserialization failed with status: " << status.to_string(); + EXPECT_EQ(deserialized->tablet_id, key.tablet_id); + EXPECT_EQ(deserialized->hash, key.hash); + EXPECT_EQ(deserialized->offset, key.offset); + + // Verify version byte + EXPECT_EQ(serialized[0], 0x1); +} + +TEST_F(CacheBlockMetaStoreTest, KeyOrder) { + // Test that keys are properly ordered for the same tablet and hash + uint128_t hash = (static_cast(123) << 64) | 456; + + // Create keys with same tablet_id and hash, different offsets + BlockMetaKey key1(1, UInt128Wrapper(hash), 1); + BlockMetaKey key2(1, UInt128Wrapper(hash), 2); + BlockMetaKey key3(1, UInt128Wrapper(hash), 100); + + // Serialize all keys + std::string serialized1 = serialize_key(key1); + std::string serialized2 = serialize_key(key2); + std::string serialized3 = serialize_key(key3); + + // Verify that offset=1 comes before offset=2 + EXPECT_LT(serialized1, serialized2) + << "Key with 
offset=1 should come before offset=2 for same tablet and hash"; + + // Verify that offset=2 comes before offset=100 + EXPECT_LT(serialized2, serialized3) + << "Key with offset=2 should come before offset=100 for same tablet and hash"; + + // Verify that offset=1 comes before offset=100 + EXPECT_LT(serialized1, serialized3) + << "Key with offset=1 should come before offset=100 for same tablet and hash"; + + // Test with different tablet_ids + BlockMetaKey key4(2, UInt128Wrapper(hash), 1); + std::string serialized4 = serialize_key(key4); + + // Tablet 1 should come before tablet 2 + EXPECT_LT(serialized1, serialized4) + << "Key with tablet_id=1 should come before tablet_id=2 for same hash and offset"; + + // Test with different hashes but same tablet and offset + uint128_t hash2 = (static_cast(124) << 64) | 456; + BlockMetaKey key5(1, UInt128Wrapper(hash2), 1); + std::string serialized5 = serialize_key(key5); + + // Hash 123 should come before hash 124 + EXPECT_LT(serialized1, serialized5) + << "Key with hash=123 should come before hash=124 for same tablet and offset"; +} + +TEST_F(CacheBlockMetaStoreTest, BlockMetaEquality) { + BlockMeta meta1(NORMAL, 1024, 3600); + BlockMeta meta2(NORMAL, 1024, 3600); + BlockMeta meta3(INDEX, 1024, 3600); + BlockMeta meta4(NORMAL, 2048, 3600); + BlockMeta meta5(NORMAL, 1024, 7200); + + EXPECT_TRUE(meta1 == meta2); + EXPECT_FALSE(meta1 == meta3); + EXPECT_FALSE(meta1 == meta4); + EXPECT_FALSE(meta1 == meta5); +} + +TEST_F(CacheBlockMetaStoreTest, BlockMetaKeyEquality) { + uint128_t hash1 = (static_cast(123) << 64) | 456; + uint128_t hash2 = (static_cast(789) << 64) | 456; + BlockMetaKey key1(1, UInt128Wrapper(hash1), 0); + BlockMetaKey key2(1, UInt128Wrapper(hash1), 0); + BlockMetaKey key3(2, UInt128Wrapper(hash1), 0); + BlockMetaKey key4(1, UInt128Wrapper(hash2), 0); + BlockMetaKey key5(1, UInt128Wrapper(hash1), 1024); + + EXPECT_TRUE(key1 == key2); + EXPECT_FALSE(key1 == key3); + EXPECT_FALSE(key1 == key4); + EXPECT_FALSE(key1 == 
key5); +} + +TEST_F(CacheBlockMetaStoreTest, ClearAllRecords) { + // Add multiple records to the store + const int num_records = 10; + std::vector keys; + + for (int i = 0; i < num_records; ++i) { + uint128_t hash = (static_cast(i) << 64) | (i * 100); + BlockMetaKey key(1, UInt128Wrapper(hash), i * 1024); + FileCacheType type = static_cast(i % 3); + BlockMeta meta(type, 2048 * (i + 1), 3600 + i * 100); + + keys.push_back(key); + meta_store_->put(key, meta); + } + + // Wait for async operations to complete + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + + // Verify all records are present + for (int i = 0; i < num_records; ++i) { + auto result = meta_store_->get(keys[i]); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(static_cast(result->type), i % 3); + EXPECT_EQ(result->size, 2048 * (i + 1)); + EXPECT_EQ(result->ttl, 3600 + i * 100); + } + + // Clear all records + meta_store_->clear(); + + // Wait a bit for clear operation to complete + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Verify all records are gone + for (int i = 0; i < num_records; ++i) { + auto result = meta_store_->get(keys[i]); + EXPECT_FALSE(result.has_value()); + } + + // Verify range query returns no results + auto iterator = meta_store_->range_get(1); + ASSERT_NE(iterator, nullptr) << "Failed to create iterator for tablet 1"; + EXPECT_FALSE(iterator->valid()); +} + +TEST_F(CacheBlockMetaStoreTest, ClearWithPendingAsyncOperations) { + // Add some records and immediately call clear + // This tests that pending operations in the queue are handled correctly + + // Add a record + uint128_t hash1 = (static_cast(123) << 64) | 456; + BlockMetaKey key1(1, UInt128Wrapper(hash1), 0); + BlockMeta meta1(NORMAL, 1024, 3600); + meta_store_->put(key1, meta1); + + // Immediately clear without waiting for async operation + meta_store_->clear(); + + // Wait a bit for operations to complete + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Verify the record 
was not written (cleared from queue) + auto result = meta_store_->get(key1); + EXPECT_FALSE(result.has_value()); +} + +TEST_F(CacheBlockMetaStoreTest, ClearAndThenAddNewRecords) { + // Test that after clear, the store can accept new records + + // Add initial records + uint128_t hash1 = (static_cast(123) << 64) | 456; + BlockMetaKey key1(1, UInt128Wrapper(hash1), 0); + BlockMeta meta1(NORMAL, 1024, 3600); + meta_store_->put(key1, meta1); + + // Wait for async operation + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Clear all records + meta_store_->clear(); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Add new records after clear + uint128_t hash2 = (static_cast(789) << 64) | 123; + BlockMetaKey key2(2, UInt128Wrapper(hash2), 1024); + BlockMeta meta2(INDEX, 2048, 7200); + meta_store_->put(key2, meta2); + + // Wait for async operation + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Verify old record is gone + auto result1 = meta_store_->get(key1); + EXPECT_FALSE(result1.has_value()); + + // Verify new record is present + auto result2 = meta_store_->get(key2); + EXPECT_TRUE(result2.has_value()); + EXPECT_EQ(result2->type, INDEX); + EXPECT_EQ(result2->size, 2048); + EXPECT_EQ(result2->ttl, 7200); +} + +TEST_F(CacheBlockMetaStoreTest, ClearMultipleTimes) { + // Test that clear can be called multiple times without issues + + // Add a record + uint128_t hash = (static_cast(123) << 64) | 456; + BlockMetaKey key(1, UInt128Wrapper(hash), 0); + BlockMeta meta(NORMAL, 1024, 3600); + meta_store_->put(key, meta); + + // Wait for async operation + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Clear multiple times + meta_store_->clear(); + meta_store_->clear(); + meta_store_->clear(); + + // Wait for operations to complete + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Verify record is gone + auto result = meta_store_->get(key); + EXPECT_FALSE(result.has_value()); +} + 
+TEST_F(CacheBlockMetaStoreTest, ClearEmptyStore) { + // Test clearing an empty store (should not crash or error) + + // Clear without adding any records + meta_store_->clear(); + + // Wait a bit + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Verify store is still functional + uint128_t hash = (static_cast(123) << 64) | 456; + BlockMetaKey key(1, UInt128Wrapper(hash), 0); + auto result = meta_store_->get(key); + EXPECT_FALSE(result.has_value()); +} + +TEST_F(CacheBlockMetaStoreTest, GetAllRecords) { + // Add records from multiple tablets + const int num_tablets = 3; + const int blocks_per_tablet = 4; + int total_records = num_tablets * blocks_per_tablet; + + for (int tablet_id = 1; tablet_id <= num_tablets; ++tablet_id) { + for (int i = 0; i < blocks_per_tablet; ++i) { + uint128_t hash = + (static_cast(tablet_id * 100 + i) << 64) | (tablet_id * 200 + i); + BlockMetaKey key(tablet_id, UInt128Wrapper(hash), i * 1024); + BlockMeta meta(static_cast(i % 2), 2048 * (i + 1), 3600 + i * 100); + meta_store_->put(key, meta); + } + } + + // Wait for async operations + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + + // Test get_all method + auto iterator = meta_store_->get_all(); + ASSERT_TRUE(iterator != nullptr); + + int count = 0; + std::set tablet_ids_found; + std::set offsets_found; + + while (iterator->valid()) { + BlockMetaKey key = iterator->key(); + BlockMeta value = iterator->value(); + + // Verify key fields + EXPECT_GT(key.tablet_id, 0); + EXPECT_GE(key.offset, 0); + EXPECT_TRUE(key.hash.value_ > 0); + + // Verify value fields + EXPECT_TRUE(value.type == 0 || value.type == 1); + EXPECT_GT(value.size, 0); + EXPECT_GT(value.ttl, 0); + + // Track what we found + tablet_ids_found.insert(key.tablet_id); + offsets_found.insert(key.offset); + + iterator->next(); + count++; + } + + // Verify we found all records + EXPECT_EQ(count, total_records); + + // Verify we found records from all tablets + EXPECT_EQ(tablet_ids_found.size(), 
num_tablets); + for (int tablet_id = 1; tablet_id <= num_tablets; ++tablet_id) { + EXPECT_TRUE(tablet_ids_found.find(tablet_id) != tablet_ids_found.end()); + } + + // Verify we found various offsets + EXPECT_GE(offsets_found.size(), blocks_per_tablet); +} + +TEST_F(CacheBlockMetaStoreTest, GetAllEmptyStore) { + // Test get_all on empty store + auto iterator = meta_store_->get_all(); + ASSERT_TRUE(iterator != nullptr); + + // Should be invalid immediately + EXPECT_FALSE(iterator->valid()); + + // Calling next should not crash + iterator->next(); + EXPECT_FALSE(iterator->valid()); +} + +TEST_F(CacheBlockMetaStoreTest, GetAllAfterClear) { + // Add some records + for (int i = 0; i < 5; ++i) { + uint128_t hash = (static_cast(100 + i) << 64) | (200 + i); + BlockMetaKey key(1, UInt128Wrapper(hash), i * 1024); + BlockMeta meta(static_cast(i % 2), 2048 * (i + 1), 3600 + i * 100); + meta_store_->put(key, meta); + } + + // Wait for async operations with more reliable mechanism + // Check that all records are actually written by querying each one + int max_retries = 10; + int successful_checks = 0; + for (int retry = 0; retry < max_retries; ++retry) { + successful_checks = 0; + for (int i = 0; i < 5; ++i) { + uint128_t hash = (static_cast(100 + i) << 64) | (200 + i); + BlockMetaKey key(1, UInt128Wrapper(hash), i * 1024); + auto result = meta_store_->get(key); + if (result.has_value()) { + successful_checks++; + } + } + if (successful_checks == 5) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + // Verify all records are present using get_all() + auto iterator1 = meta_store_->get_all(); + int count_before = 0; + while (iterator1->valid()) { + count_before++; + iterator1->next(); + } + EXPECT_EQ(count_before, 5) << "Expected 5 records but found " << count_before; + + // Clear all records + meta_store_->clear(); + + // Wait for clear operation to complete with verification + max_retries = 10; + for (int retry = 0; retry < max_retries; ++retry) 
{ + successful_checks = 0; + for (int i = 0; i < 5; ++i) { + uint128_t hash = (static_cast(100 + i) << 64) | (200 + i); + BlockMetaKey key(1, UInt128Wrapper(hash), i * 1024); + auto result = meta_store_->get(key); + if (!result.has_value()) { + successful_checks++; + } + } + if (successful_checks == 5) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + // Verify no records after clear using get_all() + auto iterator2 = meta_store_->get_all(); + int count_after = 0; + while (iterator2->valid()) { + count_after++; + iterator2->next(); + } + EXPECT_EQ(count_after, 0) << "Expected 0 records after clear but found " << count_after; +} + +TEST_F(CacheBlockMetaStoreTest, GetAllIteratorValidity) { + // Add multiple records + for (int i = 0; i < 10; ++i) { + uint128_t hash = (static_cast(100 + i) << 64) | (200 + i); + BlockMetaKey key(1, UInt128Wrapper(hash), i * 1024); + BlockMeta meta(static_cast(i % 3), 2048 * (i + 1), 3600 + i * 100); + meta_store_->put(key, meta); + } + + // Wait for async operations + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + + // Test iterator validity and navigation + auto iterator = meta_store_->get_all(); + ASSERT_TRUE(iterator != nullptr); + + int count = 0; + while (iterator->valid()) { + BlockMetaKey key = iterator->key(); + BlockMeta value = iterator->value(); + + // Verify consistency between key and value + EXPECT_GE(key.offset, 0); + EXPECT_GT(value.size, 0); + + iterator->next(); + ++count; + } + + EXPECT_EQ(count, 10); + EXPECT_FALSE(iterator->valid()); +} + +TEST_F(CacheBlockMetaStoreTest, MultipleOperationsSameKey) { + uint128_t hash1 = (static_cast(123) << 64) | 456; + BlockMetaKey key1(1, UInt128Wrapper(hash1), 0); + + // Multiple operations on same key + BlockMeta meta1(FileCacheType::NORMAL, 1024, 3600); + BlockMeta meta2(FileCacheType::INDEX, 2048, 7200); + BlockMeta meta3(FileCacheType::TTL, 4096, 10800); + + // Put first value + meta_store_->put(key1, meta1); + 
std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Immediately query - should find first value + auto result1 = meta_store_->get(key1); + EXPECT_TRUE(result1.has_value()); + EXPECT_EQ(result1->type, 1); + + // Put second value (should override first in queue) + meta_store_->put(key1, meta2); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + // Immediately query - should find second value + auto result2 = meta_store_->get(key1); + EXPECT_TRUE(result2.has_value()); + EXPECT_EQ(result2->type, 2); + + // Put third value (should override second in queue) + meta_store_->put(key1, meta3); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + // Immediately query - should find third value + auto result3 = meta_store_->get(key1); + EXPECT_TRUE(result3.has_value()); + EXPECT_EQ(result3->type, 3); + + // Delete operation (should override all puts in queue) + meta_store_->delete_key(key1); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + // Immediately query - should find delete operation + auto result4 = meta_store_->get(key1); + EXPECT_FALSE(result4.has_value()); +} + +TEST_F(CacheBlockMetaStoreTest, ErrorHandling) { + // Test error handling in deserialization functions + + // Test deserialize_key with invalid data + Status status; + auto invalid_key_result = deserialize_key("invalid_key_data", &status); + EXPECT_FALSE(invalid_key_result.has_value()); + EXPECT_FALSE(status.ok()); + EXPECT_TRUE(status.to_string().find("Failed to decode") != std::string::npos); + + // Test deserialize_value with invalid data + auto invalid_value_result = deserialize_value(std::string("invalid_value_data"), &status); + EXPECT_FALSE(invalid_value_result.has_value()); + EXPECT_FALSE(status.ok()); + EXPECT_TRUE(status.to_string().find("Failed to deserialize value") != std::string::npos); + + // Test deserialize_value with empty string_view + std::string_view empty_view; + auto empty_value_result = deserialize_value(empty_view, &status); 
+ EXPECT_FALSE(empty_value_result.has_value()); + EXPECT_FALSE(status.ok()); + + // Test successful deserialization + uint128_t hash = (static_cast(123) << 64) | 456; + BlockMetaKey valid_key(1, UInt128Wrapper(hash), 1024); + std::string valid_key_str = serialize_key(valid_key); + auto valid_key_result = deserialize_key(valid_key_str, &status); + EXPECT_TRUE(valid_key_result.has_value()); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(valid_key_result->tablet_id, 1); + EXPECT_EQ(valid_key_result->hash, valid_key.hash); + EXPECT_EQ(valid_key_result->offset, 1024); + + // Test successful value deserialization + BlockMeta valid_meta(NORMAL, 2048, 3600); + std::string valid_meta_str = serialize_value(valid_meta); + auto valid_meta_result = deserialize_value(valid_meta_str, &status); + EXPECT_TRUE(valid_meta_result.has_value()); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(valid_meta_result->type, NORMAL); + EXPECT_EQ(valid_meta_result->size, 2048); + EXPECT_EQ(valid_meta_result->ttl, 3600); +} + +TEST_F(CacheBlockMetaStoreTest, IteratorErrorHandling) { + // Test error handling in iterators by manually inserting corrupted data + + // Create a valid key and meta first + uint128_t hash = (static_cast(123) << 64) | 456; + BlockMetaKey valid_key(1, UInt128Wrapper(hash), 1024); + BlockMeta valid_meta(NORMAL, 2048, 3600); + + // Put valid data + meta_store_->put(valid_key, valid_meta); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Test range_get iterator error handling + auto iterator = meta_store_->range_get(1); + ASSERT_NE(iterator, nullptr); + + // Iterate through valid data first + int count = 0; + while (iterator->valid()) { + (void)iterator->key(); // unused variable + (void)iterator->value(); // unused variable + + // Check that no errors occurred + EXPECT_TRUE(iterator->get_last_key_error().ok()); + EXPECT_TRUE(iterator->get_last_value_error().ok()); + + iterator->next(); + count++; + } + + EXPECT_EQ(count, 1); + + // Test get_all iterator + auto 
all_iterator = meta_store_->get_all(); + ASSERT_NE(all_iterator, nullptr); + + count = 0; + while (all_iterator->valid()) { + (void)all_iterator->key(); // unused variable + (void)all_iterator->value(); // unused variable + + // Check that no errors occurred + EXPECT_TRUE(all_iterator->get_last_key_error().ok()); + EXPECT_TRUE(all_iterator->get_last_value_error().ok()); + + all_iterator->next(); + count++; + } + + EXPECT_EQ(count, 1); +} + +} // namespace doris::io \ No newline at end of file diff --git a/gensrc/proto/file_cache.proto b/gensrc/proto/file_cache.proto index f11375586aa16c..d6c5c3c0e8c46b 100644 --- a/gensrc/proto/file_cache.proto +++ b/gensrc/proto/file_cache.proto @@ -47,3 +47,17 @@ message LRUDumpMetaPb{ repeated EntryGroupOffsetSizePb group_offset_size = 3; } +// corresponding to FileCacheType in file_cache_common.h +enum FileCacheType { + DISPOSABLE = 0; + NORMAL = 1; + INDEX = 2; + TTL = 3; +} + +message BlockMetaPb { + optional FileCacheType type = 1; + optional uint64 size = 2; + optional uint64 ttl = 3; +} + From 544bf296a5a0ad547bf5f99764ecebdc23d7813e Mon Sep 17 00:00:00 2001 From: zhengyu Date: Thu, 13 Nov 2025 13:36:14 +0800 Subject: [PATCH 02/20] [enhancement](filecache) fine-grained cache space observation (#57783) --- be/src/exec/schema_scanner.cpp | 3 + .../schema_file_cache_info_scanner.cpp | 189 ++++++++++++++++++ .../schema_file_cache_info_scanner.h | 47 +++++ be/src/io/cache/block_file_cache.h | 3 + be/src/io/cache/block_file_cache_factory.h | 3 + .../doris/analysis/SchemaTableType.java | 2 + .../org/apache/doris/catalog/SchemaTable.java | 11 + gensrc/thrift/Descriptors.thrift | 7 +- .../cache/test_file_cache_info.groovy | 129 ++++++++++++ 9 files changed, 391 insertions(+), 3 deletions(-) create mode 100644 be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp create mode 100644 be/src/exec/schema_scanner/schema_file_cache_info_scanner.h create mode 100644 regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy 
diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp index 92c29ab6ace6d3..418c78b1652946 100644 --- a/be/src/exec/schema_scanner.cpp +++ b/be/src/exec/schema_scanner.cpp @@ -39,6 +39,7 @@ #include "exec/schema_scanner/schema_columns_scanner.h" #include "exec/schema_scanner/schema_dummy_scanner.h" #include "exec/schema_scanner/schema_encryption_keys_scanner.h" +#include "exec/schema_scanner/schema_file_cache_info_scanner.h" #include "exec/schema_scanner/schema_file_cache_statistics.h" #include "exec/schema_scanner/schema_files_scanner.h" #include "exec/schema_scanner/schema_load_job_scanner.h" @@ -258,6 +259,8 @@ std::unique_ptr SchemaScanner::create(TSchemaTableType::type type return SchemaClusterSnapshotPropertiesScanner::create_unique(); case TSchemaTableType::SCH_COLUMN_DATA_SIZES: return SchemaColumnDataSizesScanner::create_unique(); + case TSchemaTableType::SCH_FILE_CACHE_INFO: + return SchemaFileCacheInfoScanner::create_unique(); default: return SchemaDummyScanner::create_unique(); break; diff --git a/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp new file mode 100644 index 00000000000000..9734dbfe44bbf7 --- /dev/null +++ b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp @@ -0,0 +1,189 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exec/schema_scanner/schema_file_cache_info_scanner.h" + +#include "io/cache/file_cache_common.h" +#include "runtime/exec_env.h" +#include "runtime/runtime_state.h" +#include "vec/common/string_ref.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type_factory.hpp" + +namespace doris { +#include "common/compile_check_begin.h" + +std::vector SchemaFileCacheInfoScanner::_s_tbls_columns = { + // name, type, size, is_null + {"HASH", TYPE_STRING, sizeof(StringRef), true}, + {"TABLET_ID", TYPE_BIGINT, sizeof(int64_t), true}, + {"SIZE", TYPE_BIGINT, sizeof(int64_t), true}, + {"TYPE", TYPE_STRING, sizeof(StringRef), true}, + {"REMOTE_PATH", TYPE_STRING, sizeof(StringRef), true}, + {"CACHE_PATH", TYPE_STRING, sizeof(StringRef), true}, + {"BE_ID", TYPE_BIGINT, sizeof(int64_t), true}}; + +SchemaFileCacheInfoScanner::SchemaFileCacheInfoScanner() + : SchemaScanner(_s_tbls_columns, TSchemaTableType::SCH_FILE_CACHE_INFO) {} + +SchemaFileCacheInfoScanner::~SchemaFileCacheInfoScanner() {} + +Status SchemaFileCacheInfoScanner::start(RuntimeState* state) { + return Status::OK(); +} + +Status SchemaFileCacheInfoScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { + if (!_is_init) { + return Status::InternalError("Used before initialized."); + } + + if (nullptr == block || nullptr == eos) { + return Status::InternalError("input pointer is nullptr."); + } + + *eos = true; + return _fill_block_impl(block); +} + +Status SchemaFileCacheInfoScanner::_fill_block_impl(vectorized::Block* block) { + SCOPED_TIMER(_fill_block_timer); + 
+ auto* file_cache_factory = ExecEnv::GetInstance()->file_cache_factory(); + if (!file_cache_factory) { + return Status::OK(); + } + + // Collect all cache entries from all file cache instances + std::vector> cache_entries; + + // Get all cache instances using the public getter + const auto& caches = file_cache_factory->get_caches(); + for (const auto& cache : caches) { + const std::string& cache_path = cache->get_base_path(); + + // Get the storage from cache using the public getter + auto* storage = cache->get_storage(); + if (!storage) { + continue; + } + + // Try to get meta_store from FSFileCacheStorage using the public getter + auto* fs_storage = dynamic_cast(storage); + if (!fs_storage) { + continue; + } + + auto* meta_store = fs_storage->get_meta_store(); + if (!meta_store) { + continue; + } + + // Get iterator for all BlockMeta records + auto iterator = meta_store->get_all(); + if (!iterator) { + continue; + } + + // Iterate through all cache entries + while (iterator->valid()) { + const auto& key = iterator->key(); + const auto& value = iterator->value(); + + // Check for deserialization errors + if (!iterator->get_last_key_error().ok() || !iterator->get_last_value_error().ok()) { + LOG(WARNING) << "Failed to deserialize cache block metadata: " + << "key_error=" << iterator->get_last_key_error().to_string() + << ", value_error=" << iterator->get_last_value_error().to_string(); + iterator->next(); + continue; // Skip invalid records + } + + // Convert hash to string + std::string hash_str = key.hash.to_string(); + + // Add to cache entries + cache_entries.emplace_back(hash_str, key.tablet_id, value.size, value.type, cache_path); + + iterator->next(); + } + } + + const size_t row_num = cache_entries.size(); + if (row_num == 0) { + return Status::OK(); + } + + for (size_t col_idx = 0; col_idx < _s_tbls_columns.size(); ++col_idx) { + const auto& col_desc = _s_tbls_columns[col_idx]; + + std::vector str_refs(row_num); + std::vector int64_vals(row_num); + 
std::vector datas(row_num); + std::vector column_values(row_num); + + for (size_t row_idx = 0; row_idx < row_num; ++row_idx) { + const auto& entry = cache_entries[row_idx]; + const auto& [hash, tablet_id, size, type, cache_path] = entry; + + if (col_desc.type == TYPE_STRING) { + switch (col_idx) { + case 0: // HASH + column_values[row_idx] = hash; + break; + case 3: // TYPE + column_values[row_idx] = doris::io::cache_type_to_string( + static_cast(type)); + break; + case 4: // REMOTE_PATH + column_values[row_idx] = ""; // TODO: Implement remote path retrieval + break; + case 5: // CACHE_PATH + column_values[row_idx] = cache_path; + break; + default: + column_values[row_idx] = ""; + break; + } + str_refs[row_idx] = + StringRef(column_values[row_idx].data(), column_values[row_idx].size()); + datas[row_idx] = &str_refs[row_idx]; + } else if (col_desc.type == TYPE_BIGINT) { + switch (col_idx) { + case 1: // TABLET_ID + int64_vals[row_idx] = tablet_id; + break; + case 2: // SIZE + int64_vals[row_idx] = size; + break; + case 6: // BE_ID + int64_vals[row_idx] = ExecEnv::GetInstance()->cluster_info()->backend_id; + break; + default: + int64_vals[row_idx] = 0; + break; + } + datas[row_idx] = &int64_vals[row_idx]; + } + } + + RETURN_IF_ERROR(fill_dest_column_for_range(block, col_idx, datas)); + } + + return Status::OK(); +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/exec/schema_scanner/schema_file_cache_info_scanner.h b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.h new file mode 100644 index 00000000000000..2efe6f76cdfc94 --- /dev/null +++ b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "common/status.h" +#include "exec/schema_scanner.h" + +namespace doris { +class RuntimeState; +namespace vectorized { +class Block; +} // namespace vectorized + +class SchemaFileCacheInfoScanner : public SchemaScanner { + ENABLE_FACTORY_CREATOR(SchemaFileCacheInfoScanner); + +public: + SchemaFileCacheInfoScanner(); + ~SchemaFileCacheInfoScanner() override; + + Status start(RuntimeState* state) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; + + static std::vector _s_tbls_columns; + +private: + Status _fill_block_impl(vectorized::Block* block); +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index ef1df34b26beaa..83bc7a831a3226 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -214,6 +214,9 @@ class BlockFileCache { [[nodiscard]] const std::string& get_base_path() const { return _cache_base_path; } + // Get storage for inspection + FileCacheStorage* get_storage() const { return _storage.get(); } + /** * Given an `offset` and `size` representing [offset, offset + size) bytes interval, * return list of cached non-overlapping non-empty diff --git a/be/src/io/cache/block_file_cache_factory.h b/be/src/io/cache/block_file_cache_factory.h index 837feac7f68543..3031076336c3ef 
100644 --- a/be/src/io/cache/block_file_cache_factory.h +++ b/be/src/io/cache/block_file_cache_factory.h @@ -102,6 +102,9 @@ class FileCacheFactory { void get_cache_stats_block(vectorized::Block* block); + // Get all cache instances for inspection + const std::vector>& get_caches() const { return _caches; } + FileCacheFactory() = default; FileCacheFactory& operator=(const FileCacheFactory&) = delete; FileCacheFactory(const FileCacheFactory&) = delete; diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java index 5576c6d294be9d..ca58b311ec4a2c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java @@ -98,6 +98,8 @@ public enum SchemaTableType { TSchemaTableType.SCH_ROUTINE_LOAD_JOBS), SCH_LOAD_JOBS("LOAD_JOBS", "LOAD_JOBS", TSchemaTableType.SCH_LOAD_JOBS), + SCH_FILE_CACHE_INFO("FILE_CACHE_INFO", "FILE_CACHE_INFO", + TSchemaTableType.SCH_FILE_CACHE_INFO), SCH_VIEW_DEPENDENCY("VIEW_DEPENDENCY", "VIEW_DEPENDENCY", TSchemaTableType.SCH_VIEW_DEPENDENCY), SQL_BLOCK_RULE_STATUS("SQL_BLOCK_RULE_STATUS", "SQL_BLOCK_RULE_STATUS", diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java index e9ad892ddd73c6..eca50ab9de6c82 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java @@ -698,6 +698,17 @@ public class SchemaTable extends Table { .column("FIRST_ERROR_MSG", ScalarType.createStringType()) .build()) ) + .put("file_cache_info", + new SchemaTable(SystemIdGenerator.getNextId(), "file_cache_info", TableType.SCHEMA, + builder().column("HASH", ScalarType.createStringType()) + .column("TABLET_ID", ScalarType.createType(PrimitiveType.BIGINT)) + .column("SIZE", 
ScalarType.createType(PrimitiveType.BIGINT)) + .column("TYPE", ScalarType.createStringType()) + .column("REMOTE_PATH", ScalarType.createStringType()) + .column("CACHE_PATH", ScalarType.createStringType()) + .column("BE_ID", ScalarType.createType(PrimitiveType.BIGINT)) + .build()) + ) .put("backend_tablets", new SchemaTable(SystemIdGenerator.getNextId(), "backend_tablets", TableType.SCHEMA, builder().column("BE_ID", ScalarType.createType(PrimitiveType.BIGINT)) .column("TABLET_ID", ScalarType.createType(PrimitiveType.BIGINT)) diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift index 411af7b54820b5..165b1a1b80bdce 100644 --- a/gensrc/thrift/Descriptors.thrift +++ b/gensrc/thrift/Descriptors.thrift @@ -205,9 +205,10 @@ enum TSchemaTableType { SCH_SQL_BLOCK_RULE_STATUS = 59; SCH_CLUSTER_SNAPSHOTS = 60; SCH_CLUSTER_SNAPSHOT_PROPERTIES = 61; - SCH_BLACKHOLE = 62; - SCH_COLUMN_DATA_SIZES = 63; - SCH_LOAD_JOBS = 64; + SCH_FILE_CACHE_INFO = 62; + SCH_BLACKHOLE = 63; + SCH_COLUMN_DATA_SIZES = 64; + SCH_LOAD_JOBS = 65; } enum THdfsCompression { diff --git a/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy b/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy new file mode 100644 index 00000000000000..f018889bb7adf5 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_file_cache_info") { + def custoBeConfig = [ + enable_evict_file_cache_in_advance : false, + file_cache_enter_disk_resource_limit_mode_percent : 99 + ] + + setBeConfigTemporary(custoBeConfig) { + + String[][] backends = sql """ show backends """ + def backendSockets = [] + def backendIdToBackendIP = [:] + def backendIdToBackendHttpPort = [:] + for (String[] backend in backends) { + if (backend[9].equals("true")) { + backendIdToBackendIP.put(backend[0], backend[1]) + backendIdToBackendHttpPort.put(backend[0], backend[4]) + } + } + assertTrue(backendIdToBackendIP.size() > 0, "No alive backends found") + + backendIdToBackendIP.each { backendId, ip -> + def socket = ip + ":" + backendIdToBackendHttpPort.get(backendId) + backendSockets.add(socket) + } + + sql "drop table IF EXISTS customer" + + sql """ + CREATE TABLE IF NOT EXISTS customer ( + `c_custkey` int NULL, + `c_name` string NULL, + `c_address` string NULL, + `c_city` string NULL, + `c_nation` string NULL, + `c_region` string NULL, + `c_phone` string NULL, + `c_mktsegment` string NULL + ) + DUPLICATE KEY(`c_custkey`) + DISTRIBUTED BY HASH(`c_custkey`) BUCKETS 1 // only 1 tablet + PROPERTIES ( + "file_cache_ttl_seconds" = "3600" + ) + """ + + sql """ + insert into customer values + (1, 'Customer#000000001', 'address1', 'city1', 'nation1', 'region1', 'phone1', 'segment1'), + (2, 'Customer#000000002', 'address2', 'city2', 'nation2', 'region2', 'phone2', 'segment2'), + (3, 'Customer#000000003', 'address3', 'city3', 'nation3', 'region3', 'phone3', 'segment3'), + (4, 
'Customer#000000004', 'address4', 'city4', 'nation4', 'region4', 'phone4', 'segment4'), + (5, 'Customer#000000005', 'address5', 'city5', 'nation5', 'region5', 'phone5', 'segment5') + """ + sql "sync" + + sql "select count(*) from customer" + + Thread.sleep(10000) + + def get_tablet_id = { String tbl_name -> + def tablets = sql "show tablets from ${tbl_name}" + assertEquals(tablets.size(), 1, "Should have exactly one tablet with BUCKETS=1") + return tablets[0][0] as Long + } + + def tablet_id = get_tablet_id("customer") + println "Tablet ID: ${tablet_id}" + + def cache_info = sql "select * from information_schema.file_cache_info" + + assertTrue(cache_info.size() > 0, "file_cache_info should not be empty for tablet_id ${tablet_id}") + + println "First query - File cache info for tablet_id ${tablet_id}:" + cache_info.each { row -> + println " ${row}" + } + + def clearResults = [] + backendSockets.each { socket -> + httpTest { + endpoint "" + uri socket + "/api/file_cache?op=clear&sync=true" + op "get" + check {respCode, body -> + assertEquals(respCode, 200, "clear local cache fail, maybe you can find something in respond: " + parseJson(body)) + clearResults.add(true) + } + } + } + assertEquals(clearResults.size(), backendSockets.size(), "Failed to clear cache on some backends") + + Thread.sleep(5000) + + def cache_info_after_clear = sql "select * from information_schema.file_cache_info where tablet_id = ${tablet_id}" + assertEquals(cache_info_after_clear.size(), 0, "file_cache_info should be empty after clearing cache") + + println "After clearing cache - File cache info is empty as expected" + + sql "select * from customer" + + Thread.sleep(10000) + + def cache_info_reloaded = sql "select * from information_schema.file_cache_info where tablet_id = ${tablet_id}" + assertTrue(cache_info_reloaded.size() > 0, "file_cache_info should not be empty after reloading data") + + println "After reloading data - File cache info for tablet_id ${tablet_id}:" + 
cache_info_reloaded.each { row -> + println " ${row}" + } + + } +} \ No newline at end of file From 4f2aaa027278a196794f4dabf42f2c1884c5b358 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Fri, 14 Nov 2025 20:05:55 +0800 Subject: [PATCH 03/20] fix after rebase Signed-off-by: zhengyu --- be/src/io/cache/cached_remote_file_reader.cpp | 1 - be/src/io/cache/file_cache_common.cpp | 28 ------------------- be/src/io/cache/file_cache_common.h | 11 ++++++++ be/src/io/cache/file_cache_storage.h | 4 +++ be/src/io/cache/fs_file_cache_storage.h | 3 ++ 5 files changed, 18 insertions(+), 29 deletions(-) diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index dc8fd795dde382..d095e859badbcd 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -43,7 +43,6 @@ #include "io/cache/block_file_cache_profile.h" #include "io/cache/file_block.h" #include "io/cache/file_cache_common.h" -#include "io/cache/peer_file_cache_reader.h" #include "io/fs/file_reader.h" #include "io/fs/local_file_system.h" #include "io/io_common.h" diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index 85af8b330a190d..6bfd355b72cc7b 100644 --- a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -155,34 +155,6 @@ std::string FileCacheInfo::to_string() const { return ss.str(); } -std::string InconsistencyType::to_string() const { - std::string result = "Inconsistency Reason: "; - if (type == NONE) { - result += "NONE"; - } else { - if (type & NOT_LOADED) { - result += "NOT_LOADED "; - } - if (type & MISSING_IN_STORAGE) { - result += "MISSING_IN_STORAGE "; - } - if (type & SIZE_INCONSISTENT) { - result += "SIZE_INCONSISTENT "; - } - if (type & CACHE_TYPE_INCONSISTENT) { - result += "CACHE_TYPE_INCONSISTENT "; - } - if (type & EXPIRATION_TIME_INCONSISTENT) { - result += "EXPIRATION_TIME_INCONSISTENT "; - } - if (type & 
TMP_FILE_EXPECT_DOWNLOADING_STATE) { - result += "TMP_FILE_EXPECT_DOWNLOADING_STATE"; - } - } - result += "\n"; - return result; -} - std::optional get_tablet_id(std::string file_path) { // Expected path formats: // support both .dat and .idx file extensions diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index 9181c61c55d995..02b759efbca728 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -263,6 +263,17 @@ class LRUQueue { int64_t hot_data_interval {0}; }; +struct FileCacheInfo { + UInt128Wrapper hash {0}; + uint64_t expiration_time {0}; + uint64_t size {0}; + size_t offset {0}; + bool is_tmp {false}; + FileCacheType cache_type {NORMAL}; + + std::string to_string() const; +}; + std::optional get_tablet_id(std::string file_path); } // namespace doris::io diff --git a/be/src/io/cache/file_cache_storage.h b/be/src/io/cache/file_cache_storage.h index e3c0c0c9aea583..226001c7109afe 100644 --- a/be/src/io/cache/file_cache_storage.h +++ b/be/src/io/cache/file_cache_storage.h @@ -68,6 +68,10 @@ class FileCacheStorage { virtual FileCacheStorageType get_type() = 0; // get local cached file virtual std::string get_local_file(const FileCacheKey& key) = 0; + virtual Status get_file_cache_infos(std::vector& infos, + std::lock_guard& cache_lock) const { + return Status::OK(); + }; }; } // namespace doris::io diff --git a/be/src/io/cache/fs_file_cache_storage.h b/be/src/io/cache/fs_file_cache_storage.h index daf0acef82d67c..4a45726186c1ca 100644 --- a/be/src/io/cache/fs_file_cache_storage.h +++ b/be/src/io/cache/fs_file_cache_storage.h @@ -119,6 +119,9 @@ class FSFileCacheStorage : public FileCacheStorage { void load_cache_info_into_memory_from_fs(BlockFileCache* _mgr) const; void load_cache_info_into_memory_from_db(BlockFileCache* _mgr) const; + Status get_file_cache_infos(std::vector& infos, + std::lock_guard& cache_lock) const override; + std::string _cache_base_path; std::thread 
_cache_background_load_thread; const std::shared_ptr& fs = global_local_filesystem(); From 1df56c19799277b83cfd9b1511d8ec67aa0cd9cd Mon Sep 17 00:00:00 2001 From: zhengyu Date: Mon, 17 Nov 2025 14:49:20 +0800 Subject: [PATCH 04/20] [fix](filecache) fix out-of-range exception when external query add check for external data (lakehouse data) when get_tablet_id to avoid out-of-range exception Signed-off-by: zhengyu --- be/src/io/cache/file_cache_common.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index 6bfd355b72cc7b..5e58052d7ce12f 100644 --- a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -172,6 +172,10 @@ std::optional get_tablet_id(std::string file_path) { return std::nullopt; } + if (data_prefix.length() + data_pos >= path_view.length()) { + return std::nullopt; + } + // Extract the part after "data/" path_view = path_view.substr(data_pos + data_prefix.length() + 1); From a0fd5ab43c52219a6a7eca8ad550d84bdc509879 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Mon, 17 Nov 2025 16:26:37 +0800 Subject: [PATCH 05/20] add tests Signed-off-by: zhengyu --- be/test/io/cache/block_file_cache_test.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/be/test/io/cache/block_file_cache_test.cpp b/be/test/io/cache/block_file_cache_test.cpp index f681d31a9af708..568539bd5197a2 100644 --- a/be/test/io/cache/block_file_cache_test.cpp +++ b/be/test/io/cache/block_file_cache_test.cpp @@ -19,6 +19,8 @@ // and modified by Doris #include "block_file_cache_test_common.h" +#include "olap/olap_define.h" + namespace doris::io { fs::path caches_dir = fs::current_path() / "lru_cache_test"; @@ -7415,6 +7417,13 @@ TEST_F(BlockFileCacheTest, reader_dryrun_when_download_file_cache) { config::enable_reader_dryrun_when_download_file_cache = org; } +TEST_F(BlockFileCacheTest, cached_remote_file_reader_tablet_id_guard) { + // Ensure get_tablet_id gracefully returns 
nullopt + std::string fake_path = "/mnt/data"; + auto tablet_id = get_tablet_id(fake_path); + EXPECT_FALSE(tablet_id.has_value()); +} + void move_dir_to_version1(const std::string& dirPath) { try { // layer 1 From 50eff5642200198adc1f1d459adc701fd75e5ad4 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Mon, 17 Nov 2025 16:29:08 +0800 Subject: [PATCH 06/20] fix regression Signed-off-by: zhengyu --- .../suites/cloud_p0/cache/test_file_cache_info.groovy | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy b/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy index f018889bb7adf5..a7e977a048d734 100644 --- a/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy +++ b/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy @@ -54,7 +54,7 @@ suite("test_file_cache_info") { `c_mktsegment` string NULL ) DUPLICATE KEY(`c_custkey`) - DISTRIBUTED BY HASH(`c_custkey`) BUCKETS 1 // only 1 tablet + DISTRIBUTED BY HASH(`c_custkey`) BUCKETS 1 PROPERTIES ( "file_cache_ttl_seconds" = "3600" ) @@ -126,4 +126,5 @@ suite("test_file_cache_info") { } } -} \ No newline at end of file +} + From 54043744efd73fe923924d84514db3e3d8b04be2 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Tue, 9 Dec 2025 09:27:09 +0800 Subject: [PATCH 07/20] [refactor](filecache) ttl management refactoring --- be/src/cloud/cloud_internal_service.cpp | 8 +- be/src/cloud/cloud_storage_engine.cpp | 29 +- be/src/cloud/cloud_storage_engine.h | 19 +- be/src/cloud/cloud_tablet.cpp | 13 +- be/src/cloud/cloud_tablet_mgr.cpp | 47 ++- be/src/cloud/cloud_tablet_mgr.h | 15 +- be/src/cloud/cloud_warm_up_manager.cpp | 8 +- be/src/common/config.cpp | 4 +- be/src/common/config.h | 2 + be/src/io/cache/block_file_cache.cpp | 365 +++--------------- be/src/io/cache/block_file_cache.h | 18 +- be/src/io/cache/block_file_cache_ttl_mgr.cpp | 261 +++++++++++++ be/src/io/cache/block_file_cache_ttl_mgr.h | 78 ++++ 
be/src/io/cache/cache_block_meta_store.cpp | 5 + be/src/io/cache/cache_block_meta_store.h | 1 + be/src/io/cache/cached_remote_file_reader.cpp | 1 + be/src/io/cache/file_cache_common.cpp | 1 + be/src/io/cache/fs_file_cache_storage.cpp | 8 +- be/src/io/fs/file_writer.h | 4 +- be/src/io/tools/file_cache_microbench.cpp | 2 +- be/src/olap/rowset/beta_rowset_reader.cpp | 8 +- be/src/olap/rowset/rowset_writer_context.h | 11 +- be/src/olap/storage_engine.cpp | 17 +- be/src/olap/storage_engine.h | 24 +- be/src/runtime/exec_env.cpp | 14 +- be/src/runtime/exec_env.h | 13 +- be/test/io/cache/block_file_cache_test.cpp | 107 ++--- .../cache/block_file_cache_test_lru_dump.cpp | 2 +- .../block_file_cache_test_meta_store.cpp | 2 +- .../cache/block_file_cache_ttl_mgr_test.cpp | 348 +++++++++++++++++ .../data/cloud_p0/cache/ttl/test_show_ttl.out | 4 +- .../cloud_p0/cache/ttl/alter_ttl_1.groovy | 92 +++-- .../cloud_p0/cache/ttl/alter_ttl_2.groovy | 92 +++-- .../cloud_p0/cache/ttl/alter_ttl_3.groovy | 92 +++-- .../cache/ttl/alter_ttl_seconds.groovy | 105 +++-- .../cache/ttl/create_table_as_select.groovy | 5 +- .../cache/ttl/create_table_like.groovy | 5 +- .../suites/cloud_p0/cache/ttl/test_ttl.groovy | 100 +++-- .../cache/ttl/test_ttl_lru_evict.groovy | 12 + ..._clean_tablet_when_drop_force_table.groovy | 24 ++ 40 files changed, 1343 insertions(+), 623 deletions(-) create mode 100644 be/src/io/cache/block_file_cache_ttl_mgr.cpp create mode 100644 be/src/io/cache/block_file_cache_ttl_mgr.h create mode 100644 be/test/io/cache/block_file_cache_ttl_mgr_test.cpp diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index 2584ce8146b534..ce979079403aac 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -425,13 +425,7 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c << " us, tablet_id: " << rs_meta.tablet_id() << ", rowset_id: " << rowset_id.to_string(); } - int64_t 
expiration_time = - tablet_meta->ttl_seconds() == 0 || rs_meta.newest_write_timestamp() <= 0 - ? 0 - : rs_meta.newest_write_timestamp() + tablet_meta->ttl_seconds(); - if (expiration_time <= UnixSeconds()) { - expiration_time = 0; - } + int64_t expiration_time = tablet_meta->ttl_seconds(); if (!tablet->add_rowset_warmup_state(rs_meta, WarmUpTriggerSource::EVENT_DRIVEN)) { LOG(INFO) << "found duplicate warmup task for rowset " << rowset_id.to_string() diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index 8529fb0ed92572..077c1af26e0495 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -280,11 +280,36 @@ bool CloudStorageEngine::stopped() { Result CloudStorageEngine::get_tablet(int64_t tablet_id, SyncRowsetStats* sync_stats, - bool force_use_only_cached) { - return _tablet_mgr->get_tablet(tablet_id, false, true, sync_stats, force_use_only_cached) + bool force_use_only_cached, + bool cache_on_miss) { + return _tablet_mgr + ->get_tablet(tablet_id, false, true, sync_stats, force_use_only_cached, cache_on_miss) .transform([](auto&& t) { return static_pointer_cast(std::move(t)); }); } +Status CloudStorageEngine::get_tablet_meta(int64_t tablet_id, TabletMetaSharedPtr* tablet_meta, + bool force_use_only_cached) { + if (tablet_meta == nullptr) { + return Status::InvalidArgument("tablet_meta output is null"); + } + +#if 0 + if (_tablet_mgr && _tablet_mgr->peek_tablet_meta(tablet_id, tablet_meta)) { + return Status::OK(); + } + + if (force_use_only_cached) { + return Status::NotFound("tablet meta {} not found in cache", tablet_id); + } +#endif + + if (_meta_mgr == nullptr) { + return Status::InternalError("cloud meta manager is not initialized"); + } + + return _meta_mgr->get_tablet_meta(tablet_id, tablet_meta); +} + Status CloudStorageEngine::start_bg_threads(std::shared_ptr wg_sptr) { RETURN_IF_ERROR(Thread::create( "CloudStorageEngine", "refresh_s3_info_thread", diff --git 
a/be/src/cloud/cloud_storage_engine.h b/be/src/cloud/cloud_storage_engine.h index 0b61fe2076200d..e83b8808d17d2f 100644 --- a/be/src/cloud/cloud_storage_engine.h +++ b/be/src/cloud/cloud_storage_engine.h @@ -62,8 +62,25 @@ class CloudStorageEngine final : public BaseStorageEngine { void stop() override; bool stopped() override; + /* Parameters: + * - tablet_id: the id of tablet to get + * - sync_stats: the stats of sync rowset + * - force_use_only_cached: whether only use cached tablet meta + * - cache_on_miss: whether cache the tablet meta when missing in cache + */ Result get_tablet(int64_t tablet_id, SyncRowsetStats* sync_stats = nullptr, - bool force_use_only_cached = false) override; + bool force_use_only_cached = false, + bool cache_on_miss = true) override; + + /* + * Get the tablet meta for a specific tablet + * Parameters: + * - tablet_id: the id of tablet to get meta for + * - tablet_meta: output TabletMeta shared pointer + * - force_use_only_cached: whether only use cached tablet meta (return NotFound on miss) + */ + Status get_tablet_meta(int64_t tablet_id, TabletMetaSharedPtr* tablet_meta, + bool force_use_only_cached = false) override; Status start_bg_threads(std::shared_ptr wg_sptr = nullptr) override; diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index ea5f52366abbe9..05950e1a3a2d61 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -404,12 +404,7 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ continue; } - int64_t expiration_time = - _tablet_meta->ttl_seconds() == 0 || - rowset_meta->newest_write_timestamp() <= 0 - ? 
0 - : rowset_meta->newest_write_timestamp() + - _tablet_meta->ttl_seconds(); + int64_t expiration_time = _tablet_meta->ttl_seconds(); g_file_cache_cloud_tablet_submitted_segment_num << 1; if (rs->rowset_meta()->segment_file_size(seg_id) > 0) { g_file_cache_cloud_tablet_submitted_segment_size @@ -1533,16 +1528,12 @@ Status CloudTablet::sync_meta() { auto new_ttl_seconds = tablet_meta->ttl_seconds(); if (_tablet_meta->ttl_seconds() != new_ttl_seconds) { _tablet_meta->set_ttl_seconds(new_ttl_seconds); - int64_t cur_time = UnixSeconds(); std::shared_lock rlock(_meta_lock); for (auto& [_, rs] : _rs_version_map) { for (int seg_id = 0; seg_id < rs->num_segments(); ++seg_id) { - int64_t new_expiration_time = - new_ttl_seconds + rs->rowset_meta()->newest_write_timestamp(); - new_expiration_time = new_expiration_time > cur_time ? new_expiration_time : 0; auto file_key = Segment::file_cache_key(rs->rowset_id().to_string(), seg_id); auto* file_cache = io::FileCacheFactory::instance()->get_by_path(file_key); - file_cache->modify_expiration_time(file_key, new_expiration_time); + file_cache->modify_expiration_time(file_key, new_ttl_seconds); } } } diff --git a/be/src/cloud/cloud_tablet_mgr.cpp b/be/src/cloud/cloud_tablet_mgr.cpp index a3996d4e61eccd..41d68167eaf738 100644 --- a/be/src/cloud/cloud_tablet_mgr.cpp +++ b/be/src/cloud/cloud_tablet_mgr.cpp @@ -160,7 +160,8 @@ void set_tablet_access_time_ms(CloudTablet* tablet) { Result> CloudTabletMgr::get_tablet(int64_t tablet_id, bool warmup_data, bool sync_delete_bitmap, SyncRowsetStats* sync_stats, - bool force_use_only_cached) { + bool force_use_only_cached, + bool cache_on_miss) { // LRU value type. 
`Value`'s lifetime MUST NOT be longer than `CloudTabletMgr` class Value : public LRUCacheValueBase { public: @@ -193,7 +194,7 @@ Result> CloudTabletMgr::get_tablet(int64_t tablet_i if (sync_stats) { ++sync_stats->tablet_meta_cache_miss; } - auto load_tablet = [this, &key, warmup_data, sync_delete_bitmap, + auto load_tablet = [this, warmup_data, sync_delete_bitmap, sync_stats](int64_t tablet_id) -> Result> { TabletMetaSharedPtr tablet_meta; auto start = std::chrono::steady_clock::now(); @@ -209,7 +210,6 @@ Result> CloudTabletMgr::get_tablet(int64_t tablet_i } auto tablet = std::make_shared(_engine, std::move(tablet_meta)); - auto value = std::make_unique(tablet, *_tablet_map); // MUST sync stats to let compaction scheduler work correctly SyncOptions options; options.warmup_delta_data = warmup_data; @@ -219,16 +219,7 @@ Result> CloudTabletMgr::get_tablet(int64_t tablet_i LOG(WARNING) << "failed to sync tablet " << tablet_id << ": " << st; return ResultError(st); } - - auto* handle = _cache->insert(key, value.release(), 1, sizeof(CloudTablet), - CachePriority::NORMAL); - auto ret = - std::shared_ptr(tablet.get(), [this, handle](CloudTablet* tablet) { - set_tablet_access_time_ms(tablet); - _cache->release(handle); - }); - _tablet_map->put(std::move(tablet)); - return ret; + return tablet; }; auto load_result = s_singleflight_load_tablet.load(tablet_id, std::move(load_tablet)); @@ -237,8 +228,22 @@ Result> CloudTabletMgr::get_tablet(int64_t tablet_i load_result.error())); } auto tablet = load_result.value(); - set_tablet_access_time_ms(tablet.get()); - return tablet; + if (!cache_on_miss) { + set_tablet_access_time_ms(tablet.get()); + return tablet; + } + + auto value = std::make_unique(tablet, *_tablet_map); + auto* insert_handle = + _cache->insert(key, value.release(), 1, sizeof(CloudTablet), CachePriority::NORMAL); + auto ret = std::shared_ptr(tablet.get(), + [this, insert_handle](CloudTablet* tablet_ptr) { + set_tablet_access_time_ms(tablet_ptr); + 
_cache->release(insert_handle); + }); + _tablet_map->put(std::move(tablet)); + set_tablet_access_time_ms(ret.get()); + return ret; } if (sync_stats) { ++sync_stats->tablet_meta_cache_hit; @@ -252,6 +257,18 @@ Result> CloudTabletMgr::get_tablet(int64_t tablet_i return tablet; } +bool CloudTabletMgr::peek_tablet_meta(int64_t tablet_id, TabletMetaSharedPtr* tablet_meta) { + if (tablet_meta == nullptr) { + return false; + } + auto tablet = _tablet_map->get(tablet_id); + if (!tablet) { + return false; + } + *tablet_meta = tablet->tablet_meta(); + return true; +} + void CloudTabletMgr::erase_tablet(int64_t tablet_id) { auto tablet_id_str = std::to_string(tablet_id); CacheKey key(tablet_id_str.data(), tablet_id_str.size()); diff --git a/be/src/cloud/cloud_tablet_mgr.h b/be/src/cloud/cloud_tablet_mgr.h index d7dde2134acc18..debd0c3fc17ae7 100644 --- a/be/src/cloud/cloud_tablet_mgr.h +++ b/be/src/cloud/cloud_tablet_mgr.h @@ -26,6 +26,7 @@ #include "common/status.h" #include "olap/olap_common.h" +#include "olap/tablet_fwd.h" namespace doris { @@ -44,10 +45,22 @@ class CloudTabletMgr { // If the tablet is in cache, return this tablet directly; otherwise will get tablet meta first, // sync rowsets after, and download segment data in background if `warmup_data` is true. + /* Parameters: + * - tablet_id: the id of tablet to get + * - warmup_data: whether warmup tablet data in background + * - sync_delete_bitmap: whether sync delete bitmap when getting tablet + * - sync_stats: the stats of sync rowset + * - force_use_only_cached: whether only use cached tablet meta + * - cache_on_miss: whether cache the tablet meta when missing in cache + */ Result> get_tablet(int64_t tablet_id, bool warmup_data = false, bool sync_delete_bitmap = true, SyncRowsetStats* sync_stats = nullptr, - bool local_only = false); + bool force_use_only_cached = false, + bool cache_on_miss = true); + + // Return true if cached tablet meta is found (without triggering RPC) and filled. 
+ bool peek_tablet_meta(int64_t tablet_id, TabletMetaSharedPtr* tablet_meta); void erase_tablet(int64_t tablet_id); diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index fb1a005c22ad4e..2883d0e63a4bdc 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ b/be/src/cloud/cloud_warm_up_manager.cpp @@ -234,13 +234,7 @@ void CloudWarmUpManager::handle_jobs() { continue; } - int64_t expiration_time = - tablet_meta->ttl_seconds() == 0 || rs->newest_write_timestamp() <= 0 - ? 0 - : rs->newest_write_timestamp() + tablet_meta->ttl_seconds(); - if (expiration_time <= UnixSeconds()) { - expiration_time = 0; - } + int64_t expiration_time = tablet_meta->ttl_seconds(); if (!tablet->add_rowset_warmup_state(*rs, WarmUpTriggerSource::JOB)) { LOG(INFO) << "found duplicate warmup task for rowset " << rs->rowset_id() << ", skip it"; diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 5862e68a738471..c54385ca2fc076 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1154,7 +1154,9 @@ DEFINE_mInt64(file_cache_background_block_lru_update_interval_ms, "5000"); DEFINE_mInt64(file_cache_background_block_lru_update_qps_limit, "1000"); DEFINE_mBool(enable_reader_dryrun_when_download_file_cache, "true"); DEFINE_mInt64(file_cache_background_monitor_interval_ms, "5000"); -DEFINE_mInt64(file_cache_background_ttl_gc_interval_ms, "3000"); +DEFINE_mInt64(file_cache_background_ttl_gc_interval_ms, "180000"); +DEFINE_mInt64(file_cache_background_ttl_info_update_interval_ms, "180000"); +DEFINE_mInt64(file_cache_background_tablet_id_flush_interval_ms, "1000"); DEFINE_mInt64(file_cache_background_ttl_gc_batch, "1000"); DEFINE_mInt64(file_cache_background_lru_dump_interval_ms, "60000"); // dump queue only if the queue update specific times through several dump intervals diff --git a/be/src/common/config.h b/be/src/common/config.h index 446b7f10f55a19..7368db4cc8d981 100644 --- a/be/src/common/config.h +++ 
b/be/src/common/config.h @@ -1190,6 +1190,8 @@ DECLARE_mInt64(file_cache_background_block_lru_update_qps_limit); DECLARE_mBool(enable_reader_dryrun_when_download_file_cache); DECLARE_mInt64(file_cache_background_monitor_interval_ms); DECLARE_mInt64(file_cache_background_ttl_gc_interval_ms); +DECLARE_mInt64(file_cache_background_ttl_info_update_interval_ms); +DECLARE_mInt64(file_cache_background_tablet_id_flush_interval_ms); DECLARE_mInt64(file_cache_background_ttl_gc_batch); DECLARE_Int32(file_cache_downloader_thread_num_min); DECLARE_Int32(file_cache_downloader_thread_num_max); diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index c7bbb9067f8752..4a9364b7290ff2 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -43,6 +43,7 @@ #include "common/config.h" #include "common/logging.h" #include "cpp/sync_point.h" +#include "io/cache/block_file_cache_ttl_mgr.h" #include "io/cache/file_block.h" #include "io/cache/file_cache_common.h" #include "io/cache/fs_file_cache_storage.h" @@ -461,8 +462,14 @@ Status BlockFileCache::initialize_unlocked(std::lock_guard& cache_lo restore_lru_queues_from_disk(cache_lock); } RETURN_IF_ERROR(_storage->init(this)); + + if (auto* fs_storage = dynamic_cast(_storage.get())) { + if (auto* meta_store = fs_storage->get_meta_store()) { + _ttl_mgr = std::make_unique(this, meta_store); + } + } + _cache_background_monitor_thread = std::thread(&BlockFileCache::run_background_monitor, this); - _cache_background_ttl_gc_thread = std::thread(&BlockFileCache::run_background_ttl_gc, this); _cache_background_gc_thread = std::thread(&BlockFileCache::run_background_gc, this); _cache_background_evict_in_advance_thread = std::thread(&BlockFileCache::run_background_evict_in_advance, this); @@ -566,88 +573,6 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte _files.erase(hash); return {}; } - // change to ttl if the blocks aren't ttl - if 
(context.cache_type == FileCacheType::TTL && _key_to_time.find(hash) == _key_to_time.end()) { - for (auto& [_, cell] : file_blocks) { - Status st = cell.file_block->update_expiration_time(context.expiration_time); - if (!st.ok()) { - LOG_WARNING("Failed to change key meta").error(st); - } - - FileCacheType origin_type = cell.file_block->cache_type(); - if (origin_type == FileCacheType::TTL) continue; - st = cell.file_block->change_cache_type_lock(FileCacheType::TTL, cache_lock); - if (st.ok()) { - auto& queue = get_queue(origin_type); - queue.remove(cell.queue_iterator.value(), cache_lock); - _lru_recorder->record_queue_event(origin_type, CacheLRULogType::REMOVE, - cell.file_block->get_hash_value(), - cell.file_block->offset(), cell.size()); - auto& ttl_queue = get_queue(FileCacheType::TTL); - cell.queue_iterator = - ttl_queue.add(cell.file_block->get_hash_value(), cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - _lru_recorder->record_queue_event(FileCacheType::TTL, CacheLRULogType::ADD, - cell.file_block->get_hash_value(), - cell.file_block->offset(), cell.size()); - } else { - LOG_WARNING("Failed to change key meta").error(st); - } - } - _key_to_time[hash] = context.expiration_time; - _time_to_key.insert(std::make_pair(context.expiration_time, hash)); - } - if (auto iter = _key_to_time.find(hash); - // TODO(zhengyu): Why the hell the type is NORMAL while context set expiration_time? 
- (context.cache_type == FileCacheType::NORMAL || context.cache_type == FileCacheType::TTL) && - iter != _key_to_time.end() && iter->second != context.expiration_time) { - // remove from _time_to_key - auto _time_to_key_iter = _time_to_key.equal_range(iter->second); - while (_time_to_key_iter.first != _time_to_key_iter.second) { - if (_time_to_key_iter.first->second == hash) { - _time_to_key_iter.first = _time_to_key.erase(_time_to_key_iter.first); - break; - } - _time_to_key_iter.first++; - } - for (auto& [_, cell] : file_blocks) { - Status st = cell.file_block->update_expiration_time(context.expiration_time); - if (!st.ok()) { - LOG_WARNING("Failed to change key meta").error(st); - } - } - if (context.expiration_time == 0) { - for (auto& [_, cell] : file_blocks) { - auto cache_type = cell.file_block->cache_type(); - if (cache_type != FileCacheType::TTL) continue; - auto st = - cell.file_block->change_cache_type_lock(FileCacheType::NORMAL, cache_lock); - if (st.ok()) { - if (cell.queue_iterator) { - auto& ttl_queue = get_queue(FileCacheType::TTL); - ttl_queue.remove(cell.queue_iterator.value(), cache_lock); - _lru_recorder->record_queue_event(FileCacheType::TTL, - CacheLRULogType::REMOVE, - cell.file_block->get_hash_value(), - cell.file_block->offset(), cell.size()); - } - auto& queue = get_queue(FileCacheType::NORMAL); - cell.queue_iterator = - queue.add(cell.file_block->get_hash_value(), cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - _lru_recorder->record_queue_event(FileCacheType::NORMAL, CacheLRULogType::ADD, - cell.file_block->get_hash_value(), - cell.file_block->offset(), cell.size()); - } else { - LOG_WARNING("Failed to change key meta").error(st); - } - } - _key_to_time.erase(iter); - } else { - _time_to_key.insert(std::make_pair(context.expiration_time, hash)); - iter->second = context.expiration_time; - } - } FileBlocks result; auto block_it = file_blocks.lower_bound(range.left); @@ -798,6 +723,9 @@ FileBlocks 
BlockFileCache::split_range_into_cells(const UInt128Wrapper& hash, cell->update_atime(); } } + if (_ttl_mgr && context.tablet_id != 0) { + _ttl_mgr->register_tablet_id(context.tablet_id); + } } current_pos += current_size; @@ -886,12 +814,6 @@ FileBlocksHolder BlockFileCache::get_or_set(const UInt128Wrapper& hash, size_t o int64_t duration = 0; { SCOPED_RAW_TIMER(&duration); - if (auto iter = _key_to_time.find(hash); - context.cache_type == FileCacheType::INDEX && iter != _key_to_time.end()) { - context.cache_type = FileCacheType::TTL; - context.expiration_time = iter->second; - } - /// Get all blocks which intersect with the given range. { SCOPED_RAW_TIMER(&stats->get_timer); @@ -970,10 +892,6 @@ FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& hash, const CacheC cell.size()); if (cell.file_block->cache_type() == FileCacheType::TTL) { - if (_key_to_time.find(hash) == _key_to_time.end()) { - _key_to_time[hash] = context.expiration_time; - _time_to_key.insert(std::make_pair(context.expiration_time, hash)); - } _cur_ttl_size += cell.size(); } auto [it, _] = offsets.insert(std::make_pair(offset, std::move(cell))); @@ -1049,33 +967,6 @@ void BlockFileCache::remove_file_blocks(std::vector& to_evict, std::for_each(to_evict.begin(), to_evict.end(), remove_file_block_if); } -void BlockFileCache::remove_file_blocks_and_clean_time_maps( - std::vector& to_evict, std::lock_guard& cache_lock) { - auto remove_file_block_and_clean_time_maps_if = [&](FileBlockCell* cell) { - FileBlockSPtr file_block = cell->file_block; - if (file_block) { - std::lock_guard block_lock(file_block->_mutex); - auto hash = cell->file_block->get_hash_value(); - remove(file_block, cache_lock, block_lock); - if (_files.find(hash) == _files.end()) { - if (auto iter = _key_to_time.find(hash); - _key_to_time.find(hash) != _key_to_time.end()) { - auto _time_to_key_iter = _time_to_key.equal_range(iter->second); - while (_time_to_key_iter.first != _time_to_key_iter.second) { - if 
(_time_to_key_iter.first->second == hash) { - _time_to_key_iter.first = _time_to_key.erase(_time_to_key_iter.first); - break; - } - _time_to_key_iter.first++; - } - _key_to_time.erase(hash); - } - } - } - }; - std::for_each(to_evict.begin(), to_evict.end(), remove_file_block_and_clean_time_maps_if); -} - void BlockFileCache::find_evict_candidates(LRUQueue& queue, size_t size, size_t cur_cache_size, size_t& removed_size, std::vector& to_evict, @@ -1228,99 +1119,22 @@ void BlockFileCache::try_evict_in_advance(size_t size, std::lock_guard& cache_lock, bool sync) { - auto& ttl_queue = get_queue(FileCacheType::TTL); - if (auto iter = _key_to_time.find(file_key); - _key_to_time.find(file_key) != _key_to_time.end()) { - if (!remove_directly) { - auto it = _files.find(file_key); - if (it != _files.end()) { - for (auto& [_, cell] : it->second) { - if (cell.file_block->cache_type() != FileCacheType::TTL) { - continue; - } - Status st = cell.file_block->update_expiration_time(0); - if (!st.ok()) { - LOG_WARNING("Failed to update expiration time to 0").error(st); - } - - if (cell.file_block->cache_type() == FileCacheType::NORMAL) continue; - st = cell.file_block->change_cache_type_lock(FileCacheType::NORMAL, cache_lock); - if (st.ok()) { - if (cell.queue_iterator) { - ttl_queue.remove(cell.queue_iterator.value(), cache_lock); - _lru_recorder->record_queue_event( - FileCacheType::TTL, CacheLRULogType::REMOVE, - cell.file_block->get_hash_value(), cell.file_block->offset(), - cell.size()); - } - auto& queue = get_queue(FileCacheType::NORMAL); - cell.queue_iterator = queue.add( - cell.file_block->get_hash_value(), cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - _lru_recorder->record_queue_event(FileCacheType::NORMAL, - CacheLRULogType::ADD, - cell.file_block->get_hash_value(), - cell.file_block->offset(), cell.size()); - } else { - LOG_WARNING("Failed to change cache type to normal").error(st); - } - } - } - } else { - std::vector to_remove; - auto it 
= _files.find(file_key); - if (it != _files.end()) { - for (auto& [_, cell] : it->second) { - if (cell.releasable()) { - to_remove.push_back(&cell); - } else { - cell.file_block->set_deleting(); - } - } - } - std::for_each(to_remove.begin(), to_remove.end(), [&](FileBlockCell* cell) { - FileBlockSPtr file_block = cell->file_block; - std::lock_guard block_lock(file_block->_mutex); - remove(file_block, cache_lock, block_lock, sync); - }); - } - // remove from _time_to_key - // the param hash maybe be passed by _time_to_key, if removed it, cannot use it anymore - auto _time_to_key_iter = _time_to_key.equal_range(iter->second); - while (_time_to_key_iter.first != _time_to_key_iter.second) { - if (_time_to_key_iter.first->second == file_key) { - _time_to_key_iter.first = _time_to_key.erase(_time_to_key_iter.first); - break; - } - _time_to_key_iter.first++; - } - _key_to_time.erase(iter); - return true; - } - return false; -} - // remove specific cache synchronously, for critical operations // if in use, cache meta will be deleted after use and the block file is then deleted asynchronously void BlockFileCache::remove_if_cached(const UInt128Wrapper& file_key) { SCOPED_CACHE_LOCK(_mutex, this); - bool is_ttl_file = remove_if_ttl_file_blocks(file_key, true, cache_lock, true); - if (!is_ttl_file) { - auto iter = _files.find(file_key); - std::vector to_remove; - if (iter != _files.end()) { - for (auto& [_, cell] : iter->second) { - if (cell.releasable()) { - to_remove.push_back(&cell); - } else { - cell.file_block->set_deleting(); - } + auto iter = _files.find(file_key); + std::vector to_remove; + if (iter != _files.end()) { + for (auto& [_, cell] : iter->second) { + if (cell.releasable()) { + to_remove.push_back(&cell); + } else { + cell.file_block->set_deleting(); } } - remove_file_blocks(to_remove, cache_lock, true); } + remove_file_blocks(to_remove, cache_lock, true); } // the async version of remove_if_cached, for background operations @@ -1328,23 +1142,21 @@ void 
BlockFileCache::remove_if_cached(const UInt128Wrapper& file_key) { // if in use, cache meta will be deleted after use and the block file is then deleted asynchronously void BlockFileCache::remove_if_cached_async(const UInt128Wrapper& file_key) { SCOPED_CACHE_LOCK(_mutex, this); - bool is_ttl_file = remove_if_ttl_file_blocks(file_key, true, cache_lock, /*sync*/ false); - if (!is_ttl_file) { - auto iter = _files.find(file_key); - std::vector to_remove; - if (iter != _files.end()) { - for (auto& [_, cell] : iter->second) { - *_gc_evict_bytes_metrics << cell.size(); - *_gc_evict_count_metrics << 1; - if (cell.releasable()) { - to_remove.push_back(&cell); - } else { - cell.file_block->set_deleting(); - } + + auto iter = _files.find(file_key); + std::vector to_remove; + if (iter != _files.end()) { + for (auto& [_, cell] : iter->second) { + *_gc_evict_bytes_metrics << cell.size(); + *_gc_evict_count_metrics << 1; + if (cell.releasable()) { + to_remove.push_back(&cell); + } else { + cell.file_block->set_deleting(); } } - remove_file_blocks(to_remove, cache_lock, false); } + remove_file_blocks(to_remove, cache_lock, false); } std::vector BlockFileCache::get_other_cache_type_without_ttl( @@ -2095,38 +1907,6 @@ void BlockFileCache::run_background_monitor() { } } -void BlockFileCache::run_background_ttl_gc() { - Thread::set_self_name("run_background_ttl_gc"); - while (!_close) { - int64_t interval_ms = config::file_cache_background_ttl_gc_interval_ms; - int64_t batch_size = config::file_cache_background_ttl_gc_batch; - TEST_SYNC_POINT_CALLBACK("BlockFileCache::set_sleep_time", &interval_ms); - { - std::unique_lock close_lock(_close_mtx); - _close_cv.wait_for(close_lock, std::chrono::milliseconds(interval_ms)); - if (_close) { - break; - } - } - int64_t duration_ns = 0; - { - int64_t cur_time = UnixSeconds(); - int64_t count = 0; - SCOPED_CACHE_LOCK(_mutex, this); - SCOPED_RAW_TIMER(&duration_ns); - while (!_time_to_key.empty()) { - auto begin = _time_to_key.begin(); - if 
(cur_time < begin->first || count > batch_size) { - break; - } - remove_if_ttl_file_blocks(begin->second, false, cache_lock, false); - ++count; - } - } - *_ttl_gc_latency_us << (duration_ns / 1000); - } -} - void BlockFileCache::run_background_gc() { Thread::set_self_name("run_background_gc"); FileCacheKey key; @@ -2231,72 +2011,6 @@ void BlockFileCache::run_background_block_lru_update() { } } -void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, - uint64_t new_expiration_time) { - SCOPED_CACHE_LOCK(_mutex, this); - // 1. If new_expiration_time is equal to zero - if (new_expiration_time == 0) { - remove_if_ttl_file_blocks(hash, false, cache_lock, false); - return; - } - // 2. If the hash in ttl cache, modify its expiration time. - if (auto iter = _key_to_time.find(hash); iter != _key_to_time.end()) { - // remove from _time_to_key - auto _time_to_key_iter = _time_to_key.equal_range(iter->second); - while (_time_to_key_iter.first != _time_to_key_iter.second) { - if (_time_to_key_iter.first->second == hash) { - _time_to_key_iter.first = _time_to_key.erase(_time_to_key_iter.first); - break; - } - _time_to_key_iter.first++; - } - _time_to_key.insert(std::make_pair(new_expiration_time, hash)); - iter->second = new_expiration_time; - auto it = _files.find(hash); - if (it != _files.end()) { - for (auto& [_, cell] : it->second) { - Status st = cell.file_block->update_expiration_time(new_expiration_time); - if (!st.ok()) { - LOG_WARNING("Failed to modify expiration time").error(st); - } - } - } - - return; - } - // 3. 
change to ttl if the blocks aren't ttl - if (auto iter = _files.find(hash); iter != _files.end()) { - for (auto& [_, cell] : iter->second) { - Status st = cell.file_block->update_expiration_time(new_expiration_time); - if (!st.ok()) { - LOG_WARNING("").error(st); - } - - FileCacheType origin_type = cell.file_block->cache_type(); - if (origin_type == FileCacheType::TTL) continue; - st = cell.file_block->change_cache_type_lock(FileCacheType::TTL, cache_lock); - if (st.ok()) { - auto& queue = get_queue(origin_type); - queue.remove(cell.queue_iterator.value(), cache_lock); - _lru_recorder->record_queue_event(origin_type, CacheLRULogType::REMOVE, - cell.file_block->get_hash_value(), - cell.file_block->offset(), cell.size()); - auto& ttl_queue = get_queue(FileCacheType::TTL); - cell.queue_iterator = ttl_queue.add(hash, cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - _lru_recorder->record_queue_event(FileCacheType::TTL, CacheLRULogType::ADD, - cell.file_block->get_hash_value(), - cell.file_block->offset(), cell.size()); - } - if (!st.ok()) { - LOG_WARNING("").error(st); - } - } - _key_to_time[hash] = new_expiration_time; - _time_to_key.insert(std::make_pair(new_expiration_time, hash)); - } -} - std::vector> BlockFileCache::get_hot_blocks_meta(const UInt128Wrapper& hash) const { int64_t cur_time = std::chrono::duration_cast( @@ -2444,6 +2158,21 @@ std::map BlockFileCache::get_blocks_by_key(const UInt128W return offset_to_block; } +void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, uint64_t expiration_time) { + SCOPED_CACHE_LOCK(_mutex, this); + if (auto iter = _files.find(hash); iter != _files.end()) { + for (auto& [_, cell] : iter->second) { + if (cell.file_block) { + auto st = cell.file_block->update_expiration_time(expiration_time); + if (!st.ok()) { + LOG(WARNING) << "Failed to update expiration time for block " + << cell.file_block->get_info_for_log() << ", error=" << st; + } + } + } + } +} + void 
BlockFileCache::update_ttl_atime(const UInt128Wrapper& hash) { SCOPED_CACHE_LOCK(_mutex, this); if (auto iter = _files.find(hash); iter != _files.end()) { diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index 83bc7a831a3226..6ce37a4794ce4c 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -32,6 +32,7 @@ #include #include +#include "io/cache/block_file_cache_ttl_mgr.h" #include "io/cache/cache_lru_dumper.h" #include "io/cache/file_block.h" #include "io/cache/file_cache_common.h" @@ -182,9 +183,6 @@ class BlockFileCache { if (_cache_background_monitor_thread.joinable()) { _cache_background_monitor_thread.join(); } - if (_cache_background_ttl_gc_thread.joinable()) { - _cache_background_ttl_gc_thread.join(); - } if (_cache_background_gc_thread.joinable()) { _cache_background_gc_thread.join(); } @@ -252,6 +250,8 @@ class BlockFileCache { std::string reset_capacity(size_t new_capacity); std::map get_blocks_by_key(const UInt128Wrapper& hash); + /// Adjust expiration time for every block sharing the specified hash key. + void modify_expiration_time(const UInt128Wrapper& hash, uint64_t expiration_time); /// For debug and UT std::string dump_structure(const UInt128Wrapper& hash); std::string dump_single_cache_type(const UInt128Wrapper& hash, size_t offset); @@ -270,9 +270,6 @@ class BlockFileCache { void remove_if_cached(const UInt128Wrapper& key); void remove_if_cached_async(const UInt128Wrapper& key); - // modify the expiration time about the key - void modify_expiration_time(const UInt128Wrapper& key, uint64_t new_expiration_time); - // Shrink the block size. old_size is always larged than new_size. 
void reset_range(const UInt128Wrapper&, size_t offset, size_t old_size, size_t new_size, std::lock_guard& cache_lock); @@ -453,11 +450,7 @@ class BlockFileCache { bool need_to_move(FileCacheType cell_type, FileCacheType query_type) const; - bool remove_if_ttl_file_blocks(const UInt128Wrapper& file_key, bool remove_directly, - std::lock_guard&, bool sync); - void run_background_monitor(); - void run_background_ttl_gc(); void run_background_gc(); void run_background_lru_log_replay(); void run_background_lru_dump(); @@ -481,9 +474,6 @@ class BlockFileCache { void remove_file_blocks(std::vector&, std::lock_guard&, bool sync); - void remove_file_blocks_and_clean_time_maps(std::vector&, - std::lock_guard&); - void find_evict_candidates(LRUQueue& queue, size_t size, size_t cur_cache_size, size_t& removed_size, std::vector& to_evict, std::lock_guard& cache_lock, size_t& cur_removed_size, @@ -513,7 +503,6 @@ class BlockFileCache { std::mutex _close_mtx; std::condition_variable _close_cv; std::thread _cache_background_monitor_thread; - std::thread _cache_background_ttl_gc_thread; std::thread _cache_background_gc_thread; std::thread _cache_background_evict_in_advance_thread; std::thread _cache_background_lru_dump_thread; @@ -550,6 +539,7 @@ class BlockFileCache { std::unique_ptr _lru_recorder; std::unique_ptr _lru_dumper; + std::unique_ptr _ttl_mgr; // metrics std::shared_ptr> _cache_capacity_metrics; diff --git a/be/src/io/cache/block_file_cache_ttl_mgr.cpp b/be/src/io/cache/block_file_cache_ttl_mgr.cpp new file mode 100644 index 00000000000000..082ac9623ffd18 --- /dev/null +++ b/be/src/io/cache/block_file_cache_ttl_mgr.cpp @@ -0,0 +1,261 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "io/cache/block_file_cache_ttl_mgr.h" + +#include +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "common/logging.h" +#include "io/cache/block_file_cache.h" +#include "io/cache/cache_block_meta_store.h" +#include "io/cache/file_block.h" +#include "olap/base_tablet.h" +#include "runtime/exec_env.h" +#include "util/time.h" + +namespace doris::io { + +BlockFileCacheTtlMgr::BlockFileCacheTtlMgr(BlockFileCache* mgr, CacheBlockMetaStore* meta_store) + : _mgr(mgr), _meta_store(meta_store), _stop_background(false) { + // Start background threads + _update_ttl_thread = + std::thread(&BlockFileCacheTtlMgr::run_backgroud_update_ttl_info_map, this); + _expiration_check_thread = + std::thread(&BlockFileCacheTtlMgr::run_backgroud_expiration_check, this); + _tablet_id_flush_thread = + std::thread(&BlockFileCacheTtlMgr::run_background_tablet_id_flush, this); +} + +BlockFileCacheTtlMgr::~BlockFileCacheTtlMgr() { + _stop_background.store(true, std::memory_order_release); + + if (_update_ttl_thread.joinable()) { + _update_ttl_thread.join(); + } + + if (_expiration_check_thread.joinable()) { + _expiration_check_thread.join(); + } + + if (_tablet_id_flush_thread.joinable()) { + _tablet_id_flush_thread.join(); + } +} + +void BlockFileCacheTtlMgr::register_tablet_id(int64_t tablet_id) { + _tablet_id_queue.enqueue(tablet_id); +} + +void 
BlockFileCacheTtlMgr::run_background_tablet_id_flush() { + Thread::set_self_name("ttl_mgr_flush"); + + static constexpr size_t kBatchSize = 1024; + std::vector pending; + pending.reserve(kBatchSize); + + auto flush_pending = [this](std::vector* items) { + if (items->empty()) { + return; + } + std::lock_guard lock(_tablet_id_mutex); + _tablet_id_set.insert(items->begin(), items->end()); + items->clear(); + }; + + auto drain_queue = [this, &pending, &flush_pending](bool* drained_flag) { + int64_t tablet_id = 0; + while (_tablet_id_queue.try_dequeue(tablet_id)) { + if (drained_flag != nullptr) { + *drained_flag = true; + } + pending.push_back(tablet_id); + if (pending.size() >= kBatchSize) { + flush_pending(&pending); + } + } + }; + + while (!_stop_background.load(std::memory_order_acquire)) { + bool drained = false; + drain_queue(&drained); + flush_pending(&pending); + + if (!drained) { + std::this_thread::sleep_for(std::chrono::milliseconds( + config::file_cache_background_tablet_id_flush_interval_ms)); + } + } + + // Drain remaining items before exit + drain_queue(nullptr); + flush_pending(&pending); +} + +FileBlocks BlockFileCacheTtlMgr::get_file_blocks_from_tablet_id(int64_t tablet_id) { + FileBlocks result; + + // Use meta store to get all blocks for this tablet + auto iterator = _meta_store->range_get(tablet_id); + if (!iterator) { + LOG(WARNING) << "Failed to get iterator for tablet_id: " << tablet_id; + return result; + } + + while (iterator->valid()) { + BlockMetaKey key = iterator->key(); + + // Get all blocks for this hash using get_blocks_by_key + try { + auto blocks_map = _mgr->get_blocks_by_key(key.hash); + for (const auto& [offset, block] : blocks_map) { + // Only add blocks that match our specific offset + if (offset == key.offset) { + result.push_back(block); + break; + } + } + } catch (const std::exception& e) { + LOG(WARNING) << "Failed to get file blocks for tablet_id: " << tablet_id + << ", hash: " << key.hash.to_string() << ", error: " << 
e.what(); + } + + iterator->next(); + } + + return result; +} + +void BlockFileCacheTtlMgr::run_backgroud_update_ttl_info_map() { + Thread::set_self_name("ttl_mgr_update"); + + while (!_stop_background.load(std::memory_order_acquire)) { + try { + std::unordered_set tablet_ids_to_process; + { + std::lock_guard lock(_tablet_id_mutex); + tablet_ids_to_process = _tablet_id_set; + } + + for (int64_t tablet_id : tablet_ids_to_process) { + if (_stop_background.load(std::memory_order_acquire)) { + break; + } + uint64_t tablet_ctime = 0; + uint64_t ttl = 0; + + TabletMetaSharedPtr tablet_meta; + auto meta_status = ExecEnv::get_tablet_meta(tablet_id, &tablet_meta, false); + if (!meta_status.ok()) { + LOG(WARNING) << "Failed to get tablet meta for tablet_id: " << tablet_id + << ", err: " << meta_status; + continue; + } + + if (tablet_meta != nullptr) { + tablet_ctime = tablet_meta->creation_time(); + int64_t ttl_seconds = tablet_meta->ttl_seconds(); + if (ttl_seconds > 0 && tablet_ctime > 0) { + ttl = static_cast(ttl_seconds); + } + } + + // Update TTL info map + { + std::lock_guard lock(_ttl_info_mutex); + if (ttl > 0) { + auto old_info_it = _ttl_info_map.find(tablet_id); + bool was_zero_ttl = (old_info_it == _ttl_info_map.end() || + old_info_it->second.ttl == 0); + _ttl_info_map[tablet_id] = TtlInfo {ttl, tablet_ctime}; + + // If TTL changed from 0 to non-zero, convert blocks to TTL type + if (was_zero_ttl) { + FileBlocks blocks = get_file_blocks_from_tablet_id(tablet_id); + for (auto& block : blocks) { + if (block->cache_type() != FileCacheType::TTL) { + auto change_status = + block->change_cache_type(FileCacheType::TTL); + if (!change_status.ok()) { + LOG(WARNING) << "Failed to convert block to TTL cache_type"; + } + } + } + } + } else { + // Remove from TTL map if TTL is 0 + _ttl_info_map.erase(tablet_id); + } + } + } + + std::this_thread::sleep_for(std::chrono::milliseconds( + config::file_cache_background_ttl_info_update_interval_ms)); + + } catch (const 
std::exception& e) { + LOG(WARNING) << "Exception in TTL update thread: " << e.what(); + std::this_thread::sleep_for(std::chrono::seconds(10)); + } + } +} + +void BlockFileCacheTtlMgr::run_backgroud_expiration_check() { + Thread::set_self_name("ttl_mgr_expire"); + + while (!_stop_background.load(std::memory_order_acquire)) { + try { + std::map ttl_info_copy; + + // Copy TTL info for processing + { + std::lock_guard lock(_ttl_info_mutex); + ttl_info_copy = _ttl_info_map; + } + + uint64_t current_time = UnixSeconds(); + + for (const auto& [tablet_id, ttl_info] : ttl_info_copy) { + if (ttl_info.tablet_ctime + ttl_info.ttl < current_time) { + // Tablet has expired, convert TTL blocks back to NORMAL type + FileBlocks blocks = get_file_blocks_from_tablet_id(tablet_id); + for (auto& block : blocks) { + if (block->cache_type() == FileCacheType::TTL) { + auto st = block->change_cache_type(FileCacheType::NORMAL); + if (!st.ok()) { + LOG(WARNING) << "Failed to convert block back to NORMAL cache_type"; + } + } + } + } + } + + std::this_thread::sleep_for( + std::chrono::milliseconds(config::file_cache_background_ttl_gc_interval_ms)); + + } catch (const std::exception& e) { + LOG(WARNING) << "Exception in TTL expiration check thread: " << e.what(); + std::this_thread::sleep_for(std::chrono::seconds(10)); + } + } +} + +} // namespace doris::io \ No newline at end of file diff --git a/be/src/io/cache/block_file_cache_ttl_mgr.h b/be/src/io/cache/block_file_cache_ttl_mgr.h new file mode 100644 index 00000000000000..c08438d5d6d8d2 --- /dev/null +++ b/be/src/io/cache/block_file_cache_ttl_mgr.h @@ -0,0 +1,78 @@ + + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "io/cache/file_block.h" +#include "io/cache/file_cache_common.h" + +namespace doris::io { + +class BlockFileCache; +class CacheBlockMetaStore; + +struct TtlInfo { + uint64_t ttl; + uint64_t tablet_ctime; +}; + +class BlockFileCacheTtlMgr { +public: + BlockFileCacheTtlMgr(BlockFileCache* mgr, CacheBlockMetaStore* meta_store); + ~BlockFileCacheTtlMgr(); + + void register_tablet_id(int64_t tablet_id); + + // Background thread to update ttl_info_map + void run_backgroud_update_ttl_info_map(); + // Background thread to find expired tablet and evict from ttl queue + void run_backgroud_expiration_check(); + // Background thread to drain the concurrent tablet-id queue into the dedup set + void run_background_tablet_id_flush(); + +private: + FileBlocks get_file_blocks_from_tablet_id(int64_t tablet_id); + +private: + // Tablet ids waiting to be deduplicated + set of unique ids known to have cached data + moodycamel::ConcurrentQueue _tablet_id_queue; + std::unordered_set _tablet_id_set; // TODO(zhengyu): clean up old tablet ids + std::mutex _tablet_id_mutex; + + std::map _ttl_info_map; + BlockFileCache* _mgr; + CacheBlockMetaStore* _meta_store; + + std::atomic _stop_background; + std::thread _update_ttl_thread; + std::thread _expiration_check_thread; + std::thread _tablet_id_flush_thread; + + std::mutex 
_ttl_info_mutex; +}; + +} // namespace doris::io \ No newline at end of file diff --git a/be/src/io/cache/cache_block_meta_store.cpp b/be/src/io/cache/cache_block_meta_store.cpp index 433bee7a275152..472886152c71da 100644 --- a/be/src/io/cache/cache_block_meta_store.cpp +++ b/be/src/io/cache/cache_block_meta_store.cpp @@ -74,6 +74,10 @@ size_t CacheBlockMetaStore::get_write_queue_size() const { } Status CacheBlockMetaStore::init() { + if (_initialized.load(std::memory_order_acquire)) { + return Status::OK(); + } + std::filesystem::create_directories(_db_path); _options.create_if_missing = true; @@ -119,6 +123,7 @@ Status CacheBlockMetaStore::init() { } _write_thread = std::thread(&CacheBlockMetaStore::async_write_worker, this); + _initialized.store(true, std::memory_order_release); return Status::OK(); } diff --git a/be/src/io/cache/cache_block_meta_store.h b/be/src/io/cache/cache_block_meta_store.h index 9412e58f4fee7f..bd3c6501c145a3 100644 --- a/be/src/io/cache/cache_block_meta_store.h +++ b/be/src/io/cache/cache_block_meta_store.h @@ -118,6 +118,7 @@ class CacheBlockMetaStore { std::unique_ptr _db; rocksdb::Options _options; std::unique_ptr _file_cache_meta_cf_handle; + std::atomic _initialized {false}; enum class OperationType { PUT, DELETE }; struct WriteOperation { diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index d095e859badbcd..dc8fd795dde382 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -43,6 +43,7 @@ #include "io/cache/block_file_cache_profile.h" #include "io/cache/file_block.h" #include "io/cache/file_cache_common.h" +#include "io/cache/peer_file_cache_reader.h" #include "io/fs/file_reader.h" #include "io/fs/local_file_system.h" #include "io/io_common.h" diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index 5e58052d7ce12f..caf20823da147c 100644 --- 
a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -137,6 +137,7 @@ FileBlocksHolderPtr FileCacheAllocatorBuilder::allocate_cache_holder(size_t offs ctx.cache_type = _expiration_time == 0 ? FileCacheType::NORMAL : FileCacheType::TTL; ctx.expiration_time = _expiration_time; ctx.is_cold_data = _is_cold_data; + ctx.tablet_id = tablet_id; ReadStatistics stats; ctx.stats = &stats; auto holder = _cache->get_or_set(_cache_hash, offset, size, ctx); diff --git a/be/src/io/cache/fs_file_cache_storage.cpp b/be/src/io/cache/fs_file_cache_storage.cpp index 0556be80c15257..b2055b3640e7f9 100644 --- a/be/src/io/cache/fs_file_cache_storage.cpp +++ b/be/src/io/cache/fs_file_cache_storage.cpp @@ -892,7 +892,7 @@ void FSFileCacheStorage::load_cache_info_into_memory(BlockFileCache* _mgr) const << ", Estimated FS files: " << estimated_file_count; // If the difference is more than threshold, load from filesystem as well - if (estimated_file_count > 0) { + if (estimated_file_count > 100) { double difference_ratio = static_cast(estimated_file_count) - static_cast(db_block_count) / static_cast(estimated_file_count); @@ -912,6 +912,12 @@ void FSFileCacheStorage::load_cache_info_into_memory(BlockFileCache* _mgr) const } // TODO(zhengyu): use anti-leak machinism to remove v2 format directory } + } else { + LOG(INFO) << "FS contains low number of files, num = " << estimated_file_count + << ", skipping FS load."; + if (st = write_file_cache_version(); !st.ok()) { + LOG(WARNING) << "Failed to write version hints for file cache, err=" << st.to_string(); + } } } diff --git a/be/src/io/fs/file_writer.h b/be/src/io/fs/file_writer.h index de298a7a0bf38d..a22fc28c26e759 100644 --- a/be/src/io/fs/file_writer.h +++ b/be/src/io/fs/file_writer.h @@ -46,7 +46,7 @@ struct FileWriterOptions { bool write_file_cache = false; bool is_cold_data = false; bool sync_file_data = true; // Whether flush data into storage system - uint64_t file_cache_expiration = 0; // Absolute time 
+ uint64_t file_cache_expiration_time = 0; // Relative time uint64_t approximate_bytes_to_write = 0; // Approximate bytes to write, used for file cache }; @@ -108,7 +108,7 @@ class FileWriter { << file_cache_ptr->approximate_available_cache_size(); if (opts->write_file_cache || has_enough_file_cache_space) { _cache_builder = std::make_unique(FileCacheAllocatorBuilder { - opts ? opts->is_cold_data : false, opts ? opts->file_cache_expiration : 0, + opts ? opts->is_cold_data : false, opts ? opts->file_cache_expiration_time : 0, path_hash, file_cache_ptr}); } return; diff --git a/be/src/io/tools/file_cache_microbench.cpp b/be/src/io/tools/file_cache_microbench.cpp index 6da84b8fe48114..50d1e376d86359 100644 --- a/be/src/io/tools/file_cache_microbench.cpp +++ b/be/src/io/tools/file_cache_microbench.cpp @@ -1068,7 +1068,7 @@ class JobManager { DataGenerator data_generator(config.size_bytes_perfile); doris::io::FileWriterOptions options; if (config.cache_type == "TTL") { - options.file_cache_expiration = config.expiration; + options.file_cache_expiration_time = config.expiration; } options.write_file_cache = config.write_file_cache; auto writer = std::make_unique( diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index 3d4f4a12554d0a..702e007ead8fcc 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -220,13 +220,7 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context _read_context->runtime_state->query_options().disable_file_cache; } - _read_options.io_ctx.expiration_time = - read_context->ttl_seconds > 0 && _rowset->rowset_meta()->newest_write_timestamp() > 0 - ? 
_rowset->rowset_meta()->newest_write_timestamp() + read_context->ttl_seconds - : 0; - if (_read_options.io_ctx.expiration_time <= UnixSeconds()) { - _read_options.io_ctx.expiration_time = 0; - } + _read_options.io_ctx.expiration_time = read_context->ttl_seconds; bool enable_segment_cache = true; auto* state = read_context->runtime_state; diff --git a/be/src/olap/rowset/rowset_writer_context.h b/be/src/olap/rowset/rowset_writer_context.h index b61f52d9632382..d3cc9d23bdb710 100644 --- a/be/src/olap/rowset/rowset_writer_context.h +++ b/be/src/olap/rowset/rowset_writer_context.h @@ -171,13 +171,10 @@ struct RowsetWriterContext { io::FileSystem& fs_ref() { return *fs(); } io::FileWriterOptions get_file_writer_options() { - io::FileWriterOptions opts { - .write_file_cache = write_file_cache, - .is_cold_data = is_hot_data, - .file_cache_expiration = file_cache_ttl_sec > 0 && newest_write_timestamp > 0 - ? newest_write_timestamp + file_cache_ttl_sec - : 0, - .approximate_bytes_to_write = approximate_bytes_to_write}; + io::FileWriterOptions opts {.write_file_cache = write_file_cache, + .is_cold_data = is_hot_data, + .file_cache_expiration_time = file_cache_ttl_sec, + .approximate_bytes_to_write = approximate_bytes_to_write}; return opts; } diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index 271742c49f889d..0c8fe3b120279b 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -1351,7 +1351,7 @@ Status StorageEngine::create_tablet(const TCreateTabletReq& request, RuntimeProf } Result StorageEngine::get_tablet(int64_t tablet_id, SyncRowsetStats* sync_stats, - bool force_use_only_cached) { + bool force_use_only_cached, bool cache_on_miss) { BaseTabletSPtr tablet; std::string err; tablet = _tablet_manager->get_tablet(tablet_id, true, &err); @@ -1362,6 +1362,21 @@ Result StorageEngine::get_tablet(int64_t tablet_id, SyncRowsetSt return tablet; } +Status StorageEngine::get_tablet_meta(int64_t tablet_id, 
TabletMetaSharedPtr* tablet_meta, + bool force_use_only_cached) { + if (tablet_meta == nullptr) { + return Status::InvalidArgument("tablet_meta output is null"); + } + + auto res = get_tablet(tablet_id, nullptr, force_use_only_cached, true); + if (!res.has_value()) { + return res.error(); + } + + *tablet_meta = res.value()->tablet_meta(); + return Status::OK(); +} + Status StorageEngine::obtain_shard_path(TStorageMedium::type storage_medium, int64_t path_hash, std::string* shard_path, DataDir** store, int64_t partition_id) { diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index b973cb4830a254..0977d66a802939 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -112,9 +112,19 @@ class BaseStorageEngine { // start all background threads. This should be call after env is ready. virtual Status start_bg_threads(std::shared_ptr wg_sptr = nullptr) = 0; + /* Parameters: + * - tablet_id: the id of tablet to get + * - sync_stats: the stats of sync rowset + * - force_use_only_cached: whether only use cached tablet meta + * - cache_on_miss: whether cache the tablet meta when missing in cache + */ virtual Result get_tablet(int64_t tablet_id, SyncRowsetStats* sync_stats = nullptr, - bool force_use_only_cached = false) = 0; + bool force_use_only_cached = false, + bool cache_on_miss = true) = 0; + + virtual Status get_tablet_meta(int64_t tablet_id, TabletMetaSharedPtr* tablet_meta, + bool force_use_only_cached = false) = 0; void register_report_listener(ReportWorker* listener); void deregister_report_listener(ReportWorker* listener); @@ -237,8 +247,18 @@ class StorageEngine final : public BaseStorageEngine { Status create_tablet(const TCreateTabletReq& request, RuntimeProfile* profile); + /* Parameters: + * - tablet_id: the id of tablet to get + * - sync_stats: the stats of sync rowset + * - force_use_only_cached: whether only use cached tablet meta + * - cache_on_miss: whether cache the tablet meta when missing in cache + */ 
Result get_tablet(int64_t tablet_id, SyncRowsetStats* sync_stats = nullptr, - bool force_use_only_cached = false) override; + bool force_use_only_cached = false, + bool cache_on_miss = true) override; + + Status get_tablet_meta(int64_t tablet_id, TabletMetaSharedPtr* tablet_meta, + bool force_use_only_cached = false) override; void clear_transaction_task(const TTransactionId transaction_id); void clear_transaction_task(const TTransactionId transaction_id, diff --git a/be/src/runtime/exec_env.cpp b/be/src/runtime/exec_env.cpp index 4bc3818b56b1bb..a1228434123c85 100644 --- a/be/src/runtime/exec_env.cpp +++ b/be/src/runtime/exec_env.cpp @@ -53,13 +53,23 @@ void ExecEnv::set_write_cooldown_meta_executors() { #endif // BE_TEST Result ExecEnv::get_tablet(int64_t tablet_id, SyncRowsetStats* sync_stats, - bool force_use_only_cached) { + bool force_use_only_cached, bool cache_on_miss) { auto storage_engine = GetInstance()->_storage_engine.get(); return storage_engine != nullptr - ? storage_engine->get_tablet(tablet_id, sync_stats, force_use_only_cached) + ? 
storage_engine->get_tablet(tablet_id, sync_stats, force_use_only_cached, + cache_on_miss) : ResultError(Status::InternalError("failed to get tablet {}", tablet_id)); } +Status ExecEnv::get_tablet_meta(int64_t tablet_id, TabletMetaSharedPtr* tablet_meta, + bool force_use_only_cached) { + auto storage_engine = GetInstance()->_storage_engine.get(); + if (storage_engine == nullptr) { + return Status::InternalError("storage engine is not initialized"); + } + return storage_engine->get_tablet_meta(tablet_id, tablet_meta, force_use_only_cached); +} + const std::string& ExecEnv::token() const { return _cluster_info->token; } diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index 9200f800c0b2aa..d9f8838c89f410 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -169,9 +169,20 @@ class ExecEnv { } // Requires ExenEnv ready + /* + * Parameters: + * - tablet_id: the id of tablet to get + * - sync_stats: the stats of sync rowset + * - force_use_only_cached: whether only use cached data + * - cache_on_miss: whether cache the tablet meta when missing in cache + */ static Result get_tablet(int64_t tablet_id, SyncRowsetStats* sync_stats = nullptr, - bool force_use_only_cached = false); + bool force_use_only_cached = false, + bool cache_on_miss = true); + + static Status get_tablet_meta(int64_t tablet_id, TabletMetaSharedPtr* tablet_meta, + bool force_use_only_cached = false); static bool ready() { return _s_ready.load(std::memory_order_acquire); } static bool tracking_memory() { return _s_tracking_memory.load(std::memory_order_acquire); } diff --git a/be/test/io/cache/block_file_cache_test.cpp b/be/test/io/cache/block_file_cache_test.cpp index 568539bd5197a2..c3a92445c72780 100644 --- a/be/test/io/cache/block_file_cache_test.cpp +++ b/be/test/io/cache/block_file_cache_test.cpp @@ -1043,8 +1043,7 @@ TEST_F(BlockFileCacheTest, max_ttl_size) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - 
int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 120; + context.expiration_time = 120; auto key1 = io::BlockFileCache::hash("key5"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -1128,8 +1127,7 @@ TEST_F(BlockFileCacheTest, max_ttl_size_with_other_cache_exist) { // then get started with TTL context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 120; + context.expiration_time = 120; offset = 0; for (; offset < 100000000; offset += 100000) { auto holder = cache.get_or_set(key1, offset, 100000, context); @@ -1169,8 +1167,7 @@ TEST_F(BlockFileCacheTest, max_ttl_size_memory_storage) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 120; + context.expiration_time = 120; auto key1 = io::BlockFileCache::hash("key5"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -2103,9 +2100,8 @@ TEST_F(BlockFileCacheTest, ttl_normal) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 120; - int64_t modify_time = cur_time + 5; + context.expiration_time = 120; + int64_t modify_time = 5; auto key1 = io::BlockFileCache::hash("key5"); auto key2 = io::BlockFileCache::hash("key6"); io::BlockFileCache cache(cache_base_path, settings); @@ -2202,9 +2198,8 @@ TEST_F(BlockFileCacheTest, ttl_modify) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 120; - int64_t modify_time = cur_time + 5; + context.expiration_time = 120; + int64_t modify_time = 5; auto key1 = io::BlockFileCache::hash("key5"); auto key2 = 
io::BlockFileCache::hash("key6"); io::BlockFileCache cache(cache_base_path, settings); @@ -2284,9 +2279,8 @@ TEST_F(BlockFileCacheTest, ttl_modify_memory_storage) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 120; - int64_t modify_time = cur_time + 5; + context.expiration_time = 120; + int64_t modify_time = 5; auto key1 = io::BlockFileCache::hash("key5"); auto key2 = io::BlockFileCache::hash("key6"); io::BlockFileCache cache(cache_base_path, settings); @@ -2362,8 +2356,7 @@ TEST_F(BlockFileCacheTest, ttl_change_to_normal) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 180; + context.expiration_time = 180; auto key2 = io::BlockFileCache::hash("key2"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -2424,8 +2417,7 @@ TEST_F(BlockFileCacheTest, ttl_change_to_normal_memory_storage) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 180; + context.expiration_time = 180; auto key2 = io::BlockFileCache::hash("key2"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -2482,9 +2474,8 @@ TEST_F(BlockFileCacheTest, ttl_change_expiration_time) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 180; - int64_t change_time = cur_time + 120; + context.expiration_time = 180; + int64_t change_time = 120; auto key2 = io::BlockFileCache::hash("key2"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -2554,9 +2545,8 @@ TEST_F(BlockFileCacheTest, 
ttl_change_expiration_time_memory_storage) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 180; - int64_t change_time = cur_time + 120; + context.expiration_time = 180; + int64_t change_time = 120; auto key2 = io::BlockFileCache::hash("key2"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -2794,7 +2784,7 @@ TEST_F(BlockFileCacheTest, remove_directly_when_normal_change_to_ttl) { } context.cache_type = io::FileCacheType::TTL; - context.expiration_time = UnixSeconds() + 3600; + context.expiration_time = 3600; { auto holder = cache.get_or_set(key1, 0, 5, context); auto blocks = fromHolder(holder); @@ -2847,8 +2837,7 @@ TEST_F(BlockFileCacheTest, ttl_gc) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 2; + context.expiration_time = 2; for (int64_t i = 0; i < 12; ++i) { auto key = io::BlockFileCache::hash(fmt::format("key{}", i)); @@ -2965,7 +2954,7 @@ TEST_F(BlockFileCacheTest, recyle_cache_async_ttl) { auto key2 = io::BlockFileCache::hash("key2"); io::BlockFileCache cache(cache_base_path, settings); context.cache_type = io::FileCacheType::TTL; - context.expiration_time = UnixSeconds() + 3600; + context.expiration_time = 3600; FileBlocksHolder* holder; auto sp = SyncPoint::get_instance(); SyncPoint::CallbackGuard guard1; @@ -3050,7 +3039,7 @@ TEST_F(BlockFileCacheTest, remove_directly) { auto key2 = io::BlockFileCache::hash("key2"); io::BlockFileCache cache(cache_base_path, settings); context.cache_type = io::FileCacheType::TTL; - context.expiration_time = UnixSeconds() + 3600; + context.expiration_time = 3600; ASSERT_TRUE(cache.initialize()); for (int i = 0; i < 100; i++) { if (cache.get_async_open_success()) { @@ -3281,7 +3270,7 @@ TEST_F(BlockFileCacheTest, 
test_cache_context) { } { io::IOContext io_ctx; - int64_t expiration_time = UnixSeconds() + 120; + int64_t expiration_time = 120; io_ctx.expiration_time = expiration_time; CacheContext cache_context; cache_context.cache_type = FileCacheType::TTL; @@ -4151,7 +4140,7 @@ TEST_F(BlockFileCacheTest, test_hot_data) { context.stats = &rstats; auto key1 = io::BlockFileCache::hash("key1"); auto key2 = io::BlockFileCache::hash("key2"); - int64_t expiration_time = UnixSeconds() + 300; + int64_t expiration_time = 300; io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); for (int i = 0; i < 100; i++) { @@ -4916,8 +4905,7 @@ TEST_F(BlockFileCacheTest, reset_capacity) { io::FileBlock::State::DOWNLOADED); } context.cache_type = io::FileCacheType::TTL; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 120; + context.expiration_time = 120; for (int64_t offset = 45; offset < 90; offset += 5) { auto holder = cache.get_or_set(key2, offset, 5, context); auto segments = fromHolder(holder); @@ -4961,9 +4949,8 @@ TEST_F(BlockFileCacheTest, change_cache_type1) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 120; - int64_t modify_time = cur_time + 5; + context.expiration_time = 120; + int64_t modify_time = 5; auto key1 = io::BlockFileCache::hash("key1"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -5036,7 +5023,6 @@ TEST_F(BlockFileCacheTest, change_cache_type2) { ReadStatistics rstats; context.stats = &rstats; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); context.cache_type = io::FileCacheType::NORMAL; context.expiration_time = 0; auto key1 = io::BlockFileCache::hash("key1"); @@ -5062,7 +5048,7 @@ TEST_F(BlockFileCacheTest, change_cache_type2) { EXPECT_EQ(segments[0]->expiration_time(), 0); } context.cache_type = io::FileCacheType::TTL; - 
context.expiration_time = cur_time + 120; + context.expiration_time = 120; { auto holder = cache.get_or_set(key1, 50, 10, context); /// Add range [50, 59] auto segments = fromHolder(holder); @@ -5110,8 +5096,7 @@ TEST_F(BlockFileCacheTest, change_cache_type2) { } fs::create_directories(cache_base_path); test_file_cache(FileCacheType::NORMAL); - int64_t cur_time = UnixSeconds(); - int64_t expiration_time = cur_time + 120; + int64_t expiration_time = 120; auto key1 = io::BlockFileCache::hash("key1"); ASSERT_TRUE(global_local_filesystem() ->rename(cache_base_path + "/" + key1.to_string().substr(0, 3) + "/" + @@ -5420,7 +5405,7 @@ TEST_F(BlockFileCacheTest, DISABLE_check_file_cache_consistency) { blocks[0]->_key.meta.type = io::FileCacheType::INDEX; } - int64_t expiration_time = UnixSeconds() + 120; + int64_t expiration_time = 120; { cache_context.cache_type = FileCacheType::TTL; cache_context.expiration_time = expiration_time; @@ -5494,8 +5479,7 @@ TEST_F(BlockFileCacheTest, populate_empty_cache_with_disposable) { context.stats = &rstats; context.cache_type = io::FileCacheType::DISPOSABLE; context.query_id = query_id; - // int64_t cur_time = UnixSeconds(); - // context.expiration_time = cur_time + 120; + // context.expiration_time = 120; auto key1 = io::BlockFileCache::hash("key1"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -5601,8 +5585,7 @@ TEST_F(BlockFileCacheTest, populate_empty_cache_with_normal) { context.stats = &rstats; context.cache_type = io::FileCacheType::NORMAL; context.query_id = query_id; - // int64_t cur_time = UnixSeconds(); - // context.expiration_time = cur_time + 120; + // context.expiration_time = 120; auto key1 = io::BlockFileCache::hash("key1"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -5707,8 +5690,7 @@ TEST_F(BlockFileCacheTest, populate_empty_cache_with_index) { context.stats = &rstats; context.cache_type = io::FileCacheType::INDEX; 
context.query_id = query_id; - // int64_t cur_time = UnixSeconds(); - // context.expiration_time = cur_time + 120; + // context.expiration_time = 120; auto key1 = io::BlockFileCache::hash("key1"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -5813,8 +5795,7 @@ TEST_F(BlockFileCacheTest, populate_empty_cache_with_ttl) { context.stats = &rstats; context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; - int64_t cur_time = UnixSeconds(); - context.expiration_time = cur_time + 120; + context.expiration_time = 120; auto key1 = io::BlockFileCache::hash("key1"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -6090,8 +6071,7 @@ TEST_F(BlockFileCacheTest, seize_after_full) { context1.cache_type = args.first_type; context1.query_id = query_id; if (args.first_type == io::FileCacheType::TTL) { - int64_t cur_time = UnixSeconds(); - context1.expiration_time = cur_time + 120; + context1.expiration_time = 120; } auto key1 = io::BlockFileCache::hash("key1"); @@ -6118,8 +6098,7 @@ TEST_F(BlockFileCacheTest, seize_after_full) { context2.cache_type = args.second_type; context2.query_id = query_id; if (context2.cache_type == io::FileCacheType::TTL) { - int64_t cur_time = UnixSeconds(); - context2.expiration_time = cur_time + 120; + context2.expiration_time = 120; } auto key2 = io::BlockFileCache::hash("key2"); offset = 0; @@ -6251,7 +6230,7 @@ TEST_F(BlockFileCacheTest, evict_privilege_order_for_disposable) { context3.stats = &rstats; context3.cache_type = io::FileCacheType::TTL; context3.query_id = query_id; - context3.expiration_time = UnixSeconds() + 120; + context3.expiration_time = 120; auto key3 = io::BlockFileCache::hash("key3"); offset = 0; @@ -6430,7 +6409,7 @@ TEST_F(BlockFileCacheTest, evict_privilege_order_for_normal) { context3.stats = &rstats; context3.cache_type = io::FileCacheType::TTL; context3.query_id = query_id; - context3.expiration_time = UnixSeconds() + 120; + 
context3.expiration_time = 120; auto key3 = io::BlockFileCache::hash("key3"); offset = 0; @@ -6608,7 +6587,7 @@ TEST_F(BlockFileCacheTest, evict_privilege_order_for_index) { context3.stats = &rstats; context3.cache_type = io::FileCacheType::TTL; context3.query_id = query_id; - context3.expiration_time = UnixSeconds() + 120; + context3.expiration_time = 120; auto key3 = io::BlockFileCache::hash("key3"); offset = 0; @@ -6814,7 +6793,7 @@ TEST_F(BlockFileCacheTest, evict_privilege_order_for_ttl) { context4.stats = &rstats; context4.cache_type = io::FileCacheType::TTL; context4.query_id = query_id; - context4.expiration_time = UnixSeconds() + 120; + context4.expiration_time = 120; auto key4 = io::BlockFileCache::hash("key4"); offset = 0; @@ -6917,8 +6896,7 @@ TEST_F(BlockFileCacheTest, evict_in_advance) { context.stats = &rstats; context.cache_type = io::FileCacheType::NORMAL; context.query_id = query_id; - // int64_t cur_time = UnixSeconds(); - // context.expiration_time = cur_time + 120; + // context.expiration_time = 120; auto key1 = io::BlockFileCache::hash("key1"); io::BlockFileCache cache(cache_base_path, settings); ASSERT_TRUE(cache.initialize()); @@ -7520,8 +7498,7 @@ TEST_F(BlockFileCacheTest, DISABLE_test_upgrade_cache_dir_version) { context.stats = &rstats; context.cache_type = io::FileCacheType::NORMAL; context.query_id = query_id; - // int64_t cur_time = UnixSeconds(); - // context.expiration_time = cur_time + 120; + // context.expiration_time = 120; LOG(INFO) << "start from empty"; auto key1 = io::BlockFileCache::hash("key1"); config::ignore_file_cache_dir_upgrade_failure = true; @@ -7874,8 +7851,7 @@ TEST_F(BlockFileCacheTest, cached_remote_file_reader_ttl_index) { IOContext io_ctx; FileCacheStatistics stats; io_ctx.file_cache_stats = &stats; - int64_t cur_time = UnixSeconds(); - io_ctx.expiration_time = cur_time + 120; + io_ctx.expiration_time = 120; size_t bytes_read {0}; EXPECT_TRUE( reader.read_at(0, Slice(buffer.data(), buffer.size()), &bytes_read, 
&io_ctx).ok()); @@ -7958,8 +7934,7 @@ TEST_F(BlockFileCacheTest, cached_remote_file_reader_normal_index) { FileCacheStatistics stats; io_ctx.file_cache_stats = &stats; io_ctx.is_index_data = true; - // int64_t cur_time = UnixSeconds(); - // io_ctx.expiration_time = cur_time + 120; + // io_ctx.expiration_time = 120; size_t bytes_read {0}; EXPECT_TRUE( reader.read_at(0, Slice(buffer.data(), buffer.size()), &bytes_read, &io_ctx).ok()); diff --git a/be/test/io/cache/block_file_cache_test_lru_dump.cpp b/be/test/io/cache/block_file_cache_test_lru_dump.cpp index 65762b216a09b6..8172de487fc50e 100644 --- a/be/test/io/cache/block_file_cache_test_lru_dump.cpp +++ b/be/test/io/cache/block_file_cache_test_lru_dump.cpp @@ -108,7 +108,7 @@ TEST_F(BlockFileCacheTest, test_lru_log_record_replay_dump_restore) { context3.stats = &rstats; context3.cache_type = io::FileCacheType::TTL; context3.query_id = query_id; - context3.expiration_time = UnixSeconds() + 120; + context3.expiration_time = 120; auto key3 = io::BlockFileCache::hash("key3"); offset = 0; diff --git a/be/test/io/cache/block_file_cache_test_meta_store.cpp b/be/test/io/cache/block_file_cache_test_meta_store.cpp index 2f08c173962c18..3fc652b466e7be 100644 --- a/be/test/io/cache/block_file_cache_test_meta_store.cpp +++ b/be/test/io/cache/block_file_cache_test_meta_store.cpp @@ -65,7 +65,7 @@ TEST_F(BlockFileCacheTest, version3_add_remove_restart) { settings.max_file_block_size = 100000; settings.max_query_cache_size = 30; - uint64_t expiration_time = UnixSeconds() + 120; + uint64_t expiration_time = 120; int i = 0; { // cache1 diff --git a/be/test/io/cache/block_file_cache_ttl_mgr_test.cpp b/be/test/io/cache/block_file_cache_ttl_mgr_test.cpp new file mode 100644 index 00000000000000..9e965aa4351298 --- /dev/null +++ b/be/test/io/cache/block_file_cache_ttl_mgr_test.cpp @@ -0,0 +1,348 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "io/cache/block_file_cache_ttl_mgr.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "common/status.h" +#include "io/cache/block_file_cache.h" +#include "io/cache/cache_block_meta_store.h" +#include "io/cache/file_block.h" +#include "io/cache/file_cache_common.h" +#include "olap/base_tablet.h" +#include "olap/storage_engine.h" +#include "runtime/exec_env.h" +#include "util/slice.h" +#include "util/time.h" +#include "util/uid_util.h" + +namespace doris::io { + +namespace fs = std::filesystem; + +namespace { + +class FakeTablet : public BaseTablet { +public: + FakeTablet(int64_t creation_time, int64_t ttl_seconds) + : BaseTablet(create_meta(creation_time, ttl_seconds)) {} + + void set_creation_time(int64_t value) { _tablet_meta->set_creation_time(value); } + + void set_ttl_seconds(int64_t value) { _tablet_meta->set_ttl_seconds(value); } + + std::string tablet_path() const override { return ""; } + + bool exceed_version_limit(int32_t /*limit*/) override { return false; } + + Result> create_rowset_writer(RowsetWriterContext& /*context*/, + bool /*vertical*/) override { + return ResultError(Status::NotSupported("fake tablet")); + } + + Result> create_transient_rowset_writer( + const Rowset& 
/*rowset*/, std::shared_ptr /*partial_update_info*/, + int64_t /*txn_expiration*/ = 0) override { + return ResultError(Status::NotSupported("fake tablet")); + } + + Status capture_rs_readers(const Version& /*spec_version*/, + std::vector* /*rs_splits*/, + const CaptureRowsetOps& /*opts*/) override { + return Status::NotSupported("fake tablet"); + } + + Status save_delete_bitmap(const TabletTxnInfo* /*txn_info*/, int64_t /*txn_id*/, + DeleteBitmapPtr /*delete_bitmap*/, RowsetWriter* /*rowset_writer*/, + const RowsetIdUnorderedSet& /*cur_rowset_ids*/, + int64_t /*lock_id*/ = -1, + int64_t /*next_visible_version*/ = -1) override { + return Status::NotSupported("fake tablet"); + } + + CalcDeleteBitmapExecutor* calc_delete_bitmap_executor() override { return nullptr; } + + void clear_cache() override {} + + Versions calc_missed_versions(int64_t /*spec_version*/, + Versions /*existing_versions*/) const override { + return {}; + } + + size_t tablet_footprint() override { return 0; } + +private: + static TabletMetaSharedPtr create_meta(int64_t creation_time, int64_t ttl_seconds) { + auto schema = std::make_shared(); + auto meta = std::make_shared(schema); + meta->set_creation_time(creation_time); + meta->set_ttl_seconds(ttl_seconds); + return meta; + } +}; + +class FakeStorageEngine : public BaseStorageEngine { +public: + FakeStorageEngine() : BaseStorageEngine(BaseStorageEngine::Type::LOCAL, UniqueId::gen_uid()) {} + + Status open() override { return Status::OK(); } + + void stop() override {} + + bool stopped() override { return false; } + + Status start_bg_threads(std::shared_ptr /*wg_sptr*/ = nullptr) override { + return Status::OK(); + } + + Result get_tablet(int64_t tablet_id, SyncRowsetStats* /*sync_stats*/ = nullptr, + bool /*force_use_cache*/ = false, + bool /*cache_on_miss*/ = true) override { + std::lock_guard lock(_mutex); + auto it = _tablets.find(tablet_id); + if (it == _tablets.end()) { + return ResultError(Status::NotFound("tablet {} not found", 
tablet_id)); + } + return it->second; + } + + Status get_tablet_meta(int64_t tablet_id, TabletMetaSharedPtr* tablet_meta, + bool /*force_use_only_cached*/ = false) override { + auto tablet_res = get_tablet(tablet_id); + if (!tablet_res.has_value()) { + return tablet_res.error(); + } + if (tablet_meta != nullptr) { + *tablet_meta = tablet_res.value()->tablet_meta(); + } + return Status::OK(); + } + + Status set_cluster_id(int32_t /*cluster_id*/) override { return Status::OK(); } + + void add_tablet(int64_t tablet_id, const BaseTabletSPtr& tablet) { + std::lock_guard lock(_mutex); + _tablets[tablet_id] = tablet; + } + +private: + std::mutex _mutex; + std::unordered_map _tablets; +}; + +std::vector blocks_from_holder(const FileBlocksHolder& holder) { + return std::vector(holder.file_blocks.begin(), holder.file_blocks.end()); +} + +template +bool wait_for_condition(Predicate&& predicate, std::chrono::milliseconds timeout, + std::chrono::milliseconds interval = std::chrono::milliseconds(20)) { + auto deadline = std::chrono::steady_clock::now() + timeout; + while (std::chrono::steady_clock::now() < deadline) { + if (predicate()) { + return true; + } + std::this_thread::sleep_for(interval); + } + return predicate(); +} + +} // namespace + +class BlockFileCacheTtlMgrTest : public testing::Test { +protected: + void SetUp() override { + _orig_ttl_update_interval = config::file_cache_background_ttl_info_update_interval_ms; + _orig_ttl_gc_interval = config::file_cache_background_ttl_gc_interval_ms; + _orig_tablet_flush_interval = config::file_cache_background_tablet_id_flush_interval_ms; + + config::file_cache_background_ttl_info_update_interval_ms = 20; + config::file_cache_background_ttl_gc_interval_ms = 20; + config::file_cache_background_tablet_id_flush_interval_ms = 5; + + _test_root = fs::temp_directory_path() / "block_file_cache_ttl_mgr_test"; + if (fs::exists(_test_root)) { + fs::remove_all(_test_root); + } + fs::create_directories(_test_root); + _cache_dir = 
(_test_root / "cache").string(); + _meta_dir = (_test_root / "meta").string(); + + auto engine = std::make_unique(); + _fake_engine = engine.get(); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + + _meta_store = std::make_unique(_meta_dir, 32); + ASSERT_TRUE(_meta_store->init().ok()); + + FileCacheSettings settings; + settings.capacity = 4 * 1024 * 1024; + settings.max_file_block_size = 1024; + settings.ttl_queue_size = settings.capacity; + settings.ttl_queue_elements = 128; + settings.query_queue_size = settings.capacity; + settings.query_queue_elements = 128; + settings.index_queue_size = settings.capacity; + settings.index_queue_elements = 128; + settings.disposable_queue_size = settings.capacity; + settings.disposable_queue_elements = 128; + + _cache = std::make_unique(_cache_dir, settings); + ASSERT_TRUE(_cache->initialize()); + ASSERT_TRUE(wait_for_condition([this]() { return _cache->get_async_open_success(); }, + std::chrono::seconds(5))); + } + + void TearDown() override { + _ttl_mgr.reset(); + _cache.reset(); + _meta_store.reset(); + + if (_fake_engine != nullptr) { + ExecEnv::GetInstance()->set_storage_engine(nullptr); + _fake_engine = nullptr; + } + + if (!_test_root.empty() && fs::exists(_test_root)) { + fs::remove_all(_test_root); + } + + config::file_cache_background_ttl_info_update_interval_ms = _orig_ttl_update_interval; + config::file_cache_background_ttl_gc_interval_ms = _orig_ttl_gc_interval; + config::file_cache_background_tablet_id_flush_interval_ms = _orig_tablet_flush_interval; + } + + FileBlockSPtr create_block(int64_t tablet_id, const std::string& cache_key, size_t offset, + size_t size, UInt128Wrapper* out_hash) { + auto hash = BlockFileCache::hash(cache_key); + if (out_hash != nullptr) { + *out_hash = hash; + } + + CacheContext context; + ReadStatistics stats; + context.stats = &stats; + context.cache_type = FileCacheType::NORMAL; + context.tablet_id = tablet_id; + + auto holder = _cache->get_or_set(hash, offset, size, 
context); + auto blocks = blocks_from_holder(holder); + + EXPECT_FALSE(blocks.empty()); + if (blocks.empty()) { + return nullptr; + } + // Some cache configurations may split the requested range into multiple + // file blocks. Pick the file block that contains the requested offset + // (the one the caller will start reading from). + auto it = std::find_if( + blocks.begin(), blocks.end(), [offset](const FileBlockSPtr& candidate) { + return candidate->range().left <= offset && candidate->range().right >= offset; + }); + EXPECT_NE(it, blocks.end()); + if (it == blocks.end()) { + return nullptr; + } + auto block = *it; + EXPECT_TRUE(block); + EXPECT_EQ(FileCacheType::NORMAL, block->cache_type()); + EXPECT_EQ(FileBlock::get_caller_id(), block->get_or_set_downloader()); + // Only append up to the selected file block's size. The requested + // range may be split into multiple file blocks, so appending the + // original requested size could overflow the selected block. + size_t write_size = block->range().size(); + std::string data(write_size, 'a'); + EXPECT_TRUE(block->append(Slice(data.data(), data.size())).ok()); + EXPECT_TRUE(block->finalize().ok()); + return block; + } + + void persist_block_meta(int64_t tablet_id, const UInt128Wrapper& hash, size_t offset, + size_t size) { + BlockMetaKey key {tablet_id, hash, offset}; + BlockMeta meta {FileCacheType::NORMAL, size, 0}; + _meta_store->put(key, meta); + ASSERT_TRUE(wait_for_condition([this, &key]() { return _meta_store->get(key).has_value(); }, + std::chrono::seconds(2))); + } + + FakeStorageEngine* fake_engine() const { return _fake_engine; } + + std::unique_ptr _meta_store; + std::unique_ptr _cache; + std::unique_ptr _ttl_mgr; + FakeStorageEngine* _fake_engine = nullptr; + +private: + fs::path _test_root; + std::string _cache_dir; + std::string _meta_dir; + int64_t _orig_ttl_update_interval = 0; + int64_t _orig_ttl_gc_interval = 0; + int64_t _orig_tablet_flush_interval = 0; +}; + +TEST_F(BlockFileCacheTtlMgrTest, 
BlocksSwitchToTtlWhenTabletHasTtl) { + constexpr int64_t kTabletId = 1001; + auto tablet = std::make_shared(UnixSeconds(), 60); + fake_engine()->add_tablet(kTabletId, tablet); + + UInt128Wrapper hash; + auto block = create_block(kTabletId, "ttl-tablet", 0, 1024, &hash); + persist_block_meta(kTabletId, hash, block->range().left, block->range().size()); + + _ttl_mgr = std::make_unique(_cache.get(), _meta_store.get()); + _ttl_mgr->register_tablet_id(kTabletId); + + ASSERT_TRUE(wait_for_condition([&]() { return block->cache_type() == FileCacheType::TTL; }, + std::chrono::seconds(5))); +} + +TEST_F(BlockFileCacheTtlMgrTest, ExpiredTabletMovesBlocksBackToNormal) { + constexpr int64_t kTabletId = 2002; + auto tablet = std::make_shared(UnixSeconds(), 120); + fake_engine()->add_tablet(kTabletId, tablet); + + UInt128Wrapper hash; + auto block = create_block(kTabletId, "ttl-expire", 0, 2048, &hash); + persist_block_meta(kTabletId, hash, block->range().left, block->range().size()); + + _ttl_mgr = std::make_unique(_cache.get(), _meta_store.get()); + _ttl_mgr->register_tablet_id(kTabletId); + + ASSERT_TRUE(wait_for_condition([&]() { return block->cache_type() == FileCacheType::TTL; }, + std::chrono::seconds(5))); + + tablet->set_creation_time(UnixSeconds() - 120); + tablet->set_ttl_seconds(1); + _ttl_mgr->register_tablet_id(kTabletId); + + ASSERT_TRUE(wait_for_condition([&]() { return block->cache_type() == FileCacheType::NORMAL; }, + std::chrono::seconds(5))); +} + +} // namespace doris::io diff --git a/regression-test/data/cloud_p0/cache/ttl/test_show_ttl.out b/regression-test/data/cloud_p0/cache/ttl/test_show_ttl.out index 5beec6cfd12160..543d7e19b73862 100644 --- a/regression-test/data/cloud_p0/cache/ttl/test_show_ttl.out +++ b/regression-test/data/cloud_p0/cache/ttl/test_show_ttl.out @@ -6,11 +6,11 @@ 0 -- !test_show_ttl_3 -- -customer_ttl CREATE TABLE `customer_ttl` (\n `C_CUSTKEY` int NOT NULL,\n `C_NAME` varchar(25) NOT NULL,\n `C_ADDRESS` varchar(40) NOT NULL,\n 
`C_NATIONKEY` int NOT NULL,\n `C_PHONE` char(15) NOT NULL,\n `C_ACCTBAL` decimal(15,2) NOT NULL,\n `C_MKTSEGMENT` char(10) NOT NULL,\n `C_COMMENT` varchar(117) NOT NULL\n) ENGINE=OLAP\nDUPLICATE KEY(`C_CUSTKEY`, `C_NAME`)\nDISTRIBUTED BY HASH(`C_CUSTKEY`) BUCKETS 32\nPROPERTIES (\n"file_cache_ttl_seconds" = "300",\n"is_being_synced" = "false",\n"storage_medium" = "hdd",\n"storage_format" = "V2",\n"inverted_index_storage_format" = "V2",\n"compression" = "ZSTD",\n"light_schema_change" = "true",\n"disable_auto_compaction" = "false",\n"enable_single_replica_compaction" = "false",\n"group_commit_interval_ms" = "10000",\n"group_commit_data_bytes" = "134217728"\n); +customer_ttl CREATE TABLE `customer_ttl` (\n `C_CUSTKEY` int NOT NULL,\n `C_NAME` varchar(25) NOT NULL,\n `C_ADDRESS` varchar(40) NOT NULL,\n `C_NATIONKEY` int NOT NULL,\n `C_PHONE` char(15) NOT NULL,\n `C_ACCTBAL` decimal(15,2) NOT NULL,\n `C_MKTSEGMENT` char(10) NOT NULL,\n `C_COMMENT` varchar(117) NOT NULL\n) ENGINE=OLAP\nDUPLICATE KEY(`C_CUSTKEY`, `C_NAME`)\nDISTRIBUTED BY HASH(`C_CUSTKEY`) BUCKETS 32\nPROPERTIES (\n"file_cache_ttl_seconds" = "300",\n"is_being_synced" = "false",\n"storage_medium" = "hdd",\n"storage_format" = "V2",\n"inverted_index_storage_format" = "V3",\n"compression" = "ZSTD",\n"light_schema_change" = "true",\n"disable_auto_compaction" = "false",\n"enable_single_replica_compaction" = "false",\n"group_commit_interval_ms" = "10000",\n"group_commit_data_bytes" = "134217728"\n); -- !test_show_ttl_4 -- 0 -- !test_show_ttl_5 -- -customer_ttl CREATE TABLE `customer_ttl` (\n `C_CUSTKEY` int NOT NULL,\n `C_NAME` varchar(25) NOT NULL,\n `C_ADDRESS` varchar(40) NOT NULL,\n `C_NATIONKEY` int NOT NULL,\n `C_PHONE` char(15) NOT NULL,\n `C_ACCTBAL` decimal(15,2) NOT NULL,\n `C_MKTSEGMENT` char(10) NOT NULL,\n `C_COMMENT` varchar(117) NOT NULL\n) ENGINE=OLAP\nDUPLICATE KEY(`C_CUSTKEY`, `C_NAME`)\nDISTRIBUTED BY HASH(`C_CUSTKEY`) BUCKETS 32\nPROPERTIES (\n"file_cache_ttl_seconds" = 
"0",\n"is_being_synced" = "false",\n"storage_medium" = "hdd",\n"storage_format" = "V2",\n"inverted_index_storage_format" = "V2",\n"compression" = "ZSTD",\n"light_schema_change" = "true",\n"disable_auto_compaction" = "false",\n"enable_single_replica_compaction" = "false",\n"group_commit_interval_ms" = "10000",\n"group_commit_data_bytes" = "134217728"\n); +customer_ttl CREATE TABLE `customer_ttl` (\n `C_CUSTKEY` int NOT NULL,\n `C_NAME` varchar(25) NOT NULL,\n `C_ADDRESS` varchar(40) NOT NULL,\n `C_NATIONKEY` int NOT NULL,\n `C_PHONE` char(15) NOT NULL,\n `C_ACCTBAL` decimal(15,2) NOT NULL,\n `C_MKTSEGMENT` char(10) NOT NULL,\n `C_COMMENT` varchar(117) NOT NULL\n) ENGINE=OLAP\nDUPLICATE KEY(`C_CUSTKEY`, `C_NAME`)\nDISTRIBUTED BY HASH(`C_CUSTKEY`) BUCKETS 32\nPROPERTIES (\n"file_cache_ttl_seconds" = "0",\n"is_being_synced" = "false",\n"storage_medium" = "hdd",\n"storage_format" = "V2",\n"inverted_index_storage_format" = "V3",\n"compression" = "ZSTD",\n"light_schema_change" = "true",\n"disable_auto_compaction" = "false",\n"enable_single_replica_compaction" = "false",\n"group_commit_interval_ms" = "10000",\n"group_commit_data_bytes" = "134217728"\n); diff --git a/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_1.groovy b/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_1.groovy index 8f5d070e091c39..f7252501b1ee1f 100644 --- a/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_1.groovy +++ b/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_1.groovy @@ -20,7 +20,10 @@ import org.codehaus.groovy.runtime.IOGroovyMethods suite("alter_ttl_1") { def custoBeConfig = [ enable_evict_file_cache_in_advance : false, - file_cache_enter_disk_resource_limit_mode_percent : 99 + file_cache_enter_disk_resource_limit_mode_percent : 99, + file_cache_background_ttl_gc_interval_ms : 1000, + file_cache_background_ttl_info_update_interval_ms : 1000, + file_cache_background_tablet_id_flush_interval_ms : 1000 ] setBeConfigTemporary(custoBeConfig) { @@ -67,41 +70,68 @@ 
suite("alter_ttl_1") { } } - def s3BucketName = getS3BucketName() - def s3WithProperties = """WITH S3 ( - |"AWS_ACCESS_KEY" = "${getS3AK()}", - |"AWS_SECRET_KEY" = "${getS3SK()}", - |"AWS_ENDPOINT" = "${getS3Endpoint()}", - |"AWS_REGION" = "${getS3Region()}", - |"provider" = "${getS3Provider()}") - |PROPERTIES( - |"exec_mem_limit" = "8589934592", - |"load_parallelism" = "3")""".stripMargin() + def getTabletIds = { String tableName -> + def tablets = sql "show tablets from ${tableName}" + assertTrue(tablets.size() > 0, "No tablets found for table ${tableName}") + tablets.collect { it[0] as Long } + } + def waitForFileCacheType = { List tabletIds, String expectedType, long timeoutMs = 120000L, long intervalMs = 2000L -> + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + boolean allMatch = true + for (Long tabletId in tabletIds) { + def rows = sql "select type from information_schema.file_cache_info where tablet_id = ${tabletId}" + if (rows.isEmpty()) { + logger.warn("file_cache_info is empty for tablet ${tabletId} while waiting for ${expectedType}") + allMatch = false + break + } + def mismatch = rows.find { row -> !row[0]?.toString()?.equalsIgnoreCase(expectedType) } + if (mismatch) { + logger.info("tablet ${tabletId} has cache types ${rows.collect { it[0] }} while waiting for ${expectedType}") + allMatch = false + break + } + } + if (allMatch) { + logger.info("All file cache entries for tablets ${tabletIds} are ${expectedType}") + return + } + sleep(intervalMs) + } + assertTrue(false, "Timeout waiting for file_cache_info type ${expectedType} for tablets ${tabletIds}") + } sql new File("""${context.file.parent}/../ddl/customer_ttl_delete.sql""").text def load_customer_ttl_once = { String table -> - def uniqueID = Math.abs(UUID.randomUUID().hashCode()).toString() // def table = "customer" // create table if not exists sql (new File("""${context.file.parent}/../ddl/${table}.sql""").text + ttlProperties) sql """ alter 
table ${table} set ("disable_auto_compaction" = "true") """ // no influence from compaction - def loadLabel = table + "_" + uniqueID - // load data from cos - def loadSql = new File("""${context.file.parent}/../ddl/${table}_load.sql""").text.replaceAll("\\\$\\{s3BucketName\\}", s3BucketName) - loadSql = loadSql.replaceAll("\\\$\\{loadLabel\\}", loadLabel) + s3WithProperties - sql loadSql - - // check load state - while (true) { - def stateResult = sql "show load where Label = '${loadLabel}'" - def loadState = stateResult[stateResult.size() - 1][2].toString() - if ("CANCELLED".equalsIgnoreCase(loadState)) { - throw new IllegalStateException("load ${loadLabel} failed.") - } else if ("FINISHED".equalsIgnoreCase(loadState)) { - break + // insert rows until the dataset reaches ~100MB + def totalRows = 200 + def batchSize = 100 + def commentSuffix = ' ' + ('X' * 50) + for (int offset = 0; offset < totalRows; offset += batchSize) { + def sb = new StringBuilder() + int batchEnd = Math.min(totalRows, offset + batchSize) + for (int idx = offset; idx < batchEnd; idx++) { + def customerId = 10001 + idx + def customerName = String.format('Customer#%09d', customerId) + sb.append("""INSERT INTO ${table} VALUES ( + ${customerId}, + '${customerName}', + 'Address Line 1', + 15, + '123-456-7890', + 12345.67, + 'AUTOMOBILE', + 'This is a test comment for the customer.${commentSuffix}' + ); + """) } - sleep(5000) + sql sb.toString() } } @@ -111,6 +141,8 @@ suite("alter_ttl_1") { sleep(30000) load_customer_ttl_once("customer_ttl") + def tabletIds = getTabletIds.call("customer_ttl") + waitForFileCacheType.call(tabletIds, "ttl", 60000L) sleep(30000) long ttl_cache_size = 0 long normal_cache_size = 0 @@ -158,6 +190,7 @@ suite("alter_ttl_1") { continue } def i = line.indexOf(' ') + logger.info("ttl_cache_size line: " + line) assertEquals(line.substring(i).toLong(), 0) } @@ -167,11 +200,16 @@ suite("alter_ttl_1") { continue } def i = line.indexOf(' ') + logger.info("ttl_cache_size: " + 
ttl_cache_size) + logger.info("normal_cache_size: " + normal_cache_size) + logger.info("new normal cache_size: " + line) assertEquals(line.substring(i).toLong(), ttl_cache_size + normal_cache_size) flag1 = true } } assertTrue(flag1) } + + waitForFileCacheType.call(tabletIds, "normal", 180000L) } } diff --git a/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_2.groovy b/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_2.groovy index ecf5541a6d4ef1..cb804ae3b625f5 100644 --- a/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_2.groovy +++ b/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_2.groovy @@ -20,7 +20,10 @@ import org.codehaus.groovy.runtime.IOGroovyMethods suite("alter_ttl_2") { def custoBeConfig = [ enable_evict_file_cache_in_advance : false, - file_cache_enter_disk_resource_limit_mode_percent : 99 + file_cache_enter_disk_resource_limit_mode_percent : 99, + file_cache_background_ttl_gc_interval_ms : 1000, + file_cache_background_ttl_info_update_interval_ms : 1000, + file_cache_background_tablet_id_flush_interval_ms : 1000 ] setBeConfigTemporary(custoBeConfig) { @@ -65,41 +68,65 @@ suite("alter_ttl_2") { } } - def s3BucketName = getS3BucketName() - def s3WithProperties = """WITH S3 ( - |"AWS_ACCESS_KEY" = "${getS3AK()}", - |"AWS_SECRET_KEY" = "${getS3SK()}", - |"AWS_ENDPOINT" = "${getS3Endpoint()}", - |"AWS_REGION" = "${getS3Region()}", - |"provider" = "${getS3Provider()}") - |PROPERTIES( - |"exec_mem_limit" = "8589934592", - |"load_parallelism" = "3")""".stripMargin() + def getTabletIds = { String tableName -> + def tablets = sql "show tablets from ${tableName}" + assertTrue(tablets.size() > 0, "No tablets found for table ${tableName}") + tablets.collect { it[0] as Long } + } + def waitForFileCacheType = { List tabletIds, String expectedType, long timeoutMs = 180000L, long intervalMs = 2000L -> + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + boolean allMatch = true + for (Long tabletId in 
tabletIds) { + def rows = sql "select type from information_schema.file_cache_info where tablet_id = ${tabletId}" + if (rows.isEmpty()) { + logger.warn("file_cache_info is empty for tablet ${tabletId} while waiting for ${expectedType}") + allMatch = false + break + } + def mismatch = rows.find { row -> !row[0]?.toString()?.equalsIgnoreCase(expectedType) } + if (mismatch) { + logger.info("tablet ${tabletId} has cache types ${rows.collect { it[0] }} while waiting for ${expectedType}") + allMatch = false + break + } + } + if (allMatch) { + logger.info("All file cache entries for tablets ${tabletIds} are ${expectedType}") + return + } + sleep(intervalMs) + } + assertTrue(false, "Timeout waiting for file_cache_info type ${expectedType} for tablets ${tabletIds}") + } sql new File("""${context.file.parent}/../ddl/customer_ttl_delete.sql""").text def load_customer_ttl_once = { String table -> - def uniqueID = Math.abs(UUID.randomUUID().hashCode()).toString() - // def table = "customer" - // create table if not exists sql (new File("""${context.file.parent}/../ddl/${table}.sql""").text + ttlProperties) sql """ alter table ${table} set ("disable_auto_compaction" = "true") """ // no influence from compaction - def loadLabel = table + "_" + uniqueID - // load data from cos - def loadSql = new File("""${context.file.parent}/../ddl/${table}_load.sql""").text.replaceAll("\\\$\\{s3BucketName\\}", s3BucketName) - loadSql = loadSql.replaceAll("\\\$\\{loadLabel\\}", loadLabel) + s3WithProperties - sql loadSql - - // check load state - while (true) { - def stateResult = sql "show load where Label = '${loadLabel}'" - def loadState = stateResult[stateResult.size() - 1][2].toString() - if ("CANCELLED".equalsIgnoreCase(loadState)) { - throw new IllegalStateException("load ${loadLabel} failed.") - } else if ("FINISHED".equalsIgnoreCase(loadState)) { - break + def totalRows = 200 + def batchSize = 100 + def commentSuffix = ' ' + ('X' * 50) + for (int offset = 0; offset < totalRows; offset 
+= batchSize) { + def sb = new StringBuilder() + int batchEnd = Math.min(totalRows, offset + batchSize) + for (int idx = offset; idx < batchEnd; idx++) { + def customerId = 10001 + idx + def customerName = String.format('Customer#%09d', customerId) + sb.append("""INSERT INTO ${table} VALUES ( + ${customerId}, + '${customerName}', + 'Address Line 1', + 15, + '123-456-7890', + 12345.67, + 'AUTOMOBILE', + 'This is a test comment for the customer.${commentSuffix}' + ); + """) } - sleep(5000) + sql sb.toString() } } @@ -120,6 +147,7 @@ suite("alter_ttl_2") { continue } def i = line.indexOf(' ') + logger.info("ttl_cache_size line before assert zero: " + line) assertEquals(line.substring(i).toLong(), 0) flag1 = true } @@ -128,6 +156,8 @@ suite("alter_ttl_2") { } load_customer_ttl_once("customer_ttl") + def tabletIds = getTabletIds.call("customer_ttl") + waitForFileCacheType.call(tabletIds, "ttl", 60000L) sql """ select count(*) from customer_ttl """ sleep(30000) long ttl_cache_size = 0 @@ -144,6 +174,7 @@ suite("alter_ttl_2") { continue } def i = line.indexOf(' ') + logger.info("ttl_cache_size line after load: " + line) ttl_cache_size = line.substring(i).toLong() flag1 = true } @@ -165,6 +196,7 @@ suite("alter_ttl_2") { continue } def i = line.indexOf(' ') + logger.info("ttl_cache_size line after ttl update: " + line) assertEquals(line.substring(i).toLong(), ttl_cache_size) flag1 = true } @@ -192,5 +224,7 @@ suite("alter_ttl_2") { } assertTrue(flag1) } + + waitForFileCacheType.call(tabletIds, "normal", 300000L) } } diff --git a/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_3.groovy b/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_3.groovy index ac60b45b3c31fb..4682176b158bb2 100644 --- a/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_3.groovy +++ b/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_3.groovy @@ -20,7 +20,10 @@ import org.codehaus.groovy.runtime.IOGroovyMethods suite("alter_ttl_3") { def custoBeConfig = [ enable_evict_file_cache_in_advance 
: false, - file_cache_enter_disk_resource_limit_mode_percent : 99 + file_cache_enter_disk_resource_limit_mode_percent : 99, + file_cache_background_ttl_gc_interval_ms : 1000, + file_cache_background_ttl_info_update_interval_ms : 1000, + file_cache_background_tablet_id_flush_interval_ms : 1000 ] setBeConfigTemporary(custoBeConfig) { @@ -65,41 +68,65 @@ suite("alter_ttl_3") { } } - def s3BucketName = getS3BucketName() - def s3WithProperties = """WITH S3 ( - |"AWS_ACCESS_KEY" = "${getS3AK()}", - |"AWS_SECRET_KEY" = "${getS3SK()}", - |"AWS_ENDPOINT" = "${getS3Endpoint()}", - |"AWS_REGION" = "${getS3Region()}", - |"provider" = "${getS3Provider()}") - |PROPERTIES( - |"exec_mem_limit" = "8589934592", - |"load_parallelism" = "3")""".stripMargin() + def getTabletIds = { String tableName -> + def tablets = sql "show tablets from ${tableName}" + assertTrue(tablets.size() > 0, "No tablets found for table ${tableName}") + tablets.collect { it[0] as Long } + } + def waitForFileCacheType = { List tabletIds, String expectedType, long timeoutMs = 120000L, long intervalMs = 2000L -> + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + boolean allMatch = true + for (Long tabletId in tabletIds) { + def rows = sql "select type from information_schema.file_cache_info where tablet_id = ${tabletId}" + if (rows.isEmpty()) { + logger.warn("file_cache_info is empty for tablet ${tabletId} while waiting for ${expectedType}") + allMatch = false + break + } + def mismatch = rows.find { row -> !row[0]?.toString()?.equalsIgnoreCase(expectedType) } + if (mismatch) { + logger.info("tablet ${tabletId} has cache types ${rows.collect { it[0] }} while waiting for ${expectedType}") + allMatch = false + break + } + } + if (allMatch) { + logger.info("All file cache entries for tablets ${tabletIds} are ${expectedType}") + return + } + sleep(intervalMs) + } + assertTrue(false, "Timeout waiting for file_cache_info type ${expectedType} for tablets 
${tabletIds}") + } sql new File("""${context.file.parent}/../ddl/customer_ttl_delete.sql""").text def load_customer_ttl_once = { String table -> - def uniqueID = Math.abs(UUID.randomUUID().hashCode()).toString() - // def table = "customer" - // create table if not exists sql (new File("""${context.file.parent}/../ddl/${table}.sql""").text + ttlProperties) - def loadLabel = table + "_" + uniqueID sql """ alter table ${table} set ("disable_auto_compaction" = "true") """ // no influence from compaction - // load data from cos - def loadSql = new File("""${context.file.parent}/../ddl/${table}_load.sql""").text.replaceAll("\\\$\\{s3BucketName\\}", s3BucketName) - loadSql = loadSql.replaceAll("\\\$\\{loadLabel\\}", loadLabel) + s3WithProperties - sql loadSql - - // check load state - while (true) { - def stateResult = sql "show load where Label = '${loadLabel}'" - def loadState = stateResult[stateResult.size() - 1][2].toString() - if ("CANCELLED".equalsIgnoreCase(loadState)) { - throw new IllegalStateException("load ${loadLabel} failed.") - } else if ("FINISHED".equalsIgnoreCase(loadState)) { - break + def totalRows = 200 + def batchSize = 100 + def commentSuffix = ' ' + ('X' * 50) + for (int offset = 0; offset < totalRows; offset += batchSize) { + def sb = new StringBuilder() + int batchEnd = Math.min(totalRows, offset + batchSize) + for (int idx = offset; idx < batchEnd; idx++) { + def customerId = 10001 + idx + def customerName = String.format('Customer#%09d', customerId) + sb.append("""INSERT INTO ${table} VALUES ( + ${customerId}, + '${customerName}', + 'Address Line 1', + 15, + '123-456-7890', + 12345.67, + 'AUTOMOBILE', + 'This is a test comment for the customer.${commentSuffix}' + ); + """) } - sleep(5000) + sql sb.toString() } } @@ -109,9 +136,13 @@ suite("alter_ttl_3") { sleep(30000) load_customer_ttl_once("customer_ttl") + def tabletIds = getTabletIds.call("customer_ttl") + // ttl=0 means data stays in normal cache until altered to a positive ttl + 
waitForFileCacheType.call(tabletIds, "normal", 60000L) sql """ select count(*) from customer_ttl """ sql """ ALTER TABLE customer_ttl SET ("file_cache_ttl_seconds"="3600") """ - sleep(80000) + sleep(160000) + waitForFileCacheType.call(tabletIds, "ttl", 60000L) getMetricsMethod.call() { respCode, body -> assertEquals("${respCode}".toString(), "200") @@ -127,6 +158,7 @@ suite("alter_ttl_3") { continue } def i = line.indexOf(' ') + logger.info("ttl_cache_size line after manual load: " + line) assertTrue(line.substring(i).toLong() > 0) flag1 = true } diff --git a/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_seconds.groovy b/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_seconds.groovy index 81d6894aeca823..fd5d72fb842ae7 100644 --- a/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_seconds.groovy +++ b/regression-test/suites/cloud_p0/cache/ttl/alter_ttl_seconds.groovy @@ -18,7 +18,21 @@ import org.codehaus.groovy.runtime.IOGroovyMethods suite("test_ttl_seconds") { - sql """ use @regression_cluster_name1 """ + def custoBeConfig = [ + enable_evict_file_cache_in_advance : false, + file_cache_enter_disk_resource_limit_mode_percent : 99, + file_cache_background_ttl_gc_interval_ms : 1000, + file_cache_background_ttl_info_update_interval_ms : 1000, + file_cache_background_tablet_id_flush_interval_ms : 1000 + ] + + setBeConfigTemporary(custoBeConfig) { + sql "set global enable_auto_analyze = false" + sql "set global enable_audit_plugin = false" + def clusters = sql " SHOW CLUSTERS; " + assertTrue(!clusters.isEmpty()) + def validCluster = clusters[0][0] + sql """use @${validCluster};"""; def ttlProperties = """ PROPERTIES("file_cache_ttl_seconds"="5") """ String[][] backends = sql """ show backends """ String backendId; @@ -47,40 +61,32 @@ suite("test_ttl_seconds") { } } - def tables = [customer_ttl: 15000000] - def s3BucketName = getS3BucketName() - def s3WithProperties = """WITH S3 ( - |"AWS_ACCESS_KEY" = "${getS3AK()}", - |"AWS_SECRET_KEY" = "${getS3SK()}", - 
|"AWS_ENDPOINT" = "${getS3Endpoint()}", - |"AWS_REGION" = "${getS3Region()}", - |"provider" = "${getS3Provider()}") - |PROPERTIES( - |"exec_mem_limit" = "8589934592", - |"load_parallelism" = "3")""".stripMargin() - - sql new File("""${context.file.parent}/../ddl/customer_ttl_delete.sql""").text def load_customer_once = { String table -> - def uniqueID = Math.abs(UUID.randomUUID().hashCode()).toString() sql (new File("""${context.file.parent}/../ddl/${table}.sql""").text + ttlProperties) - def loadLabel = table + "_" + uniqueID sql """ alter table ${table} set ("disable_auto_compaction" = "true") """ // no influence from compaction - // load data from cos - def loadSql = new File("""${context.file.parent}/../ddl/${table}_load.sql""").text.replaceAll("\\\$\\{s3BucketName\\}", s3BucketName) - loadSql = loadSql.replaceAll("\\\$\\{loadLabel\\}", loadLabel) + s3WithProperties - sql loadSql - - // check load state - while (true) { - def stateResult = sql "show load where Label = '${loadLabel}'" - def loadState = stateResult[stateResult.size() - 1][2].toString() - if ("CANCELLED".equalsIgnoreCase(loadState)) { - throw new IllegalStateException("load ${loadLabel} failed.") - } else if ("FINISHED".equalsIgnoreCase(loadState)) { - break + def totalRows = 200 + def batchSize = 100 + def commentSuffix = ' ' + ('X' * 50) + for (int offset = 0; offset < totalRows; offset += batchSize) { + def sb = new StringBuilder() + int batchEnd = Math.min(totalRows, offset + batchSize) + for (int idx = offset; idx < batchEnd; idx++) { + def customerId = 10001 + idx + def customerName = String.format('Customer#%09d', customerId) + sb.append("""INSERT INTO ${table} VALUES ( + ${customerId}, + '${customerName}', + 'Address Line 1', + 15, + '123-456-7890', + 12345.67, + 'AUTOMOBILE', + 'This is a test comment for the customer.${commentSuffix}' + ); + """) } - sleep(5000) + sql sb.toString() } } @@ -93,12 +99,47 @@ suite("test_ttl_seconds") { } } + def getTabletIds = { String tableName -> + def 
tablets = sql "show tablets from ${tableName}" + assertTrue(tablets.size() > 0, "No tablets found for table ${tableName}") + tablets.collect { it[0] as Long } + } + + def waitForFileCacheType = { List tabletIds, String expectedType, long timeoutMs = 60000L, long intervalMs = 1000L -> + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + boolean allMatch = true + for (Long tabletId in tabletIds) { + def rows = sql "select type from information_schema.file_cache_info where tablet_id = ${tabletId}" + if (rows.isEmpty()) { + logger.warn("file_cache_info is empty for tablet ${tabletId} while waiting for ${expectedType}") + allMatch = false + break + } + def mismatch = rows.find { row -> !row[0]?.toString()?.equalsIgnoreCase(expectedType) } + if (mismatch) { + logger.info("tablet ${tabletId} has cache types ${rows.collect { it[0] }} while waiting for ${expectedType}") + allMatch = false + break + } + } + if (allMatch) { + logger.info("All file cache entries for tablets ${tabletIds} are ${expectedType}") + return + } + sleep(intervalMs) + } + assertTrue(false, "Timeout waiting for file_cache_info type ${expectedType} for tablets ${tabletIds}") + } + clearFileCache.call() { respCode, body -> {} } sleep(30000) load_customer_once("customer_ttl") + def tabletIds = getTabletIds.call("customer_ttl") + waitForFileCacheType.call(tabletIds, "ttl", 15000L, 500L) sleep(30000) // 30s getMetricsMethod.call() { respCode, body -> @@ -113,10 +154,14 @@ suite("test_ttl_seconds") { continue } def i = line.indexOf(' ') + logger.info("ttl_cache_size after load: " + line) assertEquals(line.substring(i).toLong(), 0) flag1 = true } } assertTrue(flag1) } + + waitForFileCacheType.call(tabletIds, "normal", 60000L) + } } diff --git a/regression-test/suites/cloud_p0/cache/ttl/create_table_as_select.groovy b/regression-test/suites/cloud_p0/cache/ttl/create_table_as_select.groovy index bac2fe5eb08420..1a8db05b8146ea 100644 --- 
a/regression-test/suites/cloud_p0/cache/ttl/create_table_as_select.groovy +++ b/regression-test/suites/cloud_p0/cache/ttl/create_table_as_select.groovy @@ -20,7 +20,10 @@ import org.codehaus.groovy.runtime.IOGroovyMethods suite("create_table_as_select") { def custoBeConfig = [ enable_evict_file_cache_in_advance : false, - file_cache_enter_disk_resource_limit_mode_percent : 99 + file_cache_enter_disk_resource_limit_mode_percent : 99, + file_cache_background_ttl_gc_interval_ms : 1000, + file_cache_background_ttl_info_update_interval_ms : 1000, + file_cache_background_tablet_id_flush_interval_ms : 1000 ] setBeConfigTemporary(custoBeConfig) { diff --git a/regression-test/suites/cloud_p0/cache/ttl/create_table_like.groovy b/regression-test/suites/cloud_p0/cache/ttl/create_table_like.groovy index b6d8f505c905a6..3ca40024c9c820 100644 --- a/regression-test/suites/cloud_p0/cache/ttl/create_table_like.groovy +++ b/regression-test/suites/cloud_p0/cache/ttl/create_table_like.groovy @@ -20,7 +20,10 @@ import org.codehaus.groovy.runtime.IOGroovyMethods suite("create_table_like") { def custoBeConfig = [ enable_evict_file_cache_in_advance : false, - file_cache_enter_disk_resource_limit_mode_percent : 99 + file_cache_enter_disk_resource_limit_mode_percent : 99, + file_cache_background_ttl_gc_interval_ms : 1000, + file_cache_background_ttl_info_update_interval_ms : 1000, + file_cache_background_tablet_id_flush_interval_ms : 1000 ] setBeConfigTemporary(custoBeConfig) { diff --git a/regression-test/suites/cloud_p0/cache/ttl/test_ttl.groovy b/regression-test/suites/cloud_p0/cache/ttl/test_ttl.groovy index b4a668ef3f852e..09ee37aef07ba9 100644 --- a/regression-test/suites/cloud_p0/cache/ttl/test_ttl.groovy +++ b/regression-test/suites/cloud_p0/cache/ttl/test_ttl.groovy @@ -20,7 +20,10 @@ import org.codehaus.groovy.runtime.IOGroovyMethods suite("test_ttl") { def custoBeConfig = [ enable_evict_file_cache_in_advance : false, - file_cache_enter_disk_resource_limit_mode_percent : 99 + 
file_cache_enter_disk_resource_limit_mode_percent : 99, + file_cache_background_ttl_gc_interval_ms : 1000, + file_cache_background_ttl_info_update_interval_ms : 1000, + file_cache_background_tablet_id_flush_interval_ms : 1000 ] setBeConfigTemporary(custoBeConfig) { @@ -61,44 +64,33 @@ suite("test_ttl") { } } - def tables = [customer_ttl: 15000000] - def s3BucketName = getS3BucketName() - def s3WithProperties = """WITH S3 ( - |"AWS_ACCESS_KEY" = "${getS3AK()}", - |"AWS_SECRET_KEY" = "${getS3SK()}", - |"AWS_ENDPOINT" = "${getS3Endpoint()}", - |"AWS_REGION" = "${getS3Region()}", - |"provider" = "${getS3Provider()}") - |PROPERTIES( - |"exec_mem_limit" = "8589934592", - |"load_parallelism" = "3")""".stripMargin() - - sql new File("""${context.file.parent}/../ddl/customer_ttl_delete.sql""").text def load_customer_once = { String table -> try { - def uniqueID = Math.abs(UUID.randomUUID().hashCode()).toString() sql (new File("""${context.file.parent}/../ddl/${table}.sql""").text + ttlProperties) - def loadLabel = table + "_" + uniqueID sql """ alter table ${table} set ("disable_auto_compaction" = "true") """ // no influence from compaction - // load data from cos - def loadSql = new File("""${context.file.parent}/../ddl/${table}_load.sql""").text.replaceAll("\\\$\\{s3BucketName\\}", s3BucketName) - loadSql = loadSql.replaceAll("\\\$\\{loadLabel\\}", loadLabel) + s3WithProperties - sql loadSql - - // check load state - while (true) { - def stateResult = sql "show load where Label = '${loadLabel}'" - def loadState = stateResult[stateResult.size() - 1][2].toString() - if ("CANCELLED".equalsIgnoreCase(loadState)) { - logger.error("Data load failed for label: ${loadLabel}") - throw new IllegalStateException("load ${loadLabel} failed.") - } else if ("FINISHED".equalsIgnoreCase(loadState)) { - logger.info("Data load completed successfully for label: ${loadLabel}") - break + def totalRows = 200 + def batchSize = 100 + def commentSuffix = ' ' + ('X' * 50) + for (int offset = 0; 
offset < totalRows; offset += batchSize) { + def sb = new StringBuilder() + int batchEnd = Math.min(totalRows, offset + batchSize) + for (int idx = offset; idx < batchEnd; idx++) { + def customerId = 10001 + idx + def customerName = String.format('Customer#%09d', customerId) + sb.append("""INSERT INTO ${table} VALUES ( + ${customerId}, + '${customerName}', + 'Address Line 1', + 15, + '123-456-7890', + 12345.67, + 'AUTOMOBILE', + 'This is a test comment for the customer.${commentSuffix}' + ); + """) } - logger.info("Waiting for data load to complete. Current state: ${loadState}") - sleep(5000) + sql sb.toString() } } catch (Exception e) { logger.error("Failed to load customer data: ${e.message}") @@ -120,13 +112,49 @@ suite("test_ttl") { } } + def getTabletIds = { String tableName -> + def tablets = sql "show tablets from ${tableName}" + assertTrue(tablets.size() > 0, "No tablets found for table ${tableName}") + tablets.collect { it[0] as Long } + } + + def waitForFileCacheType = { List tabletIds, String expectedType, long timeoutMs = 60000L, long intervalMs = 2000L -> + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + boolean allMatch = true + for (Long tabletId in tabletIds) { + def rows = sql "select type from information_schema.file_cache_info where tablet_id = ${tabletId}" + if (rows.isEmpty()) { + logger.warn("file_cache_info is empty for tablet ${tabletId} while waiting for ${expectedType}") + allMatch = false + break + } + def mismatches = rows.findAll { row -> !row[0]?.toString()?.equalsIgnoreCase(expectedType) } + if (!mismatches.isEmpty()) { + logger.info("tablet ${tabletId} has cache types ${rows.collect { it[0] }} while waiting for ${expectedType}") + allMatch = false + break + } + } + if (allMatch) { + logger.info("All file cache entries for tablets ${tabletIds} are ${expectedType}") + return + } + sleep(intervalMs) + } + assertTrue(false, "Timeout waiting for file_cache_info type ${expectedType} for 
tablets ${tabletIds}") + } + clearFileCache.call() { respCode, body -> {} } sleep(10000) + def tabletIds = [] load_customer_once("customer_ttl") sleep(10000) + tabletIds = getTabletIds.call("customer_ttl") + waitForFileCacheType.call(tabletIds, "ttl") getMetricsMethod.call() { respCode, body -> assertEquals("${respCode}".toString(), "200") @@ -142,11 +170,12 @@ suite("test_ttl") { } def i = line.indexOf(' ') ttl_cache_size = line.substring(i).toLong() + logger.info("ttl_cache_size (initial) line: " + line) flag1 = true } } assertTrue(flag1) - assertTrue(ttl_cache_size > 838860800) + assertTrue(ttl_cache_size > 10737) } sleep(180000) getMetricsMethod.call() { @@ -162,11 +191,14 @@ suite("test_ttl") { continue } def i = line.indexOf(' ') + logger.info("ttl_cache_size line before assert zero: " + line) assertEquals(line.substring(i).toLong(), 0) flag1 = true } } assertTrue(flag1) } + + waitForFileCacheType.call(tabletIds, "normal") } } diff --git a/regression-test/suites/cloud_p0/cache/ttl/test_ttl_lru_evict.groovy b/regression-test/suites/cloud_p0/cache/ttl/test_ttl_lru_evict.groovy index 2b58a86a5afbe2..f452d071e04903 100644 --- a/regression-test/suites/cloud_p0/cache/ttl/test_ttl_lru_evict.groovy +++ b/regression-test/suites/cloud_p0/cache/ttl/test_ttl_lru_evict.groovy @@ -38,6 +38,17 @@ import org.apache.http.impl.client.LaxRedirectStrategy; // - set smaller max_ttl_cache_ratio in this test suite("test_ttl_lru_evict") { + def custoBeConfig = [ + enable_evict_file_cache_in_advance : false, + file_cache_enter_disk_resource_limit_mode_percent : 99, + file_cache_background_ttl_gc_interval_ms : 1000, + file_cache_background_ttl_info_update_interval_ms : 1000, + file_cache_background_tablet_id_flush_interval_ms : 1000 + ] + + setBeConfigTemporary(custoBeConfig) { + sql "set global enable_auto_analyze = false" + sql "set global enable_audit_plugin = false" def clusters = sql " SHOW CLUSTERS; " assertTrue(!clusters.isEmpty()) def validCluster = clusters[0][0] @@ -337,4 
+348,5 @@ suite("test_ttl_lru_evict") { } assertTrue(flag1) } + } } diff --git a/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_drop_force_table.groovy b/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_drop_force_table.groovy index 6a48706718c3ba..ff8753a857e4af 100644 --- a/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_drop_force_table.groovy +++ b/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_drop_force_table.groovy @@ -52,6 +52,26 @@ suite('test_clean_tablet_when_drop_force_table', 'docker') { "Expected to find log line with queue_size=0 in ${beLogPath}, but none matched.") log.info("found queue_size=0 log line: {}", queueZeroLine) } + + def waitForTabletCacheState = { Collection tabletIds, boolean expectPresent, long timeoutMs = 60000L, long intervalMs = 2000L -> + long start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < timeoutMs) { + boolean conditionMet = tabletIds.every { Long tabletId -> + def rows = sql "select tablet_id from information_schema.file_cache_info where tablet_id = ${tabletId}" + expectPresent ? 
!rows.isEmpty() : rows.isEmpty() + } + if (conditionMet) { + return + } + sleep(intervalMs) + } + def stillPresent = tabletIds.findAll { Long tabletId -> !(sql "select tablet_id from information_schema.file_cache_info where tablet_id = ${tabletId}").isEmpty() } + if (expectPresent) { + assertTrue(false, "Tablet cache info never appeared for tablet ids ${stillPresent}") + } else { + assertTrue(false, "Tablet cache info still exists for tablet ids ${stillPresent}") + } + } def testCase = { tableName, waitTime, useDp=false-> def ms = cluster.getAllMetaservices().get(0) @@ -143,6 +163,8 @@ suite('test_clean_tablet_when_drop_force_table', 'docker') { assertTrue(beforeGetFromBe.containsKey(it.Key)) assertEquals(beforeGetFromBe[it.Key], it.Value[1]) } + def tabletIds = beforeGetFromFe.keySet() + waitForTabletCacheState.call(tabletIds, true, 90000L) if (useDp) { GetDebugPoint().enableDebugPointForAllBEs("WorkPoolCloudDropTablet.drop_tablet_callback.failed") } @@ -206,6 +228,8 @@ suite('test_clean_tablet_when_drop_force_table', 'docker') { String beLogPath = cluster.getBeByIndex(1).getLogFilePath() checkBeLog(beLogPath) + + waitForTabletCacheState.call(tabletIds, false, 90000L) } docker(options) { From 5eed93cd93fdb3d195da0102f6631537a0f28e05 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Wed, 10 Dec 2025 19:29:36 +0800 Subject: [PATCH 08/20] [fix](filecache) fix clear directly rm meta dir Signed-off-by: zhengyu --- be/src/io/cache/fs_file_cache_storage.cpp | 1 + .../block_file_cache_test_meta_store.cpp | 81 +++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/be/src/io/cache/fs_file_cache_storage.cpp b/be/src/io/cache/fs_file_cache_storage.cpp index b2055b3640e7f9..c42560075d46b7 100644 --- a/be/src/io/cache/fs_file_cache_storage.cpp +++ b/be/src/io/cache/fs_file_cache_storage.cpp @@ -956,6 +956,7 @@ Status FSFileCacheStorage::clear(std::string& msg) { auto t0 = std::chrono::steady_clock::now(); for (; key_it != std::filesystem::directory_iterator(); ++key_it) 
{ if (!key_it->is_directory()) continue; // all file cache data is in sub-directories + if (key_it->path().filename().native() == "meta") continue; ++total; std::string cache_key = key_it->path().string(); auto st = global_local_filesystem()->delete_directory(cache_key); diff --git a/be/test/io/cache/block_file_cache_test_meta_store.cpp b/be/test/io/cache/block_file_cache_test_meta_store.cpp index 3fc652b466e7be..d057f57cfecaf9 100644 --- a/be/test/io/cache/block_file_cache_test_meta_store.cpp +++ b/be/test/io/cache/block_file_cache_test_meta_store.cpp @@ -414,6 +414,87 @@ TEST_F(BlockFileCacheTest, version3_add_remove_restart) { } } +TEST_F(BlockFileCacheTest, clear_retains_meta_directory_and_clears_meta_entries) { + config::enable_evict_file_cache_in_advance = false; + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + + io::FileCacheSettings settings; + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 5000000; + settings.query_queue_elements = 50000; + settings.index_queue_size = 5000000; + settings.index_queue_elements = 50000; + settings.disposable_queue_size = 5000000; + settings.disposable_queue_elements = 50000; + settings.capacity = 20000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + for (int i = 0; i < 100; i++) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + ASSERT_TRUE(cache.get_async_open_success()); + + io::CacheContext context; + ReadStatistics rstats; + context.stats = &rstats; + context.cache_type = io::FileCacheType::NORMAL; + context.query_id.hi = 1; + context.query_id.lo = 2; + context.tablet_id = 314; + auto key = io::BlockFileCache::hash("meta_clear_key"); + + auto holder = cache.get_or_set(key, 0, 100000, context); + 
auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + assert_range(1, blocks[0], io::FileBlock::Range(0, 99999), io::FileBlock::State::EMPTY); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0]); + assert_range(2, blocks[0], io::FileBlock::Range(0, 99999), io::FileBlock::State::DOWNLOADED); + blocks.clear(); + + auto* fs_storage = dynamic_cast(cache._storage.get()); + ASSERT_NE(fs_storage, nullptr) << "Expected FSFileCacheStorage but got different storage type"; + auto& meta_store = fs_storage->_meta_store; + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + verify_meta_key(*meta_store, context.tablet_id, "meta_clear_key", 0, FileCacheType::NORMAL, 0, + 100000); + + cache.clear_file_cache_directly(); + + std::string meta_dir = cache.get_base_path() + "/meta"; + ASSERT_TRUE(fs::exists(meta_dir)); + ASSERT_TRUE(fs::is_directory(meta_dir)); + + BlockMetaKey mkey(context.tablet_id, key, 0); + auto meta = meta_store->get(mkey); + ASSERT_FALSE(meta.has_value()); + + auto iterator = meta_store->get_all(); + if (iterator != nullptr) { + bool has_entry = false; + for (; iterator->valid(); iterator->next()) { + has_entry = true; + break; + } + ASSERT_FALSE(has_entry) << "Meta store still contains entries after clearing cache"; + } + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + //TODO(zhengyu): check lazy load //TODO(zhengyu): check version2 start //TODO(zhengyu): check version2 version3 mixed start From c74d2ec3821fe94ddd6a1a4f31af8de70cf41875 Mon Sep 17 00:00:00 2001 From: freemandealer Date: Wed, 10 Dec 2025 22:17:55 +0800 Subject: [PATCH 09/20] fix beut Signed-off-by: freemandealer --- be/test/io/cache/block_file_cache_test_common.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/be/test/io/cache/block_file_cache_test_common.h b/be/test/io/cache/block_file_cache_test_common.h index 920461a1551160..f45614cf8a4880 100644 --- 
a/be/test/io/cache/block_file_cache_test_common.h +++ b/be/test/io/cache/block_file_cache_test_common.h @@ -98,6 +98,8 @@ class BlockFileCacheTest : public testing::Test { public: static void SetUpTestSuite() { config::file_cache_enter_disk_resource_limit_mode_percent = 99; + config::file_cache_background_ttl_gc_interval_ms = 2000; + config::file_cache_background_ttl_info_update_interval_ms = 2000; config::enable_evict_file_cache_in_advance = false; // disable evict in // advance for most // cases for simple From 6b3773cf3ca29d5e3dd55bf6c9d0cf60ec2206b9 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Wed, 10 Dec 2025 22:25:50 +0800 Subject: [PATCH 10/20] rm invalid beut Signed-off-by: zhengyu --- be/test/io/cache/block_file_cache_test.cpp | 149 --------------------- 1 file changed, 149 deletions(-) diff --git a/be/test/io/cache/block_file_cache_test.cpp b/be/test/io/cache/block_file_cache_test.cpp index c3a92445c72780..9917f131338bb7 100644 --- a/be/test/io/cache/block_file_cache_test.cpp +++ b/be/test/io/cache/block_file_cache_test.cpp @@ -5301,155 +5301,6 @@ TEST_F(BlockFileCacheTest, file_cache_path_storage_parse) { } } -//TODO(zhengyu): should be compatible with version3 format -TEST_F(BlockFileCacheTest, DISABLE_check_file_cache_consistency) { - GTEST_SKIP(); - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } - fs::create_directories(cache_base_path); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 30; - settings.query_queue_elements = 5; - settings.index_queue_size = 30; - settings.index_queue_elements = 5; - settings.disposable_queue_size = 30; - settings.disposable_queue_elements = 5; - settings.capacity = 90; - settings.max_file_block_size = 30; - settings.max_query_cache_size = 30; - auto key1 = io::BlockFileCache::hash("key1"); - auto key2 = io::BlockFileCache::hash("key2"); - - io::BlockFileCache mgr(cache_base_path, settings); - 
ASSERT_TRUE(mgr.initialize()); - for (int i = 0; i < 100; i++) { - if (mgr.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - io::CacheContext cache_context; - ReadStatistics rstats; - cache_context.stats = &rstats; - cache_context.cache_type = io::FileCacheType::TTL; - cache_context.query_id = query_id; - cache_context.expiration_time = 0; - { - cache_context.cache_type = io::FileCacheType::NORMAL; - auto holder = mgr.get_or_set(key1, 0, 9, cache_context); - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(0, 8), io::FileBlock::State::EMPTY); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - assert_range(2, blocks[0], io::FileBlock::Range(0, 8), io::FileBlock::State::DOWNLOADING); - download(blocks[0]); - std::vector result; - Status status = mgr.report_file_cache_inconsistency(result); - ASSERT_TRUE(result.empty()); - } - - { - auto holder = mgr.get_or_set(key1, 10, 9, cache_context); - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(10, 18), io::FileBlock::State::EMPTY); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - assert_range(2, blocks[0], io::FileBlock::Range(10, 18), io::FileBlock::State::DOWNLOADING); - download(blocks[0]); - mgr._files[key1].erase(10); - } - - { - auto holder = mgr.get_or_set(key1, 20, 9, cache_context); - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(20, 28), io::FileBlock::State::EMPTY); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - assert_range(2, blocks[0], io::FileBlock::Range(20, 28), io::FileBlock::State::DOWNLOADING); - download(blocks[0]); - auto* fs_file_cache_storage = dynamic_cast(mgr._storage.get()); - std::string dir_path = 
fs_file_cache_storage->get_path_in_local_cache_v2(key1, 0); - fs::path block_file_path = std::filesystem::path(dir_path) / "20"; - fs::remove(block_file_path); - } - - { - auto holder = mgr.get_or_set(key1, 30, 9, cache_context); - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(30, 38), io::FileBlock::State::EMPTY); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - assert_range(2, blocks[0], io::FileBlock::Range(30, 38), io::FileBlock::State::DOWNLOADING); - download(blocks[0]); - auto* fs_file_cache_storage = dynamic_cast(mgr._storage.get()); - std::string dir_path = fs_file_cache_storage->get_path_in_local_cache_v2(key1, 0); - fs::path block_file_path = std::filesystem::path(dir_path) / "30"; - std::string data = "This is a test message."; - std::ofstream out_file(block_file_path, std::ios::out | std::ios::app); - out_file << data; - out_file.close(); - } - - { - auto holder = mgr.get_or_set(key1, 40, 9, cache_context); - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(40, 48), io::FileBlock::State::EMPTY); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - assert_range(2, blocks[0], io::FileBlock::Range(40, 48), io::FileBlock::State::DOWNLOADING); - download(blocks[0]); - blocks[0]->_key.meta.type = io::FileCacheType::INDEX; - } - - int64_t expiration_time = 120; - { - cache_context.cache_type = FileCacheType::TTL; - cache_context.expiration_time = expiration_time; - auto holder = mgr.get_or_set(key2, 0, 9, cache_context); - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(0, 8), io::FileBlock::State::EMPTY); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - assert_range(2, blocks[0], io::FileBlock::Range(0, 8), io::FileBlock::State::DOWNLOADING); - 
download(blocks[0]); - blocks[0]->_key.meta.expiration_time = 0; - } - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - std::vector results; - Status status = mgr.report_file_cache_inconsistency(results); - std::unordered_set expected_results = { - "File cache info in manager:\nHash: 62434304659ae12df53386481113dfe1\nExpiration Time: " - "0\nOffset: 0\nCache Type: ttl\nFile cache info in storage:\nHash: " - "62434304659ae12df53386481113dfe1\nExpiration Time: " + - std::to_string(expiration_time) + - "\nOffset: 0\nCache Type: " - "ttl\nInconsistency Reason: EXPIRATION_TIME_INCONSISTENT \n\n", - "File cache info in manager:\nHash: f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: " - "0\nOffset: 30\nCache Type: normal\nFile cache info in storage:\nHash: " - "f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: 0\nOffset: 30\nCache Type: " - "normal\nInconsistency Reason: SIZE_INCONSISTENT \n\n", - "File cache info in manager:\nHash: f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: " - "0\nOffset: 40\nCache Type: index\nFile cache info in storage:\nHash: " - "f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: 0\nOffset: 40\nCache Type: " - "normal\nInconsistency Reason: CACHE_TYPE_INCONSISTENT \n\n", - "File cache info in manager:\nHash: 00000000000000000000000000000000\nExpiration Time: " - "0\nOffset: 0\nCache Type: normal\nFile cache info in storage:\nHash: " - "f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: 0\nOffset: 10\nCache Type: " - "normal\nInconsistency Reason: NOT_LOADED \n\n", - "File cache info in manager:\nHash: f36131fb4ba563c17e727cd0cdd63689\nExpiration Time: " - "0\nOffset: 20\nCache Type: normal\nFile cache info in storage:\nHash: " - "00000000000000000000000000000000\nExpiration Time: 0\nOffset: 0\nCache Type: " - "normal\nInconsistency Reason: MISSING_IN_STORAGE \n\n"}; - ASSERT_EQ(results.size(), expected_results.size()); - for (const auto& result : results) { - ASSERT_TRUE(expected_results.contains(result)); - } -} - 
TEST_F(BlockFileCacheTest, populate_empty_cache_with_disposable) { if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); From 3d2af232a51d4bcf2942193bb39e61cb508ba73d Mon Sep 17 00:00:00 2001 From: freemandealer Date: Thu, 18 Dec 2025 23:06:49 +0800 Subject: [PATCH 11/20] fix observation Signed-off-by: freemandealer --- .../apache/doris/planner/BackendPartitionedSchemaScanNode.java | 1 + 1 file changed, 1 insertion(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/BackendPartitionedSchemaScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/BackendPartitionedSchemaScanNode.java index 042dc5e4851aaa..f7610480efa587 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/BackendPartitionedSchemaScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/BackendPartitionedSchemaScanNode.java @@ -70,6 +70,7 @@ public class BackendPartitionedSchemaScanNode extends SchemaScanNode { BEACKEND_ID_COLUMN_SET.add("be_id"); BACKEND_TABLE.add("file_cache_statistics"); + BACKEND_TABLE.add("file_cache_info"); BACKEND_TABLE.add("backend_kerberos_ticket_cache"); BACKEND_TABLE.add("backend_tablets"); From 68090281a19041c819957782db94305065c35fa2 Mon Sep 17 00:00:00 2001 From: freemandealer Date: Tue, 16 Dec 2025 23:06:51 +0800 Subject: [PATCH 12/20] fix beuts Signed-off-by: freemandealer --- be/src/cloud/cloud_tablet.cpp | 13 - be/src/io/cache/block_file_cache.cpp | 15 - be/src/io/cache/block_file_cache.h | 3 +- be/src/io/cache/file_block.cpp | 13 - be/src/io/cache/file_block.h | 2 - be/src/io/cache/file_cache_storage.h | 2 - be/src/io/cache/fs_file_cache_storage.cpp | 20 - be/src/io/cache/fs_file_cache_storage.h | 2 - be/src/io/cache/mem_file_cache_storage.cpp | 7 - be/src/io/cache/mem_file_cache_storage.h | 2 - be/test/io/cache/block_file_cache_test.cpp | 543 +----------------- .../block_file_cache_test_meta_store.cpp | 11 +- 12 files changed, 13 insertions(+), 620 deletions(-) diff --git a/be/src/cloud/cloud_tablet.cpp 
b/be/src/cloud/cloud_tablet.cpp index 05950e1a3a2d61..b1c422127e181b 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -1525,19 +1525,6 @@ Status CloudTablet::sync_meta() { return st; } - auto new_ttl_seconds = tablet_meta->ttl_seconds(); - if (_tablet_meta->ttl_seconds() != new_ttl_seconds) { - _tablet_meta->set_ttl_seconds(new_ttl_seconds); - std::shared_lock rlock(_meta_lock); - for (auto& [_, rs] : _rs_version_map) { - for (int seg_id = 0; seg_id < rs->num_segments(); ++seg_id) { - auto file_key = Segment::file_cache_key(rs->rowset_id().to_string(), seg_id); - auto* file_cache = io::FileCacheFactory::instance()->get_by_path(file_key); - file_cache->modify_expiration_time(file_key, new_ttl_seconds); - } - } - } - auto new_compaction_policy = tablet_meta->compaction_policy(); if (_tablet_meta->compaction_policy() != new_compaction_policy) { _tablet_meta->set_compaction_policy(new_compaction_policy); diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index 4a9364b7290ff2..6baab442ab3269 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -2158,21 +2158,6 @@ std::map BlockFileCache::get_blocks_by_key(const UInt128W return offset_to_block; } -void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, uint64_t expiration_time) { - SCOPED_CACHE_LOCK(_mutex, this); - if (auto iter = _files.find(hash); iter != _files.end()) { - for (auto& [_, cell] : iter->second) { - if (cell.file_block) { - auto st = cell.file_block->update_expiration_time(expiration_time); - if (!st.ok()) { - LOG(WARNING) << "Failed to update expiration time for block " - << cell.file_block->get_info_for_log() << ", error=" << st; - } - } - } - } -} - void BlockFileCache::update_ttl_atime(const UInt128Wrapper& hash) { SCOPED_CACHE_LOCK(_mutex, this); if (auto iter = _files.find(hash); iter != _files.end()) { diff --git a/be/src/io/cache/block_file_cache.h 
b/be/src/io/cache/block_file_cache.h index 6ce37a4794ce4c..cfa37a44919728 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -250,8 +250,7 @@ class BlockFileCache { std::string reset_capacity(size_t new_capacity); std::map get_blocks_by_key(const UInt128Wrapper& hash); - /// Adjust expiration time for every block sharing the specified hash key. - void modify_expiration_time(const UInt128Wrapper& hash, uint64_t expiration_time); + /// For debug and UT std::string dump_structure(const UInt128Wrapper& hash); std::string dump_single_cache_type(const UInt128Wrapper& hash, size_t offset); diff --git a/be/src/io/cache/file_block.cpp b/be/src/io/cache/file_block.cpp index fd5c3a478f1dae..299888ab2e1c3c 100644 --- a/be/src/io/cache/file_block.cpp +++ b/be/src/io/cache/file_block.cpp @@ -187,19 +187,6 @@ Status FileBlock::change_cache_type_lock(FileCacheType new_type, return Status::OK(); } -Status FileBlock::update_expiration_time(uint64_t expiration_time) { - std::lock_guard block_lock(_mutex); - if (_download_state == State::DOWNLOADED) { - auto st = _mgr->_storage->change_key_meta_expiration(_key, expiration_time, - _block_range.size()); - if (!st.ok()) { - return st; - } - } - _key.meta.expiration_time = expiration_time; - return Status::OK(); -} - FileBlock::State FileBlock::wait() { std::unique_lock block_lock(_mutex); diff --git a/be/src/io/cache/file_block.h b/be/src/io/cache/file_block.h index 9ccb9a958707f2..bfe9a69ac9f5e7 100644 --- a/be/src/io/cache/file_block.h +++ b/be/src/io/cache/file_block.h @@ -127,8 +127,6 @@ class FileBlock { [[nodiscard]] Status change_cache_type_lock(FileCacheType new_type, std::lock_guard&); - [[nodiscard]] Status update_expiration_time(uint64_t expiration_time); - uint64_t expiration_time() const { return _key.meta.expiration_time; } std::string get_cache_file() const; diff --git a/be/src/io/cache/file_cache_storage.h b/be/src/io/cache/file_cache_storage.h index 226001c7109afe..39b02315d59bf8 
100644 --- a/be/src/io/cache/file_cache_storage.h +++ b/be/src/io/cache/file_cache_storage.h @@ -58,8 +58,6 @@ class FileCacheStorage { // change the block meta virtual Status change_key_meta_type(const FileCacheKey& key, const FileCacheType type, const size_t size) = 0; - virtual Status change_key_meta_expiration(const FileCacheKey& key, const uint64_t expiration, - const size_t size) = 0; // use when lazy load cache virtual void load_blocks_directly_unlocked(BlockFileCache* _mgr, const FileCacheKey& key, std::lock_guard& cache_lock) {} diff --git a/be/src/io/cache/fs_file_cache_storage.cpp b/be/src/io/cache/fs_file_cache_storage.cpp index c42560075d46b7..b1fc8cdefc56be 100644 --- a/be/src/io/cache/fs_file_cache_storage.cpp +++ b/be/src/io/cache/fs_file_cache_storage.cpp @@ -272,17 +272,6 @@ Status FSFileCacheStorage::change_key_meta_type(const FileCacheKey& key, const F return Status::OK(); } -Status FSFileCacheStorage::change_key_meta_expiration(const FileCacheKey& key, - const uint64_t expiration, - const size_t size) { - if (key.meta.expiration_time != expiration) { - BlockMetaKey mkey(key.meta.tablet_id, UInt128Wrapper(key.hash), key.offset); - BlockMeta meta(key.meta.type, size, expiration); - _meta_store->put(mkey, meta); - } - return Status::OK(); -} - std::string FSFileCacheStorage::get_path_in_local_cache_v3(const std::string& dir, size_t offset, bool is_tmp) { if (is_tmp) { @@ -772,7 +761,6 @@ Status FSFileCacheStorage::get_file_cache_infos(std::vector& info } void FSFileCacheStorage::load_cache_info_into_memory_from_db(BlockFileCache* _mgr) const { - TEST_SYNC_POINT_CALLBACK("BlockFileCache::TmpFile1"); int scan_length = 10000; std::vector batch_load_buffer; batch_load_buffer.reserve(scan_length); @@ -786,14 +774,6 @@ void FSFileCacheStorage::load_cache_info_into_memory_from_db(BlockFileCache* _mg if (block->tablet_id() == 0) { block->set_tablet_id(args.ctx.tablet_id); } - if (block->cache_type() == io::FileCacheType::TTL && - block->expiration_time() 
!= args.ctx.expiration_time) { - auto s = block->update_expiration_time(args.ctx.expiration_time); - if (!s.ok()) { - LOG(WARNING) << "update expiration time for " << args.hash.to_string() - << " offset=" << args.offset; - } - } return; } _mgr->add_cell(args.hash, args.ctx, args.offset, args.size, diff --git a/be/src/io/cache/fs_file_cache_storage.h b/be/src/io/cache/fs_file_cache_storage.h index 4a45726186c1ca..ea5695438c6090 100644 --- a/be/src/io/cache/fs_file_cache_storage.h +++ b/be/src/io/cache/fs_file_cache_storage.h @@ -68,8 +68,6 @@ class FSFileCacheStorage : public FileCacheStorage { Status remove(const FileCacheKey& key) override; Status change_key_meta_type(const FileCacheKey& key, const FileCacheType type, const size_t size) override; - Status change_key_meta_expiration(const FileCacheKey& key, const uint64_t expiration, - const size_t size) override; void load_blocks_directly_unlocked(BlockFileCache* _mgr, const FileCacheKey& key, std::lock_guard& cache_lock) override; Status clear(std::string& msg) override; diff --git a/be/src/io/cache/mem_file_cache_storage.cpp b/be/src/io/cache/mem_file_cache_storage.cpp index b9af32a22acda4..01548fa50baa02 100644 --- a/be/src/io/cache/mem_file_cache_storage.cpp +++ b/be/src/io/cache/mem_file_cache_storage.cpp @@ -110,13 +110,6 @@ Status MemFileCacheStorage::change_key_meta_type(const FileCacheKey& key, const return Status::OK(); } -Status MemFileCacheStorage::change_key_meta_expiration(const FileCacheKey& key, - const uint64_t expiration, - const size_t size) { - // do nothing for in memory cache coz nothing to persist - return Status::OK(); -} - void MemFileCacheStorage::load_blocks_directly_unlocked(BlockFileCache* _mgr, const FileCacheKey& key, std::lock_guard& cache_lock) { diff --git a/be/src/io/cache/mem_file_cache_storage.h b/be/src/io/cache/mem_file_cache_storage.h index 8773e629e03c1d..a1c77d03828ffb 100644 --- a/be/src/io/cache/mem_file_cache_storage.h +++ b/be/src/io/cache/mem_file_cache_storage.h @@ 
-41,8 +41,6 @@ class MemFileCacheStorage : public FileCacheStorage { Status remove(const FileCacheKey& key) override; Status change_key_meta_type(const FileCacheKey& key, const FileCacheType type, const size_t size) override; - Status change_key_meta_expiration(const FileCacheKey& key, const uint64_t expiration, - const size_t size) override; void load_blocks_directly_unlocked(BlockFileCache* _mgr, const FileCacheKey& key, std::lock_guard& cache_lock) override; Status clear(std::string& msg) override; diff --git a/be/test/io/cache/block_file_cache_test.cpp b/be/test/io/cache/block_file_cache_test.cpp index 9917f131338bb7..343a3ea07fb9e3 100644 --- a/be/test/io/cache/block_file_cache_test.cpp +++ b/be/test/io/cache/block_file_cache_test.cpp @@ -1882,6 +1882,7 @@ TEST_F(BlockFileCacheTest, run_in_resource_limit_mode) { } } +#if 0 // load from meta store won't trigger this test TEST_F(BlockFileCacheTest, fix_tmp_file) { if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); @@ -1942,6 +1943,7 @@ TEST_F(BlockFileCacheTest, fix_tmp_file) { fs::remove_all(cache_base_path); } } +#endif TEST_F(BlockFileCacheTest, test_async_load) { if (fs::exists(cache_base_path)) { @@ -2101,7 +2103,6 @@ TEST_F(BlockFileCacheTest, ttl_normal) { context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; context.expiration_time = 120; - int64_t modify_time = 5; auto key1 = io::BlockFileCache::hash("key5"); auto key2 = io::BlockFileCache::hash("key6"); io::BlockFileCache cache(cache_base_path, settings); @@ -2134,7 +2135,7 @@ TEST_F(BlockFileCacheTest, ttl_normal) { } { context.cache_type = io::FileCacheType::INDEX; - context.expiration_time = 0; + context.expiration_time = 100; auto holder = cache.get_or_set(key2, 60, 10, context); /// Add range [60, 69] auto blocks = fromHolder(holder); ASSERT_EQ(blocks.size(), 1); @@ -2144,27 +2145,6 @@ TEST_F(BlockFileCacheTest, ttl_normal) { assert_range(1, blocks[0], io::FileBlock::Range(60, 69), 
io::FileBlock::State::DOWNLOADED); EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::TTL); } - { - cache.modify_expiration_time(key2, modify_time); - context.expiration_time = modify_time; - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = fromHolder(holder); - EXPECT_EQ(blocks[0]->expiration_time(), modify_time); - } - std::this_thread::sleep_for(std::chrono::seconds(10)); - { - context.cache_type = io::FileCacheType::INDEX; - context.expiration_time = 0; - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::NORMAL); - EXPECT_EQ(blocks[0]->expiration_time(), 0); - std::string buffer(10, '1'); - ASSERT_TRUE(blocks[0]->read(Slice(buffer.data(), 10), 0).ok()); - EXPECT_EQ(buffer, std::string(10, '0')); - } if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); } @@ -2199,7 +2179,6 @@ TEST_F(BlockFileCacheTest, ttl_modify) { context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; context.expiration_time = 120; - int64_t modify_time = 5; auto key1 = io::BlockFileCache::hash("key5"); auto key2 = io::BlockFileCache::hash("key6"); io::BlockFileCache cache(cache_base_path, settings); @@ -2230,27 +2209,6 @@ TEST_F(BlockFileCacheTest, ttl_modify) { assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::TTL); } - cache.modify_expiration_time(key2, 0); - { - context.cache_type = io::FileCacheType::INDEX; - context.expiration_time = 0; - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); 
- EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::NORMAL); - EXPECT_EQ(blocks[0]->expiration_time(), 0); - std::string buffer(10, '1'); - EXPECT_TRUE(blocks[0]->read(Slice(buffer.data(), 10), 0).ok()); - EXPECT_EQ(buffer, std::string(10, '0')); - } - { - cache.modify_expiration_time(key2, modify_time); - context.expiration_time = modify_time; - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = fromHolder(holder); - EXPECT_EQ(blocks[0]->expiration_time(), modify_time); - } if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); } @@ -2280,7 +2238,6 @@ TEST_F(BlockFileCacheTest, ttl_modify_memory_storage) { context.cache_type = io::FileCacheType::TTL; context.query_id = query_id; context.expiration_time = 120; - int64_t modify_time = 5; auto key1 = io::BlockFileCache::hash("key5"); auto key2 = io::BlockFileCache::hash("key6"); io::BlockFileCache cache(cache_base_path, settings); @@ -2311,276 +2268,11 @@ TEST_F(BlockFileCacheTest, ttl_modify_memory_storage) { assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::TTL); } - cache.modify_expiration_time(key2, 0); - { - context.cache_type = io::FileCacheType::INDEX; - context.expiration_time = 0; - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::NORMAL); - EXPECT_EQ(blocks[0]->expiration_time(), 0); - std::string buffer(10, '1'); - EXPECT_TRUE(blocks[0]->read(Slice(buffer.data(), 10), 0).ok()); - EXPECT_EQ(buffer, std::string(10, '0')); - } - { - cache.modify_expiration_time(key2, modify_time); - context.expiration_time = modify_time; - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = 
fromHolder(holder); - EXPECT_EQ(blocks[0]->expiration_time(), modify_time); - } -} - -TEST_F(BlockFileCacheTest, ttl_change_to_normal) { - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } - fs::create_directories(cache_base_path); - test_file_cache(io::FileCacheType::NORMAL); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 30; - settings.query_queue_elements = 5; - settings.ttl_queue_size = 30; - settings.ttl_queue_elements = 5; - settings.capacity = 60; - settings.max_file_block_size = 30; - settings.max_query_cache_size = 30; - io::CacheContext context; - ReadStatistics rstats; - context.stats = &rstats; - context.cache_type = io::FileCacheType::TTL; - context.query_id = query_id; - context.expiration_time = 180; - auto key2 = io::BlockFileCache::hash("key2"); - io::BlockFileCache cache(cache_base_path, settings); - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - { - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::EMPTY); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(blocks[0]); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::TTL); - } - { - context.cache_type = io::FileCacheType::NORMAL; - context.expiration_time = 0; - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - if (auto storage = dynamic_cast(cache._storage.get()); - storage != nullptr) { - std::string dir = storage->get_path_in_local_cache_v2(key2, 0); - EXPECT_TRUE(fs::exists( - 
storage->get_path_in_local_cache_v2(dir, 50, io::FileCacheType::NORMAL))); - } - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::NORMAL); - EXPECT_EQ(blocks[0]->expiration_time(), 0); - std::string buffer(10, '1'); - EXPECT_TRUE(blocks[0]->read(Slice(buffer.data(), 10), 0).ok()); - EXPECT_EQ(buffer, std::string(10, '0')); - } - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } -} - -TEST_F(BlockFileCacheTest, ttl_change_to_normal_memory_storage) { - test_file_cache_memory_storage(io::FileCacheType::NORMAL); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 30; - settings.query_queue_elements = 5; - settings.capacity = 30; - settings.max_file_block_size = 30; - settings.max_query_cache_size = 30; - settings.storage = "memory"; - io::CacheContext context; - ReadStatistics rstats; - context.stats = &rstats; - context.cache_type = io::FileCacheType::TTL; - context.query_id = query_id; - context.expiration_time = 180; - auto key2 = io::BlockFileCache::hash("key2"); - io::BlockFileCache cache(cache_base_path, settings); - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - { - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::EMPTY); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download_into_memory(blocks[0]); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::TTL); 
- } - { - context.cache_type = io::FileCacheType::NORMAL; - context.expiration_time = 0; - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::NORMAL); - EXPECT_EQ(blocks[0]->expiration_time(), 0); - std::string buffer(10, '1'); - EXPECT_TRUE(blocks[0]->read(Slice(buffer.data(), 10), 0).ok()); - EXPECT_EQ(buffer, std::string(10, '0')); - } -} - -TEST_F(BlockFileCacheTest, ttl_change_expiration_time) { - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } - fs::create_directories(cache_base_path); - test_file_cache(io::FileCacheType::NORMAL); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 30; - settings.query_queue_elements = 5; - settings.ttl_queue_size = 30; - settings.ttl_queue_elements = 5; - settings.capacity = 60; - settings.max_file_block_size = 30; - settings.max_query_cache_size = 30; - io::CacheContext context; - ReadStatistics rstats; - context.stats = &rstats; - context.cache_type = io::FileCacheType::TTL; - context.query_id = query_id; - context.expiration_time = 180; - int64_t change_time = 120; - auto key2 = io::BlockFileCache::hash("key2"); - io::BlockFileCache cache(cache_base_path, settings); - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - { - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - // std::cout << "current cache size:" << cache.get_used_cache_size() << std::endl; - std::cout << "cache capacity:" << cache.capacity() << std::endl; - auto map = cache.get_stats_unsafe(); 
- for (auto& [key, value] : map) { - std::cout << key << " : " << value << std::endl; - } - auto key1 = io::BlockFileCache::hash("key1"); - std::cout << cache.dump_structure(key1) << std::endl; - std::cout << cache.dump_structure(key2) << std::endl; - - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::EMPTY); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(blocks[0]); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::TTL); - } - { - context.cache_type = io::FileCacheType::TTL; - context.expiration_time = change_time; - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - if (auto storage = dynamic_cast(cache._storage.get()); - storage != nullptr) { - std::string dir = storage->get_path_in_local_cache_v3(key2); - EXPECT_TRUE(fs::exists(storage->get_path_in_local_cache_v3(dir, 50))); - } - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::TTL); - EXPECT_EQ(blocks[0]->expiration_time(), change_time); - std::string buffer(10, '1'); - EXPECT_TRUE(blocks[0]->read(Slice(buffer.data(), 10), 0).ok()); - EXPECT_EQ(buffer, std::string(10, '0')); - } if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); } } -TEST_F(BlockFileCacheTest, ttl_change_expiration_time_memory_storage) { - test_file_cache_memory_storage(io::FileCacheType::NORMAL); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 30; - settings.query_queue_elements = 5; - settings.capacity = 30; - settings.max_file_block_size = 30; - settings.max_query_cache_size = 30; - settings.storage = "memory"; - io::CacheContext context; - ReadStatistics rstats; - context.stats = 
&rstats; - context.cache_type = io::FileCacheType::TTL; - context.query_id = query_id; - context.expiration_time = 180; - int64_t change_time = 120; - auto key2 = io::BlockFileCache::hash("key2"); - io::BlockFileCache cache(cache_base_path, settings); - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - { - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::EMPTY); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download_into_memory(blocks[0]); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::TTL); - } - { - context.cache_type = io::FileCacheType::TTL; - context.expiration_time = change_time; - auto holder = cache.get_or_set(key2, 50, 10, context); /// Add range [50, 59] - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(50, 59), io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(blocks[0]->cache_type(), io::FileCacheType::TTL); - EXPECT_EQ(blocks[0]->expiration_time(), change_time); - std::string buffer(10, '1'); - EXPECT_TRUE(blocks[0]->read(Slice(buffer.data(), 10), 0).ok()); - EXPECT_EQ(buffer, std::string(10, '0')); - } -} - TEST_F(BlockFileCacheTest, io_error) { if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); @@ -2799,69 +2491,6 @@ TEST_F(BlockFileCacheTest, remove_directly_when_normal_change_to_ttl) { } } -TEST_F(BlockFileCacheTest, ttl_gc) { - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } - fs::create_directories(cache_base_path); - auto sp = SyncPoint::get_instance(); - SyncPoint::CallbackGuard guard1; - 
sp->enable_processing(); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 50; - settings.query_queue_elements = 5; - settings.ttl_queue_size = 500; - settings.ttl_queue_elements = 500; - settings.capacity = 100; - settings.max_file_block_size = 30; - settings.max_query_cache_size = 30; - - config::file_cache_background_ttl_gc_batch = 6; - config::file_cache_background_ttl_gc_interval_ms = - 3000; // make it big enough to disable auto ttl_gc - - io::BlockFileCache cache(cache_base_path, settings); - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - - io::CacheContext context; - ReadStatistics rstats; - context.stats = &rstats; - context.cache_type = io::FileCacheType::TTL; - context.query_id = query_id; - context.expiration_time = 2; - - for (int64_t i = 0; i < 12; ++i) { - auto key = io::BlockFileCache::hash(fmt::format("key{}", i)); - auto holder = cache.get_or_set(key, 0, 5, context); - auto blocks = fromHolder(holder); - ASSERT_EQ(blocks.size(), 1); - assert_range(1, blocks[0], io::FileBlock::Range(0, 4), io::FileBlock::State::EMPTY); - ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(blocks[0]); - assert_range(1, blocks[0], io::FileBlock::Range(0, 4), io::FileBlock::State::DOWNLOADED); - } - ASSERT_EQ(cache._time_to_key.size(), 12); - - std::this_thread::sleep_for(std::chrono::milliseconds(3000)); - ASSERT_GT(cache._time_to_key.size(), 0); - - std::this_thread::sleep_for(std::chrono::milliseconds(3000)); - ASSERT_EQ(cache._time_to_key.size(), 0); - - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } -} - TEST_F(BlockFileCacheTest, recyle_cache_async) { if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); @@ -4206,6 +3835,7 @@ TEST_F(BlockFileCacheTest, test_hot_data) { 
EXPECT_EQ(cache.get_hot_blocks_meta(key2).size(), 1); } +#if 0 // load from meta store won't trigger this test TEST_F(BlockFileCacheTest, test_async_load_with_error_file_1) { if (fs::exists(cache_base_path)) { fs::remove_all(cache_base_path); @@ -4363,6 +3993,7 @@ TEST_F(BlockFileCacheTest, test_async_load_with_error_file_2) { fs::remove_all(cache_base_path); } } +#endif TEST_F(BlockFileCacheTest, test_check_disk_reource_limit_1) { if (fs::exists(cache_base_path)) { @@ -4925,170 +4556,6 @@ TEST_F(BlockFileCacheTest, reset_capacity) { } } -TEST_F(BlockFileCacheTest, change_cache_type1) { - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } - fs::create_directories(cache_base_path); - auto sp = SyncPoint::get_instance(); - sp->set_call_back("FileBlock::change_cache_type", [](auto&& args) { - *try_any_cast(args[0]) = Status::IOError("inject io error"); - }); - sp->enable_processing(); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 30; - settings.query_queue_elements = 5; - settings.capacity = 30; - settings.max_file_block_size = 30; - settings.max_query_cache_size = 30; - io::CacheContext context; - ReadStatistics rstats; - context.stats = &rstats; - context.cache_type = io::FileCacheType::TTL; - context.query_id = query_id; - context.expiration_time = 120; - int64_t modify_time = 5; - auto key1 = io::BlockFileCache::hash("key1"); - io::BlockFileCache cache(cache_base_path, settings); - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - { - auto holder = cache.get_or_set(key1, 50, 10, context); /// Add range [50, 59] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(50, 59), io::FileBlock::State::EMPTY); - ASSERT_TRUE(segments[0]->get_or_set_downloader() == 
io::FileBlock::get_caller_id()); - download(segments[0]); - assert_range(1, segments[0], io::FileBlock::Range(50, 59), - io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(segments[0]->cache_type(), io::FileCacheType::TTL); - EXPECT_EQ(segments[0]->expiration_time(), context.expiration_time); - } - context.cache_type = io::FileCacheType::NORMAL; - context.expiration_time = 0; - { - auto holder = cache.get_or_set(key1, 50, 10, context); /// Add range [50, 59] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(50, 59), - io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(segments[0]->cache_type(), io::FileCacheType::NORMAL); - EXPECT_EQ(segments[0]->expiration_time(), 0); - } - sp->clear_call_back("FileBlock::change_cache_type"); - context.cache_type = io::FileCacheType::TTL; - context.expiration_time = modify_time; - { - auto holder = cache.get_or_set(key1, 50, 10, context); /// Add range [50, 59] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(50, 59), - io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(segments[0]->cache_type(), io::FileCacheType::TTL); - EXPECT_EQ(segments[0]->expiration_time(), modify_time); - } - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } -} - -TEST_F(BlockFileCacheTest, change_cache_type2) { - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } - fs::create_directories(cache_base_path); - auto sp = SyncPoint::get_instance(); - sp->set_call_back("FileBlock::change_cache_type", [](auto&& args) { - *try_any_cast(args[0]) = Status::IOError("inject io error"); - }); - sp->enable_processing(); - TUniqueId query_id; - query_id.hi = 1; - query_id.lo = 1; - io::FileCacheSettings settings; - settings.query_queue_size = 30; - settings.query_queue_elements = 5; - settings.capacity = 30; - settings.max_file_block_size = 30; - settings.max_query_cache_size = 30; - 
io::CacheContext context; - ReadStatistics rstats; - context.stats = &rstats; - context.query_id = query_id; - context.cache_type = io::FileCacheType::NORMAL; - context.expiration_time = 0; - auto key1 = io::BlockFileCache::hash("key1"); - auto key2 = io::BlockFileCache::hash("key2"); - io::BlockFileCache cache(cache_base_path, settings); - ASSERT_TRUE(cache.initialize()); - for (int i = 0; i < 100; i++) { - if (cache.get_async_open_success()) { - break; - }; - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - { - auto holder = cache.get_or_set(key1, 50, 10, context); /// Add range [50, 59] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(50, 59), io::FileBlock::State::EMPTY); - ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(segments[0]); - assert_range(1, segments[0], io::FileBlock::Range(50, 59), - io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(segments[0]->cache_type(), io::FileCacheType::NORMAL); - EXPECT_EQ(segments[0]->expiration_time(), 0); - } - context.cache_type = io::FileCacheType::TTL; - context.expiration_time = 120; - { - auto holder = cache.get_or_set(key1, 50, 10, context); /// Add range [50, 59] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(50, 59), - io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(segments[0]->cache_type(), io::FileCacheType::TTL); - EXPECT_EQ(segments[0]->expiration_time(), context.expiration_time); - } - sp->clear_call_back("FileBlock::change_cache_type"); - context.cache_type = io::FileCacheType::NORMAL; - context.expiration_time = 0; - { - auto holder = cache.get_or_set(key1, 50, 10, context); /// Add range [50, 59] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(50, 59), - io::FileBlock::State::DOWNLOADED); - 
EXPECT_EQ(segments[0]->cache_type(), io::FileCacheType::NORMAL); - EXPECT_EQ(segments[0]->expiration_time(), 0); - } - EXPECT_EQ(cache._normal_queue.queue.size(), 1); - for (int64_t offset = 0; offset < 40; offset += 5) { - auto holder = cache.get_or_set(key2, offset, 5, context); - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::EMPTY); - ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); - download(segments[0]); - assert_range(1, segments[0], io::FileBlock::Range(offset, offset + 4), - io::FileBlock::State::DOWNLOADED); - EXPECT_EQ(segments[0]->cache_type(), io::FileCacheType::NORMAL); - EXPECT_EQ(segments[0]->expiration_time(), 0); - } - if (fs::exists(cache_base_path)) { - fs::remove_all(cache_base_path); - } -} - /* TEST_F(BlockFileCacheTest, load_cache1) { if (fs::exists(cache_base_path)) { diff --git a/be/test/io/cache/block_file_cache_test_meta_store.cpp b/be/test/io/cache/block_file_cache_test_meta_store.cpp index d057f57cfecaf9..48e565cd8eee92 100644 --- a/be/test/io/cache/block_file_cache_test_meta_store.cpp +++ b/be/test/io/cache/block_file_cache_test_meta_store.cpp @@ -24,9 +24,13 @@ namespace doris::io { namespace { +static size_t verify_meta_key_cnt = 0; + void verify_meta_key(CacheBlockMetaStore& meta_store, int64_t tablet_id, const std::string& key_name, size_t offset, FileCacheType expected_type, uint64_t ttl, size_t size) { + verify_meta_key_cnt++; + std::cout << "verify_meta_key called " << verify_meta_key_cnt << " times" << std::endl; BlockMetaKey mkey(tablet_id, io::BlockFileCache::hash(key_name), offset); auto meta = meta_store.get(mkey); ASSERT_TRUE(meta.has_value()); @@ -349,7 +353,6 @@ TEST_F(BlockFileCacheTest, version3_add_remove_restart) { ASSERT_EQ(blocks.size(), 1); auto block = blocks[0]; ASSERT_EQ(block->tablet_id(), 49); - ASSERT_EQ(block->expiration_time(), expiration_time); } // 
do some meta change - type @@ -394,7 +397,6 @@ TEST_F(BlockFileCacheTest, version3_add_remove_restart) { auto blocks = fromHolder(holder); ASSERT_EQ(blocks.size(), 1); auto block = blocks[0]; - ASSERT_EQ(block->expiration_time(), expiration_time + 3600); } // check the meta { @@ -404,8 +406,9 @@ TEST_F(BlockFileCacheTest, version3_add_remove_restart) { << "Expected FSFileCacheStorage but got different storage type"; auto& meta_store = fs_storage->_meta_store; - verify_meta_key(*meta_store, 49, "key3", 0, FileCacheType::TTL, expiration_time + 3600, - 100000); + verify_meta_key( + *meta_store, 49, "key3", 0, FileCacheType::TTL, expiration_time, + 100000); // won't change ttl when get_or_set now as we introduce ttl mgr to manage ttl } } From 9756d6df79bd510dd6cdefc121b1dec483af818a Mon Sep 17 00:00:00 2001 From: zhengyu Date: Fri, 19 Dec 2025 11:49:37 +0800 Subject: [PATCH 13/20] Remove duplicate cache schema change hit ratio definition Removed duplicate definition of file_cache_keep_schema_change_output_min_hit_ratio. 
--- be/src/common/config.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index c54385ca2fc076..cc5aa22a184a7c 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1141,12 +1141,10 @@ DEFINE_mInt64(cache_lock_held_long_tail_threshold_us, "30000000"); DEFINE_mBool(enable_file_cache_keep_base_compaction_output, "false"); DEFINE_mBool(enable_file_cache_adaptive_write, "true"); DEFINE_mDouble(file_cache_keep_base_compaction_output_min_hit_ratio, "0.7"); -// if difference below this threshold, we consider cache's progressive upgrading (2.0->3.0) successful -DEFINE_mDouble(file_cache_meta_store_vs_file_system_diff_num_threshold, "0.3"); +DEFINE_mDouble(file_cache_keep_schema_change_output_min_hit_ratio, "0.7"); // if difference below this threshold, we consider cache's progressive upgrading (2.0->3.0) successful DEFINE_mDouble(file_cache_meta_store_vs_file_system_diff_num_threshold, "0.3"); -DEFINE_mDouble(file_cache_keep_schema_change_output_min_hit_ratio, "0.7"); DEFINE_mInt64(file_cache_remove_block_qps_limit, "1000"); DEFINE_mInt64(file_cache_background_gc_interval_ms, "100"); From f3ed82663aaa5e26b56448a9acf36410c44a4024 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Mon, 22 Dec 2025 19:16:33 +0800 Subject: [PATCH 14/20] [fix](filecache) fix fs iterator concurrency problem Signed-off-by: zhengyu --- be/src/io/cache/fs_file_cache_storage.cpp | 43 +++++++++++-- .../block_file_cache_test_meta_store.cpp | 64 +++++++++++++++++++ 2 files changed, 103 insertions(+), 4 deletions(-) diff --git a/be/src/io/cache/fs_file_cache_storage.cpp b/be/src/io/cache/fs_file_cache_storage.cpp index b1fc8cdefc56be..884c07d5584c4c 100644 --- a/be/src/io/cache/fs_file_cache_storage.cpp +++ b/be/src/io/cache/fs_file_cache_storage.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include "common/logging.h" #include "cpp/sync_point.h" @@ -980,13 +981,47 @@ size_t 
FSFileCacheStorage::estimate_file_count_from_statfs() const { // Get total size of cache directory to estimate file count std::error_code ec; uintmax_t total_size = 0; - for (const auto& entry : std::filesystem::recursive_directory_iterator(_cache_base_path, ec)) { + std::vector pending_dirs {std::filesystem::path(_cache_base_path)}; + while (!pending_dirs.empty()) { + auto current_dir = pending_dirs.back(); + pending_dirs.pop_back(); + + std::filesystem::directory_iterator it(current_dir, ec); if (ec) { - LOG(WARNING) << "Error accessing directory entry: " << ec.message(); + LOG(WARNING) << "Failed to list directory while estimating file count, dir=" + << current_dir << ", err=" << ec.message(); + ec.clear(); continue; } - if (entry.is_regular_file()) { - total_size += entry.file_size(); + + for (; it != std::filesystem::directory_iterator(); ++it) { + std::error_code status_ec; + auto entry_status = it->symlink_status(status_ec); + if (status_ec) { + LOG(WARNING) << "Failed to stat entry while estimating file count, path=" + << it->path() << ", err=" << status_ec.message(); + continue; + } + + if (std::filesystem::is_directory(entry_status)) { + auto next_dir = it->path(); + TEST_SYNC_POINT_CALLBACK( + "FSFileCacheStorage::estimate_file_count_from_statfs::OnDirectory", + &next_dir); + pending_dirs.emplace_back(next_dir); + continue; + } + + if (std::filesystem::is_regular_file(entry_status)) { + std::error_code size_ec; + auto file_size = it->file_size(size_ec); + if (size_ec) { + LOG(WARNING) << "Failed to get file size while estimating file count, path=" + << it->path() << ", err=" << size_ec.message(); + continue; + } + total_size += file_size; + } } } diff --git a/be/test/io/cache/block_file_cache_test_meta_store.cpp b/be/test/io/cache/block_file_cache_test_meta_store.cpp index 48e565cd8eee92..390cd585cf8cc5 100644 --- a/be/test/io/cache/block_file_cache_test_meta_store.cpp +++ b/be/test/io/cache/block_file_cache_test_meta_store.cpp @@ -18,7 +18,20 @@ // 
https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/tests/gtest_lru_file_cache.cpp // and modified by Doris +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wkeyword-macro" +#endif + +#define private public +#define protected public #include "block_file_cache_test_common.h" +#undef private +#undef protected + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif namespace doris::io { @@ -498,6 +511,57 @@ TEST_F(BlockFileCacheTest, clear_retains_meta_directory_and_clears_meta_entries) } } +TEST_F(BlockFileCacheTest, estimate_file_count_skips_removed_directory) { + std::string test_dir = cache_base_path + "/estimate_file_count_removed_dir"; + if (fs::exists(test_dir)) { + fs::remove_all(test_dir); + } + auto keep_dir = fs::path(test_dir) / "keep"; + auto remove_dir = fs::path(test_dir) / "remove"; + fs::create_directories(keep_dir); + fs::create_directories(remove_dir); + + auto keep_file = keep_dir / "data.bin"; + std::string one_mb(1024 * 1024, 'd'); + { + std::ofstream ofs(keep_file, std::ios::binary); + ASSERT_TRUE(ofs.good()); + for (int i = 0; i < 3; ++i) { + ofs.write(one_mb.data(), one_mb.size()); + ASSERT_TRUE(ofs.good()); + } + } + + FSFileCacheStorage storage; + storage._cache_base_path = test_dir; + + const std::string sync_point_name = + "FSFileCacheStorage::estimate_file_count_from_statfs::OnDirectory"; + auto* sync_point = doris::SyncPoint::get_instance(); + doris::SyncPoint::CallbackGuard guard(sync_point_name); + sync_point->set_call_back( + sync_point_name, + [remove_dir](std::vector&& args) { + auto* path = doris::try_any_cast(args[0]); + if (*path == remove_dir) { + fs::remove_all(remove_dir); + } + }, + &guard); + sync_point->enable_processing(); + + size_t estimated_files = storage.estimate_file_count_from_statfs(); + + sync_point->disable_processing(); + + ASSERT_EQ(3, estimated_files); + ASSERT_FALSE(fs::exists(remove_dir)); + + if (fs::exists(test_dir)) { + 
fs::remove_all(test_dir); + } +} + //TODO(zhengyu): check lazy load //TODO(zhengyu): check version2 start //TODO(zhengyu): check version2 version3 mixed start From 42344bf03d5327baedb2d540e84172c2620dd1aa Mon Sep 17 00:00:00 2001 From: zhengyu Date: Wed, 24 Dec 2025 11:03:20 +0800 Subject: [PATCH 15/20] fix reset_range not update shadow queue cause cache size large Signed-off-by: zhengyu --- be/src/io/cache/block_file_cache.cpp | 26 +++++-- be/src/io/cache/file_cache_common.h | 2 + be/src/io/cache/fs_file_cache_storage.cpp | 41 ++++++++--- be/src/io/cache/fs_file_cache_storage.h | 4 ++ be/src/io/cache/lru_queue_recorder.cpp | 9 +++ be/src/io/cache/lru_queue_recorder.h | 3 +- be/src/util/runtime_profile.h | 8 +-- .../block_file_cache_test_meta_store.cpp | 70 +++++++++++++++++++ be/test/io/cache/lru_queue_test.cpp | 11 +++ 9 files changed, 157 insertions(+), 17 deletions(-) diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index 6baab442ab3269..9be2d84cc010e3 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -857,6 +857,17 @@ FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& hash, const CacheC return nullptr; /// Empty files are not cached. } + VLOG_DEBUG << "Adding file block to cache. size=" << size << " hash=" << hash.to_string() + << " offset=" << offset << " cache_type=" << cache_type_to_string(context.cache_type) + << " expiration_time=" << context.expiration_time + << " tablet_id=" << context.tablet_id; + + if (size > 1024 * 1024 * 1024) { + LOG(WARNING) << "File block size is too large for a block. 
size=" << size + << " hash=" << hash.to_string() << " offset=" << offset + << " stack:" << get_stack_trace(); + } + auto& offsets = _files[hash]; auto itr = offsets.find(offset); if (itr != offsets.end()) { @@ -1201,10 +1212,10 @@ void BlockFileCache::reset_range(const UInt128Wrapper& hash, size_t offset, size if (cell->queue_iterator) { auto& queue = get_queue(cell->file_block->cache_type()); DCHECK(queue.contains(hash, offset, cache_lock)); - auto iter = queue.get(hash, offset, cache_lock); - iter->size = new_size; - queue.cache_size -= old_size; - queue.cache_size += new_size; + queue.resize(*cell->queue_iterator, new_size, cache_lock); + _lru_recorder->record_queue_event(cell->file_block->cache_type(), CacheLRULogType::RESIZE, + cell->file_block->get_hash_value(), + cell->file_block->offset(), new_size); } _cur_cache_size -= old_size; _cur_cache_size += new_size; @@ -1501,6 +1512,13 @@ void LRUQueue::remove_all(std::lock_guard& /* cache_lock */) { void LRUQueue::move_to_end(Iterator queue_it, std::lock_guard& /* cache_lock */) { queue.splice(queue.end(), queue, queue_it); } + +void LRUQueue::resize(Iterator queue_it, size_t new_size, + std::lock_guard& /* cache_lock */) { + cache_size -= queue_it->size; + queue_it->size = new_size; + cache_size += new_size; +} bool LRUQueue::contains(const UInt128Wrapper& hash, size_t offset, std::lock_guard& /* cache_lock */) const { return map.find(std::make_pair(hash, offset)) != map.end(); diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index 02b759efbca728..306f9a71f9e38c 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -231,6 +231,8 @@ class LRUQueue { void move_to_end(Iterator queue_it, std::lock_guard& cache_lock); + void resize(Iterator queue_it, size_t new_size, std::lock_guard& cache_lock); + std::string to_string(std::lock_guard& cache_lock) const; bool contains(const UInt128Wrapper& hash, size_t offset, diff --git 
a/be/src/io/cache/fs_file_cache_storage.cpp b/be/src/io/cache/fs_file_cache_storage.cpp index 884c07d5584c4c..5ab1e699a053fe 100644 --- a/be/src/io/cache/fs_file_cache_storage.cpp +++ b/be/src/io/cache/fs_file_cache_storage.cpp @@ -580,6 +580,31 @@ Status FSFileCacheStorage::parse_filename_suffix_to_cache_type( return Status::OK(); } +bool FSFileCacheStorage::handle_already_loaded_block( + BlockFileCache* mgr, const UInt128Wrapper& hash, size_t offset, size_t new_size, + int64_t tablet_id, std::lock_guard& cache_lock) const { + auto file_it = mgr->_files.find(hash); + if (file_it == mgr->_files.end()) { + return false; + } + + auto cell_it = file_it->second.find(offset); + if (cell_it == file_it->second.end()) { + return false; + } + + auto block = cell_it->second.file_block; + if (tablet_id != 0 && block->tablet_id() == 0) { + block->set_tablet_id(tablet_id); + } + + size_t old_size = block->range().size(); + if (old_size != new_size) { + mgr->reset_range(hash, offset, old_size, new_size, cache_lock); + } + return true; +} + void FSFileCacheStorage::load_cache_info_into_memory_from_fs(BlockFileCache* _mgr) const { int scan_length = 10000; std::vector batch_load_buffer; @@ -589,8 +614,8 @@ void FSFileCacheStorage::load_cache_info_into_memory_from_fs(BlockFileCache* _mg auto f = [&](const BatchLoadArgs& args) { // in async load mode, a cell may be added twice. - if (_mgr->_files.contains(args.hash) && _mgr->_files[args.hash].contains(args.offset)) { - // TODO(zhengyu): update type&expiration if need + if (handle_already_loaded_block(_mgr, args.hash, args.offset, args.size, + args.ctx.tablet_id, cache_lock)) { return; } // if the file is tmp, it means it is the old file and it should be removed @@ -770,11 +795,8 @@ void FSFileCacheStorage::load_cache_info_into_memory_from_db(BlockFileCache* _mg auto f = [&](const BatchLoadArgs& args) { // in async load mode, a cell may be added twice. 
- if (_mgr->_files.contains(args.hash) && _mgr->_files[args.hash].contains(args.offset)) { - auto block = _mgr->_files[args.hash][args.offset].file_block; - if (block->tablet_id() == 0) { - block->set_tablet_id(args.ctx.tablet_id); - } + if (handle_already_loaded_block(_mgr, args.hash, args.offset, args.size, + args.ctx.tablet_id, cache_lock)) { return; } _mgr->add_cell(args.hash, args.ctx, args.offset, args.size, @@ -917,7 +939,10 @@ void FSFileCacheStorage::load_blocks_directly_unlocked(BlockFileCache* mgr, cons context_original.cache_type = static_cast(block_meta->type); context_original.tablet_id = key.meta.tablet_id; - if (!mgr->_files.contains(key.hash) || !mgr->_files[key.hash].contains(key.offset)) { + if (handle_already_loaded_block(mgr, key.hash, key.offset, block_meta->size, key.meta.tablet_id, + cache_lock)) { + return; + } else { mgr->add_cell(key.hash, context_original, key.offset, block_meta->size, FileBlock::State::DOWNLOADED, cache_lock); } diff --git a/be/src/io/cache/fs_file_cache_storage.h b/be/src/io/cache/fs_file_cache_storage.h index ea5695438c6090..d486552d2b62d7 100644 --- a/be/src/io/cache/fs_file_cache_storage.h +++ b/be/src/io/cache/fs_file_cache_storage.h @@ -111,6 +111,10 @@ class FSFileCacheStorage : public FileCacheStorage { void load_cache_info_into_memory(BlockFileCache* _mgr) const; + bool handle_already_loaded_block(BlockFileCache* mgr, const UInt128Wrapper& hash, size_t offset, + size_t new_size, int64_t tablet_id, + std::lock_guard& cache_lock) const; + private: // Helper function to count files in cache directory using statfs size_t estimate_file_count_from_statfs() const; diff --git a/be/src/io/cache/lru_queue_recorder.cpp b/be/src/io/cache/lru_queue_recorder.cpp index 8308a2a73ad6e3..9907e58cb2a607 100644 --- a/be/src/io/cache/lru_queue_recorder.cpp +++ b/be/src/io/cache/lru_queue_recorder.cpp @@ -62,6 +62,15 @@ void LRUQueueRecorder::replay_queue_event(FileCacheType type) { } break; } + case CacheLRULogType::RESIZE: { + 
auto it = shadow_queue.get(log->hash, log->offset, lru_log_lock); + if (it != std::list::iterator()) { + shadow_queue.resize(it, log->size, lru_log_lock); + } else { + LOG(WARNING) << "RESIZE failed, doesn't exist in shadow queue"; + } + break; + } default: LOG(WARNING) << "Unknown CacheLRULogType: " << static_cast(log->type); break; diff --git a/be/src/io/cache/lru_queue_recorder.h b/be/src/io/cache/lru_queue_recorder.h index 1f6d69493cf4a8..5bd68b70d555f9 100644 --- a/be/src/io/cache/lru_queue_recorder.h +++ b/be/src/io/cache/lru_queue_recorder.h @@ -31,7 +31,8 @@ enum class CacheLRULogType { ADD = 0, // all of the integer types REMOVE = 1, MOVETOBACK = 2, - INVALID = 3, + RESIZE = 3, + INVALID = 4, }; struct CacheLRULog { diff --git a/be/src/util/runtime_profile.h b/be/src/util/runtime_profile.h index a9ccb0910b66c6..79c0c1538c5659 100644 --- a/be/src/util/runtime_profile.h +++ b/be/src/util/runtime_profile.h @@ -395,7 +395,7 @@ class RuntimeProfile { int64_t level = 2, int64_t condition = 0, int64_t value = 0) : Counter(type, value, level), _condition(condition), - _value(value), + _stored_value(value), _condition_func(condition_func) {} Counter* clone() const override { @@ -405,13 +405,13 @@ class RuntimeProfile { int64_t value() const override { std::lock_guard l(_mutex); - return _value; + return _stored_value; } void conditional_update(int64_t c, int64_t v) { std::lock_guard l(_mutex); if (_condition_func(_condition, c)) { - _value = v; + _stored_value = v; _condition = c; } } @@ -419,7 +419,7 @@ class RuntimeProfile { private: mutable std::mutex _mutex; int64_t _condition; - int64_t _value; + int64_t _stored_value; ConditionCounterFunction _condition_func; }; diff --git a/be/test/io/cache/block_file_cache_test_meta_store.cpp b/be/test/io/cache/block_file_cache_test_meta_store.cpp index 390cd585cf8cc5..e33bef8e8536cd 100644 --- a/be/test/io/cache/block_file_cache_test_meta_store.cpp +++ b/be/test/io/cache/block_file_cache_test_meta_store.cpp @@ -511,6 
+511,76 @@ TEST_F(BlockFileCacheTest, clear_retains_meta_directory_and_clears_meta_entries) } } +TEST_F(BlockFileCacheTest, HandleAlreadyLoadedBlockUpdatesSizeAndTablet) { + config::enable_evict_file_cache_in_advance = false; + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + + io::FileCacheSettings settings; + settings.ttl_queue_size = 5000000; + settings.ttl_queue_elements = 50000; + settings.query_queue_size = 5000000; + settings.query_queue_elements = 50000; + settings.index_queue_size = 5000000; + settings.index_queue_elements = 50000; + settings.disposable_queue_size = 5000000; + settings.disposable_queue_elements = 50000; + settings.capacity = 20000000; + settings.max_file_block_size = 100000; + settings.max_query_cache_size = 30; + + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + for (int i = 0; i < 100; ++i) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + ASSERT_TRUE(cache.get_async_open_success()); + + io::CacheContext context; + ReadStatistics rstats; + context.stats = &rstats; + context.cache_type = io::FileCacheType::NORMAL; + context.query_id.hi = 11; + context.query_id.lo = 12; + context.tablet_id = 0; + auto key = io::BlockFileCache::hash("sync_cached_block_meta_key"); + + constexpr size_t kOriginalSize = 100000; + auto holder = cache.get_or_set(key, 0, kOriginalSize, context); + auto blocks = fromHolder(holder); + ASSERT_EQ(blocks.size(), 1); + ASSERT_TRUE(blocks[0]->get_or_set_downloader() == io::FileBlock::get_caller_id()); + download(blocks[0], kOriginalSize); + blocks.clear(); + + auto* fs_storage = dynamic_cast(cache._storage.get()); + ASSERT_NE(fs_storage, nullptr) << "Expected FSFileCacheStorage but got different storage type"; + + constexpr size_t kNewSize = 2 * kOriginalSize; + constexpr int64_t kTabletId = 4242; + bool handled = false; + { + 
SCOPED_CACHE_LOCK(cache._mutex, (&cache)); + handled = fs_storage->handle_already_loaded_block(&cache, key, 0, kNewSize, kTabletId, + cache_lock); + } + + ASSERT_TRUE(handled); + auto& cell = cache._files[key][0]; + EXPECT_EQ(cell.file_block->tablet_id(), kTabletId); + EXPECT_EQ(cache._cur_cache_size, kNewSize); + EXPECT_EQ(cache._normal_queue.get_capacity_unsafe(), kNewSize); + + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } +} + TEST_F(BlockFileCacheTest, estimate_file_count_skips_removed_directory) { std::string test_dir = cache_base_path + "/estimate_file_count_removed_dir"; if (fs::exists(test_dir)) { diff --git a/be/test/io/cache/lru_queue_test.cpp b/be/test/io/cache/lru_queue_test.cpp index 4a01fb27e3dcfe..2a9cdc3a6bc672 100644 --- a/be/test/io/cache/lru_queue_test.cpp +++ b/be/test/io/cache/lru_queue_test.cpp @@ -115,3 +115,14 @@ TEST_F(LRUQueueTest, SameElementsDifferentOrder) { EXPECT_EQ(queue1->levenshtein_distance_from(*queue2, lock), 2); } + +TEST_F(LRUQueueTest, ResizeUpdatesCacheSize) { + std::mutex mutex; + std::lock_guard lock(mutex); + + auto iter = queue1->add(UInt128Wrapper(123), 0, 1024, lock); + EXPECT_EQ(queue1->get_capacity(lock), 1024); + + queue1->resize(iter, 2048, lock); + EXPECT_EQ(queue1->get_capacity(lock), 2048); +} From f47bc9e1ebaa66293ef37c45517ed3a5eee22504 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Wed, 24 Dec 2025 11:14:51 +0800 Subject: [PATCH 16/20] fix calc bug Signed-off-by: zhengyu --- be/src/io/cache/fs_file_cache_storage.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/be/src/io/cache/fs_file_cache_storage.cpp b/be/src/io/cache/fs_file_cache_storage.cpp index 5ab1e699a053fe..96454db5e28ccc 100644 --- a/be/src/io/cache/fs_file_cache_storage.cpp +++ b/be/src/io/cache/fs_file_cache_storage.cpp @@ -897,8 +897,8 @@ void FSFileCacheStorage::load_cache_info_into_memory(BlockFileCache* _mgr) const // If the difference is more than threshold, load from filesystem as well if 
(estimated_file_count > 100) { double difference_ratio = - static_cast(estimated_file_count) - - static_cast(db_block_count) / static_cast(estimated_file_count); + (static_cast(estimated_file_count) - static_cast(db_block_count)) / + static_cast(estimated_file_count); if (difference_ratio > config::file_cache_meta_store_vs_file_system_diff_num_threshold) { LOG(WARNING) << "Significant difference between DB blocks (" << db_block_count From c9fc497ff9afd9fb0c4cfaa544e47eedd63460e6 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Mon, 12 Jan 2026 11:26:10 +0800 Subject: [PATCH 17/20] [fix](filecache) add OFFSET column for table file_cache_info (#59645) --- .../schema_file_cache_info_scanner.cpp | 23 +++++++++++-------- .../org/apache/doris/catalog/SchemaTable.java | 1 + .../cache/test_file_cache_info.groovy | 5 ++++ 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp index 9734dbfe44bbf7..09f1be4f3754db 100644 --- a/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp @@ -30,6 +30,7 @@ namespace doris { std::vector SchemaFileCacheInfoScanner::_s_tbls_columns = { // name, type, size, is_null {"HASH", TYPE_STRING, sizeof(StringRef), true}, + {"OFFSET", TYPE_BIGINT, sizeof(int64_t), true}, {"TABLET_ID", TYPE_BIGINT, sizeof(int64_t), true}, {"SIZE", TYPE_BIGINT, sizeof(int64_t), true}, {"TYPE", TYPE_STRING, sizeof(StringRef), true}, @@ -68,7 +69,7 @@ Status SchemaFileCacheInfoScanner::_fill_block_impl(vectorized::Block* block) { } // Collect all cache entries from all file cache instances - std::vector> cache_entries; + std::vector> cache_entries; // Get all cache instances using the public getter const auto& caches = file_cache_factory->get_caches(); @@ -116,7 +117,8 @@ Status SchemaFileCacheInfoScanner::_fill_block_impl(vectorized::Block* block) { std::string 
hash_str = key.hash.to_string(); // Add to cache entries - cache_entries.emplace_back(hash_str, key.tablet_id, value.size, value.type, cache_path); + cache_entries.emplace_back(hash_str, static_cast(key.offset), key.tablet_id, + static_cast(value.size), value.type, cache_path); iterator->next(); } @@ -137,21 +139,21 @@ Status SchemaFileCacheInfoScanner::_fill_block_impl(vectorized::Block* block) { for (size_t row_idx = 0; row_idx < row_num; ++row_idx) { const auto& entry = cache_entries[row_idx]; - const auto& [hash, tablet_id, size, type, cache_path] = entry; + const auto& [hash, offset, tablet_id, size, type, cache_path] = entry; if (col_desc.type == TYPE_STRING) { switch (col_idx) { case 0: // HASH column_values[row_idx] = hash; break; - case 3: // TYPE + case 4: // TYPE column_values[row_idx] = doris::io::cache_type_to_string( static_cast(type)); break; - case 4: // REMOTE_PATH + case 5: // REMOTE_PATH column_values[row_idx] = ""; // TODO: Implement remote path retrieval break; - case 5: // CACHE_PATH + case 6: // CACHE_PATH column_values[row_idx] = cache_path; break; default: @@ -163,13 +165,16 @@ Status SchemaFileCacheInfoScanner::_fill_block_impl(vectorized::Block* block) { datas[row_idx] = &str_refs[row_idx]; } else if (col_desc.type == TYPE_BIGINT) { switch (col_idx) { - case 1: // TABLET_ID + case 1: // OFFSET + int64_vals[row_idx] = offset; + break; + case 2: // TABLET_ID int64_vals[row_idx] = tablet_id; break; - case 2: // SIZE + case 3: // SIZE int64_vals[row_idx] = size; break; - case 6: // BE_ID + case 7: // BE_ID int64_vals[row_idx] = ExecEnv::GetInstance()->cluster_info()->backend_id; break; default: diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java index 14a941ea7b9b6b..faae2c75098f21 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java @@ -702,6 
+702,7 @@ public class SchemaTable extends Table { .put("file_cache_info", new SchemaTable(SystemIdGenerator.getNextId(), "file_cache_info", TableType.SCHEMA, builder().column("HASH", ScalarType.createStringType()) + .column("OFFSET", ScalarType.createType(PrimitiveType.BIGINT)) .column("TABLET_ID", ScalarType.createType(PrimitiveType.BIGINT)) .column("SIZE", ScalarType.createType(PrimitiveType.BIGINT)) .column("TYPE", ScalarType.createStringType()) diff --git a/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy b/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy index 8dfca55e4855f4..7c8841f4544077 100644 --- a/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy +++ b/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy @@ -83,6 +83,11 @@ suite("test_file_cache_info") { def tablet_id = get_tablet_id("customer") println "Tablet ID: ${tablet_id}" + def desc_cache_info = sql "desc information_schema.file_cache_info" + assertTrue(desc_cache_info.size() > 0, "desc information_schema.file_cache_info should not be empty") + assertEquals(desc_cache_info[0][0].toString().toUpperCase(), "HASH") + assertEquals(desc_cache_info[1][0].toString().toUpperCase(), "OFFSET") + def cache_info = sql "select * from information_schema.file_cache_info" assertTrue(cache_info.size() > 0, "file_cache_info should not be empty for tablet_id ${tablet_id}") From abe37b6bda8006bf687cdbb8fc7a30accc2701c7 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Tue, 3 Feb 2026 17:00:35 +0800 Subject: [PATCH 18/20] [fix](filecache) correct ttl metrics (#60252) add changing type to Normal when TTL type & TTL set to 0 to make ttl queue size correct --- be/src/io/cache/block_file_cache_ttl_mgr.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/be/src/io/cache/block_file_cache_ttl_mgr.cpp b/be/src/io/cache/block_file_cache_ttl_mgr.cpp index 082ac9623ffd18..d1406d08325873 --- a/be/src/io/cache/block_file_cache_ttl_mgr.cpp +++ 
b/be/src/io/cache/block_file_cache_ttl_mgr.cpp @@ -180,6 +180,7 @@ void BlockFileCacheTtlMgr::run_backgroud_update_ttl_info_map() { } // Update TTL info map + bool need_convert_from_ttl = false; { std::lock_guard lock(_ttl_info_mutex); if (ttl > 0) { @@ -204,6 +205,19 @@ void BlockFileCacheTtlMgr::run_backgroud_update_ttl_info_map() { } else { // Remove from TTL map if TTL is 0 _ttl_info_map.erase(tablet_id); + need_convert_from_ttl = true; + } + } + + if (need_convert_from_ttl) { + FileBlocks blocks = get_file_blocks_from_tablet_id(tablet_id); + for (auto& block : blocks) { + if (block->cache_type() == FileCacheType::TTL) { + auto st = block->change_cache_type(FileCacheType::NORMAL); + if (!st.ok()) { + LOG(WARNING) << "Failed to convert block back to NORMAL cache_type"; + } + } } } } From 180300d6655de809ba5c0a054c330b114a82c170 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Tue, 3 Feb 2026 17:07:20 +0800 Subject: [PATCH 19/20] [fix](filecache) add ttl mgr NOT_FOUND cleanup (#60269) 1. Drop missing tablet ids on NOT_FOUND to stop repeated TTL meta lookups and log spam 2. 
add bvar for tablet-id set size to monitor cleanup behavior --- be/src/io/cache/block_file_cache_ttl_mgr.cpp | 24 ++++++++++++++++++-- be/src/io/cache/block_file_cache_ttl_mgr.h | 4 ++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/be/src/io/cache/block_file_cache_ttl_mgr.cpp b/be/src/io/cache/block_file_cache_ttl_mgr.cpp index d1406d08325873..65ca767dd938da 100644 --- a/be/src/io/cache/block_file_cache_ttl_mgr.cpp +++ b/be/src/io/cache/block_file_cache_ttl_mgr.cpp @@ -26,6 +26,7 @@ #include "common/config.h" #include "common/logging.h" +#include "common/status.h" #include "io/cache/block_file_cache.h" #include "io/cache/cache_block_meta_store.h" #include "io/cache/file_block.h" @@ -37,6 +38,8 @@ namespace doris::io { BlockFileCacheTtlMgr::BlockFileCacheTtlMgr(BlockFileCache* mgr, CacheBlockMetaStore* meta_store) : _mgr(mgr), _meta_store(meta_store), _stop_background(false) { + _tablet_id_set_size_metrics = std::make_shared>( + _mgr->get_base_path().c_str(), "file_cache_ttl_mgr_tablet_id_set_size", 0); // Start background threads _update_ttl_thread = std::thread(&BlockFileCacheTtlMgr::run_backgroud_update_ttl_info_map, this); @@ -79,6 +82,9 @@ void BlockFileCacheTtlMgr::run_background_tablet_id_flush() { } std::lock_guard lock(_tablet_id_mutex); _tablet_id_set.insert(items->begin(), items->end()); + if (_tablet_id_set_size_metrics) { + _tablet_id_set_size_metrics->set_value(_tablet_id_set.size()); + } items->clear(); }; @@ -166,8 +172,22 @@ void BlockFileCacheTtlMgr::run_backgroud_update_ttl_info_map() { TabletMetaSharedPtr tablet_meta; auto meta_status = ExecEnv::get_tablet_meta(tablet_id, &tablet_meta, false); if (!meta_status.ok()) { - LOG(WARNING) << "Failed to get tablet meta for tablet_id: " << tablet_id - << ", err: " << meta_status; + if (meta_status.is()) { + { + std::lock_guard lock(_tablet_id_mutex); + if (_tablet_id_set.erase(tablet_id) > 0 && + _tablet_id_set_size_metrics) { + 
_tablet_id_set_size_metrics->set_value(_tablet_id_set.size()); + } + } + { + std::lock_guard lock(_ttl_info_mutex); + _ttl_info_map.erase(tablet_id); + } + } else { + LOG(WARNING) << "Failed to get tablet meta for tablet_id: " << tablet_id + << ", err: " << meta_status; + } continue; } diff --git a/be/src/io/cache/block_file_cache_ttl_mgr.h b/be/src/io/cache/block_file_cache_ttl_mgr.h index c08438d5d6d8d2..b16b5486b4b889 100644 --- a/be/src/io/cache/block_file_cache_ttl_mgr.h +++ b/be/src/io/cache/block_file_cache_ttl_mgr.h @@ -19,10 +19,12 @@ #pragma once +#include #include #include #include +#include #include #include #include @@ -73,6 +75,8 @@ class BlockFileCacheTtlMgr { std::thread _tablet_id_flush_thread; std::mutex _ttl_info_mutex; + + std::shared_ptr> _tablet_id_set_size_metrics; }; } // namespace doris::io \ No newline at end of file From bb825d579aeba466293ef37c45517ed3a5eee22504 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Fri, 6 Feb 2026 11:47:42 +0800 Subject: [PATCH 20/20] [enhancement](filecache) add filesystem leak cleaner (#59269) cache directory could be inconsistent with filecache meta store somehow, so extend FSFileCacheStorage with inode-based stats, leak-cleaner thread, and orphan cleanup helpers to clean such leakage.
--- be/src/common/config.cpp | 5 + be/src/common/config.h | 5 + be/src/io/cache/cache_block_meta_store.cpp | 27 + be/src/io/cache/cache_block_meta_store.h | 3 + be/src/io/cache/fs_file_cache_storage.cpp | 765 ++++++++++++++++-- be/src/io/cache/fs_file_cache_storage.h | 69 +- .../block_file_cache_test_meta_store.cpp | 89 +- ...s_file_cache_storage_leak_cleaner_test.cpp | 718 ++++++++++++++++ 8 files changed, 1556 insertions(+), 125 deletions(-) create mode 100644 be/test/io/cache/fs_file_cache_storage_leak_cleaner_test.cpp diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index c93a36b904861a..e071685c059c59 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1179,6 +1179,11 @@ DEFINE_mDouble(file_cache_keep_schema_change_output_min_hit_ratio, "0.7"); // if difference below this threshold, we consider cache's progressive upgrading (2.0->3.0) successful DEFINE_mDouble(file_cache_meta_store_vs_file_system_diff_num_threshold, "0.3"); +DEFINE_mDouble(file_cache_leak_fs_to_meta_ratio_threshold, "1.3"); +DEFINE_mInt64(file_cache_leak_scan_interval_seconds, "86400"); +DEFINE_mInt32(file_cache_leak_scan_batch_files, "2048"); +DEFINE_mInt32(file_cache_leak_scan_pause_ms, "500"); +DEFINE_mInt64(file_cache_leak_grace_seconds, "3600"); DEFINE_mInt64(file_cache_remove_block_qps_limit, "1000"); DEFINE_mInt64(file_cache_background_gc_interval_ms, "100"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 6cb735e730bb55..60a3cdaca325cd 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1228,6 +1228,11 @@ DECLARE_mBool(enable_file_cache_adaptive_write); DECLARE_mDouble(file_cache_keep_base_compaction_output_min_hit_ratio); DECLARE_mDouble(file_cache_meta_store_vs_file_system_diff_num_threshold); DECLARE_mDouble(file_cache_keep_schema_change_output_min_hit_ratio); +DECLARE_mDouble(file_cache_leak_fs_to_meta_ratio_threshold); +DECLARE_mInt64(file_cache_leak_scan_interval_seconds); 
+DECLARE_mInt32(file_cache_leak_scan_batch_files); +DECLARE_mInt32(file_cache_leak_scan_pause_ms); +DECLARE_mInt64(file_cache_leak_grace_seconds); DECLARE_mInt64(file_cache_remove_block_qps_limit); DECLARE_mInt64(file_cache_background_gc_interval_ms); DECLARE_mInt64(file_cache_background_block_lru_update_interval_ms); diff --git a/be/src/io/cache/cache_block_meta_store.cpp b/be/src/io/cache/cache_block_meta_store.cpp index 472886152c71da..c42dd3f8003e86 100644 --- a/be/src/io/cache/cache_block_meta_store.cpp +++ b/be/src/io/cache/cache_block_meta_store.cpp @@ -332,6 +332,33 @@ std::unique_ptr CacheBlockMetaStore::get_all() { return std::unique_ptr(new RocksDBIterator(iter)); } +size_t CacheBlockMetaStore::approximate_entry_count() const { + if (!_db) { + LOG(WARNING) << "Database not initialized when counting entries"; + return 0; + } + + rocksdb::ReadOptions read_options; + std::unique_ptr iter( + _db->NewIterator(read_options, _file_cache_meta_cf_handle.get())); + if (!iter) { + LOG(WARNING) << "Failed to create iterator when counting entries"; + return 0; + } + + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + + if (!iter->status().ok()) { + LOG(WARNING) << "Iterator encountered error when counting entries: " + << iter->status().ToString(); + } + + return count; +} + void CacheBlockMetaStore::delete_key(const BlockMetaKey& key) { std::string key_str = serialize_key(key); diff --git a/be/src/io/cache/cache_block_meta_store.h b/be/src/io/cache/cache_block_meta_store.h index bd3c6501c145a3..1a6659596e1002 100644 --- a/be/src/io/cache/cache_block_meta_store.h +++ b/be/src/io/cache/cache_block_meta_store.h @@ -111,6 +111,9 @@ class CacheBlockMetaStore { // Get the approximate size of the write queue size_t get_write_queue_size() const; + // Count entries stored in rocksdb (ignoring pending writes) + size_t approximate_entry_count() const; + private: void async_write_worker(); diff --git 
a/be/src/io/cache/fs_file_cache_storage.cpp b/be/src/io/cache/fs_file_cache_storage.cpp index 395b42d9415ea8..5fe6dae22d9464 100644 --- a/be/src/io/cache/fs_file_cache_storage.cpp +++ b/be/src/io/cache/fs_file_cache_storage.cpp @@ -21,13 +21,27 @@ #include #include #include +#include #include - +#include + +#include +#include +#include +#include +#include +#include +#include #include +#include #include +#include #include +#include +#include #include +#include "common/config.h" #include "common/logging.h" #include "cpp/sync_point.h" #include "io/cache/block_file_cache.h" @@ -44,6 +58,21 @@ namespace doris::io { +#ifdef BE_TEST +namespace { +FSFileCacheStorage::InodeEstimationTestHooks* g_inode_estimation_hooks = nullptr; + +FSFileCacheStorage::InodeEstimationTestHooks* inode_test_hooks() { + return g_inode_estimation_hooks; +} +} // namespace + +void FSFileCacheStorage::set_inode_estimation_test_hooks( + FSFileCacheStorage::InodeEstimationTestHooks* hooks) { + g_inode_estimation_hooks = hooks; +} +#endif + struct BatchLoadArgs { UInt128Wrapper hash; CacheContext ctx; @@ -109,12 +138,16 @@ size_t FDCache::file_reader_cache_size() { return _file_reader_list.size(); } -Status FSFileCacheStorage::init(BlockFileCache* _mgr) { +Status FSFileCacheStorage::init(BlockFileCache* mgr) { + const char* metrics_prefix = mgr->_cache_base_path.c_str(); _iterator_dir_retry_cnt = std::make_shared( - _cache_base_path.c_str(), "file_cache_fs_storage_iterator_dir_retry_cnt"); - _cache_base_path = _mgr->_cache_base_path; + metrics_prefix, "file_cache_fs_storage_iterator_dir_retry_cnt"); + _leak_scan_removed_files = std::make_shared>( + metrics_prefix, "file_cache_leak_removed_files_cnt"); + _cache_base_path = mgr->_cache_base_path; + _mgr = mgr; _meta_store = std::make_unique(_cache_base_path + "/meta", 10000); - _cache_background_load_thread = std::thread([this, mgr = _mgr]() { + _cache_background_load_thread = std::thread([this, mgr]() { try { auto mem_tracker = 
MemTrackerLimiter::create_shared( MemTrackerLimiter::Type::OTHER, fmt::format("FileCacheVersionReader")); @@ -140,6 +173,7 @@ Status FSFileCacheStorage::init(BlockFileCache* _mgr) { load_cache_info_into_memory(mgr); mgr->_async_open_done = true; LOG_INFO("file cache {} lazy load done.", _cache_base_path); + start_leak_cleaner(mgr); } catch (const std::exception& e) { LOG(ERROR) << "Background cache loading thread failed with exception: " << e.what(); } catch (...) { @@ -603,12 +637,12 @@ bool FSFileCacheStorage::handle_already_loaded_block( return true; } -void FSFileCacheStorage::load_cache_info_into_memory_from_fs(BlockFileCache* _mgr) const { +void FSFileCacheStorage::load_cache_info_into_memory_from_fs(BlockFileCache* mgr) const { int scan_length = 10000; std::vector batch_load_buffer; batch_load_buffer.reserve(scan_length); auto add_cell_batch_func = [&]() { - SCOPED_CACHE_LOCK(_mgr->_mutex, _mgr); + SCOPED_CACHE_LOCK(mgr->_mutex, mgr); auto f = [&](const BatchLoadArgs& args) { // in async load mode, a cell may be added twice. 
@@ -618,8 +652,8 @@ void FSFileCacheStorage::load_cache_info_into_memory_from_fs(BlockFileCache* _mg } // if the file is tmp, it means it is the old file and it should be removed if (!args.is_tmp) { - _mgr->add_cell(args.hash, args.ctx, args.offset, args.size, - FileBlock::State::DOWNLOADED, cache_lock); + mgr->add_cell(args.hash, args.ctx, args.offset, args.size, + FileBlock::State::DOWNLOADED, cache_lock); return; } std::error_code ec; @@ -637,7 +671,9 @@ void FSFileCacheStorage::load_cache_info_into_memory_from_fs(BlockFileCache* _mg for (; key_it != std::filesystem::directory_iterator(); ++key_it) { auto key_with_suffix = key_it->path().filename().native(); auto delim_pos = key_with_suffix.find('_'); - DCHECK(delim_pos != std::string::npos); + if (delim_pos == std::string::npos || delim_pos != sizeof(uint128_t) * 2) { + continue; + } std::string key_str = key_with_suffix.substr(0, delim_pos); std::string expiration_time_str = key_with_suffix.substr(delim_pos + 1); auto hash = UInt128Wrapper(vectorized::unhex_uint(key_str.c_str())); @@ -694,7 +730,7 @@ void FSFileCacheStorage::load_cache_info_into_memory_from_fs(BlockFileCache* _mg // skip version file continue; } - if (key_prefix_it->path().filename().native() == "meta") { + if (key_prefix_it->path().filename().native() == META_DIR_NAME) { // skip rocksdb dir continue; } @@ -751,7 +787,9 @@ Status FSFileCacheStorage::get_file_cache_infos(std::vector& info for (; key_it != std::filesystem::directory_iterator(); ++key_it) { auto key_with_suffix = key_it->path().filename().native(); auto delim_pos = key_with_suffix.find('_'); - DCHECK(delim_pos != std::string::npos); + if (delim_pos == std::string::npos || delim_pos != sizeof(uint128_t) * 2) { + continue; + } std::string key_str = key_with_suffix.substr(0, delim_pos); std::string expiration_time_str = key_with_suffix.substr(delim_pos + 1); long expiration_time = std::stoul(expiration_time_str); @@ -784,12 +822,13 @@ Status 
FSFileCacheStorage::get_file_cache_infos(std::vector& info return Status::OK(); } -void FSFileCacheStorage::load_cache_info_into_memory_from_db(BlockFileCache* _mgr) const { +void FSFileCacheStorage::load_cache_info_into_memory_from_db(BlockFileCache* mgr) const { + TEST_SYNC_POINT_CALLBACK("BlockFileCache::TmpFile1"); int scan_length = 10000; std::vector batch_load_buffer; batch_load_buffer.reserve(scan_length); auto add_cell_batch_func = [&]() { - SCOPED_CACHE_LOCK(_mgr->_mutex, _mgr); + SCOPED_CACHE_LOCK(mgr->_mutex, mgr); auto f = [&](const BatchLoadArgs& args) { // in async load mode, a cell may be added twice. @@ -797,8 +836,8 @@ void FSFileCacheStorage::load_cache_info_into_memory_from_db(BlockFileCache* _mg args.ctx.tablet_id, cache_lock)) { return; } - _mgr->add_cell(args.hash, args.ctx, args.offset, args.size, - FileBlock::State::DOWNLOADED, cache_lock); + mgr->add_cell(args.hash, args.ctx, args.offset, args.size, FileBlock::State::DOWNLOADED, + cache_lock); return; }; std::for_each(batch_load_buffer.begin(), batch_load_buffer.end(), f); @@ -863,9 +902,9 @@ void FSFileCacheStorage::load_cache_info_into_memory_from_db(BlockFileCache* _mg TEST_SYNC_POINT_CALLBACK("BlockFileCache::TmpFile2"); } -void FSFileCacheStorage::load_cache_info_into_memory(BlockFileCache* _mgr) const { +void FSFileCacheStorage::load_cache_info_into_memory(BlockFileCache* mgr) const { // First load from database - load_cache_info_into_memory_from_db(_mgr); + load_cache_info_into_memory_from_db(mgr); std::string version; auto st = read_file_cache_version(&version); @@ -877,17 +916,45 @@ void FSFileCacheStorage::load_cache_info_into_memory(BlockFileCache* _mgr) const return; } + // If cache directory is effectively empty (no cache data entries), write version hint and + // return directly. 
+ auto is_cache_base_path_empty = [&]() -> bool { + std::error_code ec; + std::filesystem::directory_iterator it {_cache_base_path, ec}; + if (ec) { + LOG(WARNING) << "Failed to list cache directory: " << _cache_base_path + << ", error: " << ec.message(); + return false; + } + + for (; it != std::filesystem::directory_iterator(); ++it) { + auto name = it->path().filename().native(); + if (name == META_DIR_NAME || name == "version") { + continue; + } + return false; + } + return true; + }; + + if (is_cache_base_path_empty()) { + if (st = write_file_cache_version(); !st.ok()) { + LOG(WARNING) << "Failed to write version hints for file cache, err=" << st.to_string(); + } + return; + } + // Count blocks loaded from database size_t db_block_count = 0; { - std::lock_guard lock(_mgr->_mutex); - for (const auto& hash_entry : _mgr->_files) { + std::lock_guard lock(mgr->_mutex); + for (const auto& hash_entry : mgr->_files) { db_block_count += hash_entry.second.size(); } } // Estimate file count from filesystem using statfs - size_t estimated_file_count = estimate_file_count_from_statfs(); + size_t estimated_file_count = estimate_file_count_from_inode(); LOG(INFO) << "Cache loading statistics - DB blocks: " << db_block_count << ", Estimated FS files: " << estimated_file_count; @@ -960,7 +1027,7 @@ Status FSFileCacheStorage::clear(std::string& msg) { auto t0 = std::chrono::steady_clock::now(); for (; key_it != std::filesystem::directory_iterator(); ++key_it) { if (!key_it->is_directory()) continue; // all file cache data is in sub-directories - if (key_it->path().filename().native() == "meta") continue; + if (key_it->path().filename().native() == META_DIR_NAME) continue; ++total; std::string cache_key = key_it->path().string(); auto st = global_local_filesystem()->delete_directory(cache_key); @@ -991,76 +1058,632 @@ FSFileCacheStorage::~FSFileCacheStorage() { if (_cache_background_load_thread.joinable()) { _cache_background_load_thread.join(); } + stop_leak_cleaner(); } 
-size_t FSFileCacheStorage::estimate_file_count_from_statfs() const { - struct statvfs vfs; - if (statvfs(_cache_base_path.c_str(), &vfs) != 0) { - LOG(WARNING) << "Failed to get filesystem statistics for path: " << _cache_base_path - << ", error: " << strerror(errno); +size_t FSFileCacheStorage::estimate_file_count_from_inode() const { + int64_t duration_ns = 0; + size_t cache_files = 0; + { + SCOPED_RAW_TIMER(&duration_ns); + do { + struct statvfs vfs {}; + int statvfs_res = 0; +#ifdef BE_TEST + if (auto* hooks = inode_test_hooks(); hooks && hooks->statvfs_override) { + statvfs_res = hooks->statvfs_override(_cache_base_path, &vfs); + } else +#endif + { + statvfs_res = statvfs(_cache_base_path.c_str(), &vfs); + } + if (statvfs_res != 0) { + LOG(WARNING) << "Failed to get filesystem statistics for path: " << _cache_base_path + << ", error: " << strerror(errno); + break; + } + + if (vfs.f_files == 0) { + LOG(WARNING) << "Filesystem returned zero total inodes for path " + << _cache_base_path; + break; + } + + struct stat cache_stat {}; + int lstat_res = 0; +#ifdef BE_TEST + if (auto* hooks = inode_test_hooks(); hooks && hooks->lstat_override) { + lstat_res = hooks->lstat_override(_cache_base_path, &cache_stat); + } else +#endif + { + lstat_res = lstat(_cache_base_path.c_str(), &cache_stat); + } + if (lstat_res != 0) { + LOG(WARNING) << "Failed to stat cache base path " << _cache_base_path << ": " + << strerror(errno); + break; + } + + size_t total_inodes_used = vfs.f_files - vfs.f_ffree; + size_t non_cache_inodes = estimate_non_cache_inode_usage(); + size_t directory_inodes = estimate_cache_directory_inode_usage(); + + if (total_inodes_used > non_cache_inodes + directory_inodes) { + cache_files = total_inodes_used - non_cache_inodes - directory_inodes; + } else { + LOG(WARNING) << fmt::format( + "Inode subtraction underflow: total={} non_cache={} directory={}", + total_inodes_used, non_cache_inodes, directory_inodes); + } + + LOG(INFO) << fmt::format( + "Cache inode 
estimation: total_used={}, non_cache={}, directories≈{}, files≈{}", + total_inodes_used, non_cache_inodes, directory_inodes, cache_files); + } while (false); + } + const double duration_ms = static_cast(duration_ns) / 1'000'000.0; + LOG(INFO) << fmt::format("estimate_file_count_from_inode duration_ms={:.3f}, files={}", + duration_ms, cache_files); + return cache_files; +} + +size_t FSFileCacheStorage::count_inodes_for_path( + const std::filesystem::path& path, dev_t target_dev, + const std::filesystem::path& excluded_root, + std::unordered_set& visited) const { +#ifdef BE_TEST + if (auto* hooks = inode_test_hooks(); hooks && hooks->count_inodes_override) { + return hooks->count_inodes_override(*this, path, target_dev, excluded_root, visited); + } +#endif + if (!excluded_root.empty()) { + std::error_code eq_ec; + bool is_excluded = std::filesystem::equivalent(path, excluded_root, eq_ec); + if (eq_ec) { + LOG(WARNING) << "Failed to compare " << path << " with " << excluded_root << ": " + << eq_ec.message(); + } else if (is_excluded) { + return 0; + } + } + + struct stat st {}; + if (lstat(path.c_str(), &st) != 0) { + LOG(WARNING) << "Failed to stat path " << path << ": " << strerror(errno); + return 0; + } + if (st.st_dev != target_dev) { + return 0; + } + InodeKey key {st.st_dev, st.st_ino}; + if (!visited.insert(key).second) { return 0; } - // Get total size of cache directory to estimate file count + size_t count = 1; + if (S_ISDIR(st.st_mode)) { + std::error_code ec; + for (std::filesystem::directory_iterator it {path, ec}; + !ec && it != std::filesystem::directory_iterator(); ++it) { + count += count_inodes_for_path(it->path(), target_dev, excluded_root, visited); + } + if (ec) { + LOG(WARNING) << "Failed to iterate directory " << path << ": " << ec.message(); + } + } + return count; +} + +bool FSFileCacheStorage::is_cache_prefix_directory( + const std::filesystem::directory_entry& entry) const { + if (!entry.is_directory()) { + return false; + } + auto name = 
entry.path().filename().native(); + if (name == META_DIR_NAME || name.empty()) { + return false; + } + if (name.size() != KEY_PREFIX_LENGTH) { + return false; + } + return std::all_of(name.begin(), name.end(), [](unsigned char c) { return std::isxdigit(c); }); +} + +std::filesystem::path FSFileCacheStorage::find_mount_root(dev_t cache_dev) const { +#ifdef BE_TEST + if (auto* hooks = inode_test_hooks(); hooks && hooks->find_mount_root_override) { + return hooks->find_mount_root_override(*this, cache_dev); + } +#endif std::error_code ec; - uintmax_t total_size = 0; - std::vector pending_dirs {std::filesystem::path(_cache_base_path)}; - while (!pending_dirs.empty()) { - auto current_dir = pending_dirs.back(); - pending_dirs.pop_back(); + std::filesystem::path current = std::filesystem::absolute(_cache_base_path, ec); + if (ec) { + LOG(WARNING) << "Failed to resolve absolute cache base path " << _cache_base_path << ": " + << ec.message(); + current = _cache_base_path; + } - std::filesystem::directory_iterator it(current_dir, ec); - if (ec) { - LOG(WARNING) << "Failed to list directory while estimating file count, dir=" - << current_dir << ", err=" << ec.message(); - ec.clear(); + std::filesystem::path result = current; + while (result.has_parent_path()) { + auto parent = result.parent_path(); + if (parent.empty() || parent == result) { + break; + } + struct stat st {}; + if (lstat(parent.c_str(), &st) != 0) { + LOG(WARNING) << "Failed to stat parent path " << parent << ": " << strerror(errno); + break; + } + if (st.st_dev != cache_dev) { + break; + } + result = parent; + } + return result; +} + +size_t FSFileCacheStorage::estimate_non_cache_inode_usage() const { +#ifdef BE_TEST + if (auto* hooks = inode_test_hooks(); hooks && hooks->non_cache_override) { + return hooks->non_cache_override(*this); + } +#endif + struct stat cache_stat {}; + if (lstat(_cache_base_path.c_str(), &cache_stat) != 0) { + LOG(WARNING) << "Failed to stat cache base path " << _cache_base_path << 
": " + << strerror(errno); + return 0; + } + + auto mount_root = find_mount_root(cache_stat.st_dev); + if (mount_root.empty()) { + LOG(WARNING) << "Failed to determine mount root for cache path " << _cache_base_path; + return 0; + } + + std::unordered_set visited; + std::error_code abs_ec; + std::filesystem::path excluded = std::filesystem::absolute(_cache_base_path, abs_ec); + if (abs_ec) { + LOG(WARNING) << "Failed to get absolute cache base path " << _cache_base_path << ": " + << abs_ec.message(); + excluded = _cache_base_path; + } + + return count_inodes_for_path(mount_root, cache_stat.st_dev, excluded, visited); +} + +size_t FSFileCacheStorage::estimate_cache_directory_inode_usage() const { +#ifdef BE_TEST + if (auto* hooks = inode_test_hooks(); hooks && hooks->cache_dir_override) { + return hooks->cache_dir_override(*this); + } +#endif + constexpr size_t kSampleLimit = 3; + size_t prefix_dirs = 0; + std::vector samples; + + std::error_code ec; + std::filesystem::directory_iterator it {_cache_base_path, ec}; + if (ec) { + LOG(WARNING) << "Failed to list cache base path for directory estimation: " << ec.message(); + return 0; + } + + for (; it != std::filesystem::directory_iterator(); ++it) { + if (!is_cache_prefix_directory(*it)) { continue; } + ++prefix_dirs; + if (samples.size() < kSampleLimit) { + samples.emplace_back(it->path()); + } + } - for (; it != std::filesystem::directory_iterator(); ++it) { - std::error_code status_ec; - auto entry_status = it->symlink_status(status_ec); - if (status_ec) { - LOG(WARNING) << "Failed to stat entry while estimating file count, path=" - << it->path() << ", err=" << status_ec.message(); - continue; + if (prefix_dirs == 0 || samples.empty()) { + return 0; + } + + size_t sampled_second_level = 0; + for (const auto& prefix_path : samples) { + size_t local_count = 0; + std::error_code sample_ec; + for (std::filesystem::directory_iterator prefix_it {prefix_path, sample_ec}; + !sample_ec && prefix_it != 
std::filesystem::directory_iterator(); ++prefix_it) { + if (prefix_it->is_directory()) { + ++local_count; } + } + if (sample_ec) { + LOG(WARNING) << "Failed to enumerate prefix directory " << prefix_path << ": " + << sample_ec.message(); + sample_ec.clear(); + } + sampled_second_level += local_count; + } - if (std::filesystem::is_directory(entry_status)) { - auto next_dir = it->path(); - TEST_SYNC_POINT_CALLBACK( - "FSFileCacheStorage::estimate_file_count_from_statfs::OnDirectory", - &next_dir); - pending_dirs.emplace_back(next_dir); - continue; + double average_second_level = static_cast(sampled_second_level) / samples.size(); + size_t estimated_second_level = + static_cast(std::llround(average_second_level * prefix_dirs)); + return prefix_dirs + estimated_second_level; +} + +size_t FSFileCacheStorage::snapshot_metadata_block_count(BlockFileCache* /*mgr*/) const { + // TODO(zhengyu): if the cache_lock problem is solved, we can then use _mgr + int64_t duration_ns = 0; + size_t block_count = 0; + { + SCOPED_RAW_TIMER(&duration_ns); + if (_meta_store) { + block_count = _meta_store->approximate_entry_count(); + } else { + LOG(INFO) << "snapshot_metadata_block_count skipped because meta store is null"; + block_count = 0; + } + } + const double duration_ms = static_cast(duration_ns) / 1'000'000.0; + LOG(INFO) << fmt::format("snapshot_metadata_block_count duration_ms={:.3f}, blocks={}", + duration_ms, block_count); + return block_count; +} + +std::vector FSFileCacheStorage::snapshot_metadata_for_hash_offsets( + BlockFileCache* mgr, const UInt128Wrapper& hash) const { + std::vector offsets; + std::lock_guard lock(mgr->_mutex); + auto it = mgr->_files.find(hash); + if (it == mgr->_files.end()) { + return offsets; + } + offsets.reserve(it->second.size()); + for (const auto& [offset, _] : it->second) { + offsets.push_back(offset); + } + return offsets; +} + +void FSFileCacheStorage::start_leak_cleaner(BlockFileCache* mgr) { + if (config::file_cache_leak_scan_interval_seconds 
<= 0) { +        LOG(WARNING) << "File cache leak cleaner disabled because interval <= 0"; +        return; +    } + +    // if the version file is not 3.0, just return and clean nothing +    std::string version; +    if (auto st = read_file_cache_version(&version); !st.ok()) { +        LOG(WARNING) << "Failed to read file cache version: " << st.to_string(); +        return; +    } +    if (version != "3.0") { +        LOG(WARNING) << "File cache leak cleaner skipped because version is not 3.0"; +        return; +    } + +    _stop_leak_cleaner.store(false, std::memory_order_relaxed); +    _cache_leak_cleaner_thread = std::thread([this]() { leak_cleaner_loop(); }); +} + +void FSFileCacheStorage::stop_leak_cleaner() { +    _stop_leak_cleaner.store(true, std::memory_order_relaxed); +    _leak_cleaner_cv.notify_all(); +    if (_cache_leak_cleaner_thread.joinable()) { +        _cache_leak_cleaner_thread.join(); +    } +} + +void FSFileCacheStorage::leak_cleaner_loop() { +    Thread::set_self_name("leak_cleaner_loop"); + +    // randomly waiting before starting the loop helps avoid the thundering herd problem +    // for all storages.
+    const int64_t interval_seconds = +            std::max<int64_t>(1, config::file_cache_leak_scan_interval_seconds); +    std::mt19937_64 rng(std::random_device {}()); +    std::uniform_int_distribution<int64_t> dist(0, interval_seconds); +    int64_t initial_delay = dist(rng); +    TEST_SYNC_POINT_CALLBACK("FSFileCacheStorage::leak_cleaner_loop::initial_delay", +                             &initial_delay); +    if (initial_delay > 0) { +        std::unique_lock lock(_leak_cleaner_mutex); +        _leak_cleaner_cv.wait_for(lock, std::chrono::seconds(initial_delay), [this]() { +            return _stop_leak_cleaner.load(std::memory_order_relaxed); +        }); +        lock.unlock(); +        if (_stop_leak_cleaner.load(std::memory_order_relaxed)) { +            return; +        } +    } + +    while (!_stop_leak_cleaner.load(std::memory_order_relaxed)) { +        int64_t interval_s = interval_seconds; +        TEST_SYNC_POINT_CALLBACK("FSFileCacheStorage::leak_cleaner_loop::interval", &interval_s); +        auto interval = std::chrono::seconds(interval_s); +        std::unique_lock lock(_leak_cleaner_mutex); +        _leak_cleaner_cv.wait_for(lock, interval, [this]() { +            return _stop_leak_cleaner.load(std::memory_order_relaxed); +        }); +        lock.unlock(); +        if (_stop_leak_cleaner.load(std::memory_order_relaxed)) { +            break; +        } +        try { +            TEST_SYNC_POINT_CALLBACK("FSFileCacheStorage::leak_cleaner_loop::before_run"); +            run_leak_cleanup(_mgr); +        } catch (const std::exception& e) { +            LOG(WARNING) << "File cache leak cleaner encountered exception: " << e.what(); +        } catch (...)
{ + LOG(WARNING) << "File cache leak cleaner encountered unknown exception"; + } + } +} + +void FSFileCacheStorage::run_leak_cleanup(BlockFileCache* mgr) { + size_t metadata_blocks = snapshot_metadata_block_count(mgr); + if (metadata_blocks == 0) { + LOG(INFO) << "file cache leak scan found zero metadata blocks, skip cleanup"; + return; + } + + size_t fs_files = estimate_file_count_from_inode(); + double ratio = static_cast(fs_files) / static_cast(metadata_blocks); + + LOG(INFO) << fmt::format( + "file cache leak scan stats: fs_files={}, metadata_blocks={}, ratio={:.4f}", fs_files, + metadata_blocks, ratio); + + double threshold = config::file_cache_leak_fs_to_meta_ratio_threshold; + if (ratio <= threshold) { + LOG_INFO("file cache leak ratio {0:.4f} within threshold {1:.4f}, no cleanup needed", ratio, + threshold); + return; + } + + LOG(WARNING) << fmt::format( + "file cache leak ratio {0:.4f} exceeds threshold {1:.4f}, start cleanup", ratio, + threshold); + + cleanup_leaked_files(mgr, metadata_blocks); +} + +void FSFileCacheStorage::cleanup_leaked_files(BlockFileCache* mgr, size_t metadata_block_count) { + const size_t batch_size = std::max(1, config::file_cache_leak_scan_batch_files); + const size_t pause_ms = std::max(0, config::file_cache_leak_scan_pause_ms); + + int64_t cleanup_wall_time_ns = 0; + int64_t metadata_hash_time_ns = 0; + int64_t metadata_index_time_ns = 0; + int64_t remove_candidates_time_ns = 0; + int64_t directory_loop_time_ns = 0; + size_t removed_files = 0; + size_t examined_files = 0; + + std::vector hash_keys; + + { + SCOPED_RAW_TIMER(&cleanup_wall_time_ns); + { + SCOPED_RAW_TIMER(&metadata_hash_time_ns); + std::lock_guard lock(mgr->_mutex); + hash_keys.reserve(mgr->_files.size()); + for (const auto& [hash, _] : mgr->_files) { + hash_keys.push_back(hash); } + } - if (std::filesystem::is_regular_file(entry_status)) { - std::error_code size_ec; - auto file_size = it->file_size(size_ec); - if (size_ec) { - LOG(WARNING) << "Failed to get file 
size while estimating file count, path=" - << it->path() << ", err=" << size_ec.message(); - continue; + std::unordered_set metadata_index; + if (metadata_block_count > 0) { + metadata_index.reserve(metadata_block_count * 2); + } + + { + SCOPED_RAW_TIMER(&metadata_index_time_ns); + for (const auto& hash : hash_keys) { + auto offsets = snapshot_metadata_for_hash_offsets(mgr, hash); + for (const auto& offset : offsets) { + metadata_index.emplace(hash, offset); } - total_size += file_size; } } - } - if (total_size == 0) { - return 0; - } + struct OrphanCandidate { + std::string path; + UInt128Wrapper hash; + size_t offset; + std::string key_dir; + }; + + auto try_remove_empty_directory = [&](const std::string& dir) { + std::error_code ec; + std::filesystem::directory_iterator it(dir, ec); + if (ec || it != std::filesystem::directory_iterator()) { + return; + } + auto st = fs->delete_directory(dir); + if (!st.ok() && !st.is()) { + LOG_WARNING("delete_directory {} failed", dir).error(st); + } + }; + + std::vector candidates; + candidates.reserve(batch_size); + + auto remove_candidates = [&]() { + if (candidates.empty()) { + return; + } + int64_t remove_once_ns = 0; + { + SCOPED_RAW_TIMER(&remove_once_ns); + for (auto& candidate : candidates) { + auto st = fs->delete_file(candidate.path); + if (!st.ok() && !st.is()) { + LOG_WARNING("delete orphan cache file {} failed", candidate.path).error(st); + continue; + } + removed_files++; + try_remove_empty_directory(candidate.key_dir); + auto prefix_dir = + std::filesystem::path(candidate.key_dir).parent_path().string(); + try_remove_empty_directory(prefix_dir); + } + candidates.clear(); + } + remove_candidates_time_ns += remove_once_ns; + if (pause_ms > 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(pause_ms)); + } + }; + + std::error_code ec; + std::filesystem::directory_iterator prefix_it {_cache_base_path, ec}; + if (ec) { + LOG(WARNING) << "Leak scan failed to list cache directory: " << _cache_base_path + << 
", error: " << ec.message(); + return; + } + + for (; prefix_it != std::filesystem::directory_iterator(); ++prefix_it) { + int64_t loop_once_ns = 0; + { + SCOPED_RAW_TIMER(&loop_once_ns); + std::string prefix_name = prefix_it->path().filename().native(); + if (!prefix_it->is_directory() || prefix_name == META_DIR_NAME || + prefix_name.size() != KEY_PREFIX_LENGTH) { + continue; + } + + std::filesystem::directory_iterator key_it {prefix_it->path(), ec}; + if (ec) { + LOG(WARNING) << "Leak scan failed to list prefix " << prefix_it->path().native() + << ", error: " << ec.message(); + continue; + } - // Estimate file count based on average file size - // Assuming average file size of 1MB for cache blocks - const uintmax_t average_file_size = 1024 * 1024; // 1MB - size_t estimated_file_count = total_size / average_file_size; + for (; key_it != std::filesystem::directory_iterator(); ++key_it) { + if (!key_it->is_directory()) { + continue; + } + auto key_with_suffix = key_it->path().filename().native(); + auto delim_pos = key_with_suffix.find('_'); + if (delim_pos == std::string::npos || delim_pos != sizeof(uint128_t) * 2) { + continue; + } + + UInt128Wrapper hash; + try { + hash = UInt128Wrapper(vectorized::unhex_uint( + key_with_suffix.substr(0, delim_pos).c_str())); + } catch (...) { + LOG(WARNING) << "Leak scan failed to parse hash from " << key_with_suffix; + continue; + } + + long expiration = 0; + try { + expiration = std::stol(key_with_suffix.substr(delim_pos + 1)); + } catch (...) 
{ + LOG(WARNING) + << "Leak scan failed to parse expiration from " << key_with_suffix; + continue; + } + + std::filesystem::directory_iterator offset_it {key_it->path(), ec}; + if (ec) { + LOG(WARNING) << "Leak scan failed to list key directory " + << key_it->path().native() << ", error: " << ec.message(); + continue; + } + + for (; offset_it != std::filesystem::directory_iterator(); ++offset_it) { + if (!offset_it->is_regular_file()) { + continue; + } + const auto file_path = offset_it->path(); + const std::string file_path_str = file_path.string(); + size_t file_size = offset_it->file_size(ec); + if (ec) { + LOG(WARNING) << "Leak scan failed to fetch file size of " + << file_path.native() << ": " << ec.message(); + continue; + } + + size_t offset = 0; + bool is_tmp = false; + FileCacheType cache_type = FileCacheType::NORMAL; + Status st = parse_filename_suffix_to_cache_type( + fs, offset_it->path().filename().native(), expiration, file_size, + &offset, &is_tmp, &cache_type); + if (!st.ok()) { + continue; + } + + AccessKeyAndOffset meta_key {hash, offset}; + + // If the file is present in metadata and not a tmp file, skip it. + if (!is_tmp && metadata_index.find(meta_key) != metadata_index.end()) { + continue; + } + + // For any file that is not referenced by metadata (or tmp files), + // protect recently-created files from immediate deletion. This avoids + // racing with writers. The grace window is configured by + // file_cache_leak_grace_seconds and applies to all orphan files. 
+ const int64_t grace_seconds = + std::max(0, config::file_cache_leak_grace_seconds); + if (grace_seconds > 0) { + struct stat st_buf {}; + if (::stat(file_path.c_str(), &st_buf) != 0) { + LOG(WARNING) << "Leak scan failed to stat file " << file_path_str + << ": " << strerror(errno); + } else { + const std::time_t now = std::time(nullptr); + if (now == static_cast(-1)) { + LOG(WARNING) + << "Leak scan failed to get current time when checking " + << file_path_str; + } else { + const int64_t age_seconds = + static_cast(now) - + static_cast(st_buf.st_mtime); + if (age_seconds < grace_seconds) { + VLOG_DEBUG << fmt::format( + "Leak scan skipping young orphan file {} because " + "age={}s < grace={}s", + file_path_str, age_seconds, grace_seconds); + continue; + } + } + } + } + + candidates.emplace_back(file_path_str, hash, offset, + key_it->path().string()); + examined_files++; + if (candidates.size() >= batch_size) { + remove_candidates(); + } + } + } + } + directory_loop_time_ns += loop_once_ns; + } - LOG(INFO) << "Estimated file count for cache path " << _cache_base_path - << ": total_size=" << total_size << ", estimated_files=" << estimated_file_count; + remove_candidates(); + } - return estimated_file_count; + auto ns_to_ms = [](int64_t ns) { return static_cast(ns) / 1'000'000.0; }; + + LOG(INFO) << fmt::format( + "file cache leak cleanup finished: examined_files={}, removed_orphans={}, " + "wall_time_ms={:.3f}, metadata_hash_time_ms={:.3f}, metadata_index_ms={:.3f}, " + "remove_candidates_ms={:.3f}, prefix_loop_ms={:.3f}", + examined_files, removed_files, ns_to_ms(cleanup_wall_time_ns), + ns_to_ms(metadata_hash_time_ns), ns_to_ms(metadata_index_time_ns), + ns_to_ms(remove_candidates_time_ns), ns_to_ms(directory_loop_time_ns)); + if (_leak_scan_removed_files) { + *_leak_scan_removed_files << removed_files; + } } } // namespace doris::io diff --git a/be/src/io/cache/fs_file_cache_storage.h b/be/src/io/cache/fs_file_cache_storage.h index 24857968f12a8c..721cca54bff327 
100644 --- a/be/src/io/cache/fs_file_cache_storage.h +++ b/be/src/io/cache/fs_file_cache_storage.h @@ -18,10 +18,21 @@ #pragma once #include - +#include +#include +#include + +#include +#include +#include +#include +#include #include +#include #include #include +#include +#include #include "io/cache/cache_block_meta_store.h" #include "io/cache/file_cache_common.h" @@ -58,6 +69,7 @@ class FSFileCacheStorage : public FileCacheStorage { /// version 1.0: cache_base_path / key / offset /// version 2.0: cache_base_path / key_prefix / key / offset static constexpr int KEY_PREFIX_LENGTH = 3; + static constexpr std::string META_DIR_NAME = "meta"; FSFileCacheStorage() = default; ~FSFileCacheStorage() override; @@ -89,6 +101,36 @@ class FSFileCacheStorage : public FileCacheStorage { // Get the meta store instance (only available for DISK storage type) CacheBlockMetaStore* get_meta_store() { return _meta_store.get(); } + struct InodeKey { + dev_t device; + ino_t inode; + bool operator==(const InodeKey& other) const { + return device == other.device && inode == other.inode; + } + }; + struct InodeKeyHash { + size_t operator()(const InodeKey& key) const { + return std::hash()((static_cast(key.device) << 32) ^ + static_cast(key.inode)); + } + }; + +#ifdef BE_TEST + struct InodeEstimationTestHooks { + std::function statvfs_override; + std::function lstat_override; + std::function non_cache_override; + std::function cache_dir_override; + std::function + find_mount_root_override; + std::function&)> + count_inodes_override; + }; + static void set_inode_estimation_test_hooks(InodeEstimationTestHooks* hooks); +#endif + private: void remove_old_version_directories(); @@ -115,8 +157,23 @@ class FSFileCacheStorage : public FileCacheStorage { std::lock_guard& cache_lock) const; private: - // Helper function to count files in cache directory using statfs - size_t estimate_file_count_from_statfs() const; + // Helper function to count files in cache directory using inode stats + size_t 
estimate_file_count_from_inode() const; + size_t estimate_non_cache_inode_usage() const; + size_t estimate_cache_directory_inode_usage() const; + size_t count_inodes_for_path(const std::filesystem::path& path, dev_t target_dev, + const std::filesystem::path& excluded_root, + std::unordered_set& visited) const; + std::filesystem::path find_mount_root(dev_t cache_dev) const; + bool is_cache_prefix_directory(const std::filesystem::directory_entry& entry) const; + size_t snapshot_metadata_block_count(BlockFileCache* mgr) const; + std::vector snapshot_metadata_for_hash_offsets(BlockFileCache* mgr, + const UInt128Wrapper& hash) const; + void start_leak_cleaner(BlockFileCache* mgr); + void stop_leak_cleaner(); + void leak_cleaner_loop(); + void run_leak_cleanup(BlockFileCache* mgr); + void cleanup_leaked_files(BlockFileCache* mgr, size_t metadata_block_count); void load_cache_info_into_memory_from_fs(BlockFileCache* _mgr) const; void load_cache_info_into_memory_from_db(BlockFileCache* _mgr) const; @@ -124,12 +181,18 @@ class FSFileCacheStorage : public FileCacheStorage { std::lock_guard& cache_lock) const override; std::string _cache_base_path; + BlockFileCache* _mgr {nullptr}; std::thread _cache_background_load_thread; + std::thread _cache_leak_cleaner_thread; + std::atomic _stop_leak_cleaner {false}; + std::condition_variable _leak_cleaner_cv; + std::mutex _leak_cleaner_mutex; const std::shared_ptr& fs = global_local_filesystem(); // TODO(Lchangliang): use a more efficient data structure std::mutex _mtx; std::unordered_map _key_to_writer; std::shared_ptr _iterator_dir_retry_cnt; + std::shared_ptr> _leak_scan_removed_files; std::unique_ptr _meta_store; }; diff --git a/be/test/io/cache/block_file_cache_test_meta_store.cpp b/be/test/io/cache/block_file_cache_test_meta_store.cpp index e33bef8e8536cd..34006c11b36c12 100644 --- a/be/test/io/cache/block_file_cache_test_meta_store.cpp +++ b/be/test/io/cache/block_file_cache_test_meta_store.cpp @@ -430,6 +430,44 @@ 
TEST_F(BlockFileCacheTest, version3_add_remove_restart) { } } +TEST_F(BlockFileCacheTest, version3_write_version_when_cache_dir_empty) { + if (fs::exists(cache_base_path)) { + fs::remove_all(cache_base_path); + } + fs::create_directories(cache_base_path); + + io::FileCacheSettings settings; + settings.storage = "disk"; + settings.capacity = 10_mb; + settings.max_file_block_size = 1_mb; + settings.max_query_cache_size = settings.capacity; + settings.disposable_queue_size = settings.capacity; + settings.disposable_queue_elements = 8; + settings.index_queue_size = settings.capacity; + settings.index_queue_elements = 8; + settings.query_queue_size = settings.capacity; + settings.query_queue_elements = 8; + settings.ttl_queue_size = settings.capacity; + settings.ttl_queue_elements = 8; + + io::BlockFileCache cache(cache_base_path, settings); + ASSERT_TRUE(cache.initialize()); + + for (int i = 0; i < 100; ++i) { + if (cache.get_async_open_success()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + ASSERT_TRUE(cache.get_async_open_success()); + + std::ifstream ifs(cache_base_path + "/version", std::ios::binary); + ASSERT_TRUE(ifs.good()); + char buf[3] = {0}; + ifs.read(buf, 3); + ASSERT_EQ(std::string(buf, static_cast(ifs.gcount())), "3.0"); +} + TEST_F(BlockFileCacheTest, clear_retains_meta_directory_and_clears_meta_entries) { config::enable_evict_file_cache_in_advance = false; if (fs::exists(cache_base_path)) { @@ -581,57 +619,6 @@ TEST_F(BlockFileCacheTest, HandleAlreadyLoadedBlockUpdatesSizeAndTablet) { } } -TEST_F(BlockFileCacheTest, estimate_file_count_skips_removed_directory) { - std::string test_dir = cache_base_path + "/estimate_file_count_removed_dir"; - if (fs::exists(test_dir)) { - fs::remove_all(test_dir); - } - auto keep_dir = fs::path(test_dir) / "keep"; - auto remove_dir = fs::path(test_dir) / "remove"; - fs::create_directories(keep_dir); - fs::create_directories(remove_dir); - - auto keep_file = keep_dir / "data.bin"; - 
std::string one_mb(1024 * 1024, 'd'); - { - std::ofstream ofs(keep_file, std::ios::binary); - ASSERT_TRUE(ofs.good()); - for (int i = 0; i < 3; ++i) { - ofs.write(one_mb.data(), one_mb.size()); - ASSERT_TRUE(ofs.good()); - } - } - - FSFileCacheStorage storage; - storage._cache_base_path = test_dir; - - const std::string sync_point_name = - "FSFileCacheStorage::estimate_file_count_from_statfs::OnDirectory"; - auto* sync_point = doris::SyncPoint::get_instance(); - doris::SyncPoint::CallbackGuard guard(sync_point_name); - sync_point->set_call_back( - sync_point_name, - [remove_dir](std::vector&& args) { - auto* path = doris::try_any_cast(args[0]); - if (*path == remove_dir) { - fs::remove_all(remove_dir); - } - }, - &guard); - sync_point->enable_processing(); - - size_t estimated_files = storage.estimate_file_count_from_statfs(); - - sync_point->disable_processing(); - - ASSERT_EQ(3, estimated_files); - ASSERT_FALSE(fs::exists(remove_dir)); - - if (fs::exists(test_dir)) { - fs::remove_all(test_dir); - } -} - //TODO(zhengyu): check lazy load //TODO(zhengyu): check version2 start //TODO(zhengyu): check version2 version3 mixed start diff --git a/be/test/io/cache/fs_file_cache_storage_leak_cleaner_test.cpp b/be/test/io/cache/fs_file_cache_storage_leak_cleaner_test.cpp new file mode 100644 index 00000000000000..58ccc623e14d1a --- /dev/null +++ b/be/test/io/cache/fs_file_cache_storage_leak_cleaner_test.cpp @@ -0,0 +1,718 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wkeyword-macro" +#endif +#define private public +#define protected public +#if defined(__clang__) +#pragma clang diagnostic pop +#endif +#include "io/cache/block_file_cache.h" +#include "io/cache/fs_file_cache_storage.h" +#undef private +#undef protected + +#include "block_file_cache_test_common.h" + +namespace doris::io { + +namespace fs = std::filesystem; + +class ScopedLeakCleanerConfig { +public: + ScopedLeakCleanerConfig() + : ratio(config::file_cache_leak_fs_to_meta_ratio_threshold), + interval(config::file_cache_leak_scan_interval_seconds), + batch(config::file_cache_leak_scan_batch_files), + pause(config::file_cache_leak_scan_pause_ms), + grace(config::file_cache_leak_grace_seconds) { + config::file_cache_leak_grace_seconds = 0; + } + + ~ScopedLeakCleanerConfig() { + config::file_cache_leak_fs_to_meta_ratio_threshold = ratio; + config::file_cache_leak_scan_interval_seconds = interval; + config::file_cache_leak_scan_batch_files = batch; + config::file_cache_leak_scan_pause_ms = pause; + config::file_cache_leak_grace_seconds = grace; + } + +private: + double ratio; + int64_t interval; + int32_t batch; + int32_t pause; + int64_t grace; +}; + +class ScopedInodeTestHooks { +public: + ScopedInodeTestHooks() { FSFileCacheStorage::set_inode_estimation_test_hooks(&hooks); } + ~ScopedInodeTestHooks() { 
FSFileCacheStorage::set_inode_estimation_test_hooks(nullptr); } + + FSFileCacheStorage::InodeEstimationTestHooks hooks; +}; + +class FSFileCacheLeakCleanerTest : public BlockFileCacheTest { +protected: + static FileCacheSettings default_settings() { + FileCacheSettings settings; + settings.capacity = 10 * 1024 * 1024; + settings.max_file_block_size = 1 * 1024 * 1024; + settings.max_query_cache_size = settings.capacity; + settings.disposable_queue_size = settings.capacity; + settings.disposable_queue_elements = 8; + settings.index_queue_size = settings.capacity; + settings.index_queue_elements = 8; + settings.query_queue_size = settings.capacity; + settings.query_queue_elements = 8; + settings.ttl_queue_size = settings.capacity; + settings.ttl_queue_elements = 8; + settings.storage = "disk"; + return settings; + } + + fs::path prepare_test_dir(const std::string& name) const { + fs::path dir = caches_dir / "leak_cleaner" / name; + std::error_code ec; + fs::remove_all(dir, ec); + fs::create_directories(dir, ec); + return dir; + } + + static std::string current_test_name() { + if (auto* info = ::testing::UnitTest::GetInstance()->current_test_info()) { + return std::string(info->name()); + } + return "unknown"; + } + + fs::path prepare_test_dir() const { return prepare_test_dir(current_test_name()); } + + void setup_storage(FSFileCacheStorage& storage, BlockFileCache& mgr, const fs::path& dir) { + storage._cache_base_path = dir.string(); + storage._mgr = &mgr; + storage._meta_store = std::make_unique(dir.string() + "/meta", 10000); + EXPECT_TRUE(storage._meta_store->init().ok()); + EXPECT_TRUE(storage.write_file_cache_version().ok()); + } + + static void add_metadata_entry(BlockFileCache& mgr, FSFileCacheStorage& storage, + const UInt128Wrapper& hash, size_t offset) { + { + std::lock_guard l(mgr._mutex); + mgr._files[hash].try_emplace(offset); + } + if (storage._meta_store) { + BlockMetaKey mkey(0, hash, offset); + BlockMeta meta(FileCacheType::NORMAL, 16, 0); + 
            // Tail of add_metadata_entry(): persist the block meta, then
            // spin-wait (up to 100 * 50ms = 5s) for the meta store's async
            // write queue to drain so later assertions observe the entry.
            storage._meta_store->put(mkey, meta);
            // Wait for async write to complete for test stability
            for (int i = 0; i < 100 && storage._meta_store->get_write_queue_size() > 0; ++i) {
                std::this_thread::sleep_for(std::chrono::milliseconds(50));
            }
        }
    }

    // Creates `path` (and any missing parent directories) holding a 16-byte
    // payload of `fill`; used to fabricate cache block files and orphans.
    static void create_regular_file(const std::string& path, char fill = 'x') {
        fs::create_directories(fs::path(path).parent_path());
        std::ofstream ofs(path, std::ios::binary | std::ios::trunc);
        ASSERT_TRUE(ofs.good());
        std::string payload(16, fill);
        ofs.write(payload.data(), payload.size());
        ofs.close();
        ASSERT_TRUE(std::filesystem::exists(path));
    }
};

// A non-positive scan interval must disable the cleaner entirely:
// start_leak_cleaner() should not spawn the background thread.
TEST_F(FSFileCacheLeakCleanerTest, disable_when_interval_non_positive) {
    ScopedLeakCleanerConfig guard;
    config::file_cache_leak_scan_interval_seconds = 0;
    auto dir = prepare_test_dir();

    FileCacheSettings settings = default_settings();
    BlockFileCache mgr(dir.string(), settings);
    FSFileCacheStorage storage;
    setup_storage(storage, mgr, dir);

    storage.start_leak_cleaner(&mgr);
    EXPECT_FALSE(storage._cache_leak_cleaner_thread.joinable());
    EXPECT_FALSE(storage._stop_leak_cleaner.load(std::memory_order_relaxed));

    // stop on a never-started cleaner must still be safe and set the flag.
    storage.stop_leak_cleaner();
    EXPECT_TRUE(storage._stop_leak_cleaner.load(std::memory_order_relaxed));
}

// Happy-path lifecycle: with a positive interval the cleaner thread starts
// (joinable) and stop_leak_cleaner() joins it and raises the stop flag.
// The huge ratio threshold keeps the loop from actually cleaning anything.
TEST_F(FSFileCacheLeakCleanerTest, start_and_stop_thread) {
    ScopedLeakCleanerConfig guard;
    config::file_cache_leak_scan_interval_seconds = 1;
    config::file_cache_leak_fs_to_meta_ratio_threshold = 1e12;
    config::file_cache_leak_scan_batch_files = 4;
    config::file_cache_leak_scan_pause_ms = 0;

    auto dir = prepare_test_dir();
    FileCacheSettings settings = default_settings();
    BlockFileCache mgr(dir.string(), settings);

    FSFileCacheStorage storage;
    setup_storage(storage, mgr, dir);

    add_metadata_entry(mgr, storage, BlockFileCache::hash("thread_guard"), 0);

    storage.start_leak_cleaner(&mgr);
    ASSERT_TRUE(storage._cache_leak_cleaner_thread.joinable());

    storage.stop_leak_cleaner();

    EXPECT_TRUE(storage._stop_leak_cleaner.load(std::memory_order_relaxed));
    EXPECT_FALSE(storage._cache_leak_cleaner_thread.joinable());
}

// When fs-file-count / meta-count stays below the configured ratio threshold
// the pass must bail out after the single statvfs probe and delete nothing,
// even though an orphan file exists on disk.
TEST_F(FSFileCacheLeakCleanerTest, skip_cleanup_when_ratio_below_threshold) {
    ScopedLeakCleanerConfig guard;
    config::file_cache_leak_fs_to_meta_ratio_threshold = 1e12;
    config::file_cache_leak_scan_interval_seconds = 1;

    ScopedInodeTestHooks hooks_guard;
    // NOTE(review): template arguments appear stripped by patch extraction
    // here and below (likely std::atomic<int>) — confirm against the repo.
    std::atomic statvfs_calls {0};
    hooks_guard.hooks.statvfs_override = [&statvfs_calls](const std::string&, struct statvfs* vfs) {
        statvfs_calls.fetch_add(1, std::memory_order_relaxed);
        *vfs = {};
        vfs->f_files = 100;
        vfs->f_ffree = 36;
        return 0;
    };
    hooks_guard.hooks.lstat_override = [](const std::string&, struct stat* st) {
        *st = {};
        st->st_dev = 1;
        return 0;
    };
    hooks_guard.hooks.non_cache_override = [](const FSFileCacheStorage&) { return 0; };
    hooks_guard.hooks.cache_dir_override = [](const FSFileCacheStorage&) { return 0; };

    auto dir = prepare_test_dir();
    FileCacheSettings settings = default_settings();
    BlockFileCache mgr(dir.string(), settings);
    FSFileCacheStorage storage;
    setup_storage(storage, mgr, dir);

    // 64 real metadata entries keep the fs/meta ratio tiny.
    const auto metadata_hash = BlockFileCache::hash("metadata_key");
    for (size_t i = 0; i < 64; ++i) {
        add_metadata_entry(mgr, storage, metadata_hash, i);
    }

    const auto orphan_hash = BlockFileCache::hash("ratio_skip_orphan");
    const auto orphan_dir = storage.get_path_in_local_cache_v3(orphan_hash);
    create_regular_file(FSFileCacheStorage::get_path_in_local_cache_v3(orphan_dir, 0, false));

    storage.run_leak_cleanup(&mgr);
    EXPECT_TRUE(std::filesystem::exists(
            FSFileCacheStorage::get_path_in_local_cache_v3(orphan_dir, 0, false)));
    EXPECT_EQ(1, statvfs_calls.load(std::memory_order_relaxed));
}

// A cleanup pass over a ratio-exceeding cache must delete orphan block files
// (no metadata) and temporary files, while keeping blocks that do have
// metadata; batch size 1 exercises the batched-deletion path.
TEST_F(FSFileCacheLeakCleanerTest, remove_orphan_and_tmp_files) {
    ScopedLeakCleanerConfig guard;
    config::file_cache_leak_scan_batch_files = 1;
    config::file_cache_leak_scan_pause_ms = 0;

    config::file_cache_leak_scan_interval_seconds = 1;

    ScopedInodeTestHooks hooks_guard;
    hooks_guard.hooks.statvfs_override = [](const std::string&, struct statvfs* vfs) {
        *vfs = {};
        vfs->f_files = 100;
        vfs->f_ffree = 10; // 90 used
        return 0;
    };
    hooks_guard.hooks.non_cache_override = [](const FSFileCacheStorage&) { return 0; };
    hooks_guard.hooks.cache_dir_override = [](const FSFileCacheStorage&) { return 0; };

    auto dir = prepare_test_dir();
    FileCacheSettings settings = default_settings();
    BlockFileCache mgr(dir.string(), settings);
    FSFileCacheStorage storage;
    setup_storage(storage, mgr, dir);

    auto kept_hash = BlockFileCache::hash("kept_hash");
    add_metadata_entry(mgr, storage, kept_hash, 0);

    auto kept_dir = storage.get_path_in_local_cache_v3(kept_hash);
    auto kept_file = FSFileCacheStorage::get_path_in_local_cache_v3(kept_dir, 0, false);
    auto tmp_file = FSFileCacheStorage::get_path_in_local_cache_v3(kept_dir, 8, true);
    create_regular_file(kept_file, 'k');
    create_regular_file(tmp_file, 't');

    auto orphan_hash = BlockFileCache::hash("orphan_hash");
    auto orphan_dir = storage.get_path_in_local_cache_v3(orphan_hash);
    auto orphan_file = FSFileCacheStorage::get_path_in_local_cache_v3(orphan_dir, 4, false);
    create_regular_file(orphan_file, 'o');

    storage.run_leak_cleanup(&mgr);

    // NOTE(review): 5s hard sleep — presumably waits for deferred deletion;
    // consider polling for the files' disappearance instead to speed the test.
    std::this_thread::sleep_for(std::chrono::milliseconds(5000));

    EXPECT_TRUE(std::filesystem::exists(kept_file));
    EXPECT_FALSE(std::filesystem::exists(tmp_file));
    EXPECT_FALSE(std::filesystem::exists(orphan_file));
    EXPECT_FALSE(std::filesystem::exists(orphan_dir));

    std::error_code ec;
    fs::remove_all(dir, ec);
}

// snapshot_metadata_for_hash_offsets(): empty result for an unknown hash,
// and exactly the stored offsets (order-insensitive) once entries exist.
TEST_F(FSFileCacheLeakCleanerTest, snapshot_metadata_for_hash_offsets_handles_missing_hash) {
    auto dir = prepare_test_dir();
    FileCacheSettings settings = default_settings();
    BlockFileCache mgr(dir.string(), settings);
    FSFileCacheStorage storage;
    setup_storage(storage, mgr, dir);

    config::file_cache_leak_scan_interval_seconds = 1;

    auto missing_hash = BlockFileCache::hash("missing_hash_case");
    auto offsets = storage.snapshot_metadata_for_hash_offsets(&mgr, missing_hash);
    EXPECT_TRUE(offsets.empty());

    add_metadata_entry(mgr, storage, missing_hash, 7);
    add_metadata_entry(mgr, storage, missing_hash, 3);

    offsets = storage.snapshot_metadata_for_hash_offsets(&mgr, missing_hash);
    std::sort(offsets.begin(), offsets.end());
    ASSERT_EQ(2, offsets.size());
    EXPECT_EQ(3u, offsets[0]);
    EXPECT_EQ(7u, offsets[1]);
}

// Injects a std::exception from the loop's before_run sync point and checks
// the loop catches it (worker joins cleanly, callback fired at least once).
TEST_F(FSFileCacheLeakCleanerTest, leak_cleaner_loop_catches_std_exception) {
    ScopedLeakCleanerConfig guard;
    config::file_cache_leak_scan_interval_seconds = 1;

    auto dir = prepare_test_dir();
    FileCacheSettings settings = default_settings();
    BlockFileCache mgr(dir.string(), settings);

    FSFileCacheStorage storage;
    setup_storage(storage, mgr, dir);

    // NOTE(review): template arguments on atomics / try_any_cast appear
    // stripped by patch extraction in this fragment — confirm against repo.
    std::atomic callback_count {0};
    auto sp = SyncPoint::get_instance();
    // Zero out the initial delay and interval so the loop runs immediately.
    sp->set_call_back("FSFileCacheStorage::leak_cleaner_loop::initial_delay",
                      [](auto&& args) { *try_any_cast(args[0]) = 0; });
    sp->set_call_back("FSFileCacheStorage::leak_cleaner_loop::interval",
                      [](auto&& args) { *try_any_cast(args[0]) = 0; });
    sp->set_call_back("FSFileCacheStorage::leak_cleaner_loop::before_run",
                      [&storage, &callback_count](auto&&) {
                          callback_count.fetch_add(1, std::memory_order_relaxed);
                          storage._stop_leak_cleaner.store(true, std::memory_order_relaxed);
                          storage._leak_cleaner_cv.notify_all();
                          throw std::runtime_error("injected std exception");
                      });
    sp->enable_processing();

    storage._stop_leak_cleaner.store(false, std::memory_order_relaxed);
    std::thread worker([&]() { storage.leak_cleaner_loop(); });

    // Nudge the loop awake until the injected callback has fired once.
    for (int i = 0; i < 100 && callback_count.load(std::memory_order_relaxed) == 0; ++i) {
        storage._leak_cleaner_cv.notify_all();
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }

    storage._stop_leak_cleaner.store(true, std::memory_order_relaxed);
    storage._leak_cleaner_cv.notify_all();
    worker.join();

    sp->disable_processing();
    sp->clear_all_call_backs();

    ASSERT_GE(callback_count.load(std::memory_order_relaxed), 1);
}

// Same as above but throws a non-std::exception type, exercising the
// catch(...) path of the loop.
TEST_F(FSFileCacheLeakCleanerTest, leak_cleaner_loop_catches_unknown_exception) {
    ScopedLeakCleanerConfig guard;
    config::file_cache_leak_scan_interval_seconds = 1;

    auto dir = prepare_test_dir();
    FileCacheSettings settings = default_settings();
    BlockFileCache mgr(dir.string(), settings);

    FSFileCacheStorage storage;
    setup_storage(storage, mgr, dir);

    struct NonStdException {};

    std::atomic callback_count {0};
    auto sp = SyncPoint::get_instance();
    sp->set_call_back("FSFileCacheStorage::leak_cleaner_loop::initial_delay",
                      [](auto&& args) { *try_any_cast(args[0]) = 0; });
    sp->set_call_back("FSFileCacheStorage::leak_cleaner_loop::interval",
                      [](auto&& args) { *try_any_cast(args[0]) = 0; });
    sp->set_call_back("FSFileCacheStorage::leak_cleaner_loop::before_run",
                      [&storage, &callback_count](auto&&) {
                          callback_count.fetch_add(1, std::memory_order_relaxed);
                          storage._stop_leak_cleaner.store(true, std::memory_order_relaxed);
                          storage._leak_cleaner_cv.notify_all();
                          throw NonStdException {};
                      });
    sp->enable_processing();

    storage._stop_leak_cleaner.store(false, std::memory_order_relaxed);
    std::thread worker([&]() { storage.leak_cleaner_loop(); });

    for (int i = 0; i < 100 && callback_count.load(std::memory_order_relaxed) == 0; ++i) {
        storage._leak_cleaner_cv.notify_all();
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }

    storage._stop_leak_cleaner.store(true, std::memory_order_relaxed);
    storage._leak_cleaner_cv.notify_all();
    worker.join();

    sp->disable_processing();
    sp->clear_all_call_backs();

    ASSERT_GE(callback_count.load(std::memory_order_relaxed), 1);
}

// An on-disk block whose hash has zero metadata entries is an orphan: a
// cleanup run must remove the file plus its emptied key and prefix dirs.
TEST_F(FSFileCacheLeakCleanerTest, run_leak_cleanup_removes_orphan_when_metadata_missing) {
    ScopedLeakCleanerConfig guard;
    config::file_cache_leak_fs_to_meta_ratio_threshold = 0.5;
    config::file_cache_leak_grace_seconds = 0;
    config::file_cache_leak_scan_interval_seconds = 1;

    ScopedInodeTestHooks hooks_guard;
    hooks_guard.hooks.statvfs_override = [](const std::string&, struct statvfs* vfs) {
        *vfs = {};
        vfs->f_files = 1000;
        vfs->f_ffree = 900; // 100 used
        return 0;
    };
    hooks_guard.hooks.non_cache_override = [](const FSFileCacheStorage&) { return 0; };
    hooks_guard.hooks.cache_dir_override = [](const FSFileCacheStorage&) { return 0; };

    auto dir = prepare_test_dir();
    FileCacheSettings settings = default_settings();
    BlockFileCache mgr(dir.string(), settings);

    FSFileCacheStorage storage;
    setup_storage(storage, mgr, dir);

    auto hash = BlockFileCache::hash("zero_meta_orphan");
    auto key_dir = storage.get_path_in_local_cache_v3(hash);
    fs::create_directories(key_dir);
    auto orphan_path = FSFileCacheStorage::get_path_in_local_cache_v3(key_dir, 0, false);
    create_regular_file(orphan_path, 'z');

    // One unrelated metadata entry keeps the meta store non-empty.
    auto dummy_hash = BlockFileCache::hash("dummy");
    add_metadata_entry(mgr, storage, dummy_hash, 0);

    storage.run_leak_cleanup(&mgr);

    auto prefix_dir = fs::path(key_dir).parent_path();
    EXPECT_FALSE(fs::exists(orphan_path));
    EXPECT_FALSE(fs::exists(key_dir));
    EXPECT_FALSE(fs::exists(prefix_dir));
}

// cleanup_leaked_files() must be a no-op (no crash) when the cache base
// directory does not exist at all.
TEST_F(FSFileCacheLeakCleanerTest, cleanup_handles_missing_base_directory) {
    ScopedLeakCleanerConfig guard;
    config::file_cache_leak_scan_interval_seconds = 1;

    auto dir = prepare_test_dir();
    FileCacheSettings settings = default_settings();
    BlockFileCache mgr(dir.string(), settings);

    FSFileCacheStorage storage;
    setup_storage(storage, mgr, dir / "missing_root");
    fs::path missing_path(storage._cache_base_path);
    if (fs::exists(missing_path)) {
        fs::remove_all(missing_path);
    }

    storage.cleanup_leaked_files(&mgr, 0);
    EXPECT_FALSE(fs::exists(missing_path));
}

// Entries that are not valid cache layout (non-directories, the meta dir,
// wrong-length prefixes, malformed key names) must be skipped, not deleted.
TEST_F(FSFileCacheLeakCleanerTest, cleanup_skips_invalid_prefixes_and_keys) {
    ScopedLeakCleanerConfig guard;
    config::file_cache_leak_grace_seconds = 0;
    config::file_cache_leak_scan_interval_seconds = 1;

    auto dir = prepare_test_dir();
    FileCacheSettings settings = default_settings();
    BlockFileCache mgr(dir.string(), settings);

    FSFileCacheStorage storage;
    setup_storage(storage, mgr, dir);

    fs::create_directories(dir);
    create_regular_file((dir / "root_file").string()); // non-directory prefix entry
    fs::create_directories(dir / FSFileCacheStorage::META_DIR_NAME); // meta dir skip
    fs::create_directories(dir / "abcd"); // invalid prefix length

    auto prefix_dir = dir / "abc";
    fs::create_directories(prefix_dir);
    create_regular_file((prefix_dir / "plain_file").string()); // !key_it->is_directory branch
    fs::create_directories(prefix_dir / "deadbeef" /* missing '_' */);
    fs::create_directories(prefix_dir / "zzzg000_0" /* invalid hex */);
    fs::create_directories(prefix_dir / "123abc_bad" /* invalid expiration */);

    storage.cleanup_leaked_files(&mgr, 0);

    EXPECT_TRUE(fs::exists(prefix_dir));
    EXPECT_TRUE(fs::exists(dir / "abcd"));
}

// With a non-zero expected meta count and a block backed by metadata, a
// cleanup pass that finds no candidates must leave the block untouched.
TEST_F(FSFileCacheLeakCleanerTest, cleanup_flush_candidates_when_empty) {
    ScopedLeakCleanerConfig guard;
    config::file_cache_leak_grace_seconds = 0;
    config::file_cache_leak_scan_interval_seconds = 1;

    auto dir = prepare_test_dir();
    FileCacheSettings settings = default_settings();
    BlockFileCache mgr(dir.string(), settings);

    FSFileCacheStorage storage;
    setup_storage(storage, mgr, dir);

    auto hash = BlockFileCache::hash("metadata_kept_hash");
    add_metadata_entry(mgr, storage, hash, 0);

    auto key_dir = storage.get_path_in_local_cache_v3(hash);
    fs::create_directories(key_dir);
    auto file_path = FSFileCacheStorage::get_path_in_local_cache_v3(key_dir, 0, false);
    create_regular_file(file_path, 'm');

    storage.cleanup_leaked_files(&mgr, 1);

    EXPECT_TRUE(fs::exists(file_path));
}

+TEST_F(FSFileCacheLeakCleanerTest, cleanup_flush_candidates_remove_directories) { + ScopedLeakCleanerConfig guard; + config::file_cache_leak_grace_seconds = 0; + config::file_cache_leak_scan_batch_files = 2; + config::file_cache_leak_scan_interval_seconds = 1; + + auto dir = prepare_test_dir(); + FileCacheSettings settings = default_settings(); + BlockFileCache mgr(dir.string(), settings); + + FSFileCacheStorage storage; + setup_storage(storage, mgr, dir); + + auto hash = BlockFileCache::hash("cleanup_orphan_batch"); + auto key_dir = storage.get_path_in_local_cache_v3(hash); + fs::create_directories(key_dir); + auto orphan_path = FSFileCacheStorage::get_path_in_local_cache_v3(key_dir, 4, false); + create_regular_file(orphan_path, 'c'); + + storage.cleanup_leaked_files(&mgr, 0); + + auto prefix_dir = fs::path(key_dir).parent_path(); + EXPECT_FALSE(fs::exists(orphan_path)); + EXPECT_FALSE(fs::exists(key_dir)); + EXPECT_FALSE(fs::exists(prefix_dir)); +} + +TEST_F(FSFileCacheLeakCleanerTest, estimate_file_count_handles_statvfs_failure) { + ScopedInodeTestHooks hooks_guard; + config::file_cache_leak_scan_interval_seconds = 1; + hooks_guard.hooks.statvfs_override = [](const std::string&, struct statvfs* vfs) { + *vfs = {}; + errno = EIO; + return -1; + }; + + FSFileCacheStorage storage; + storage._cache_base_path = "/tmp/nonexistent_statvfs"; + EXPECT_EQ(0u, storage.estimate_file_count_from_inode()); +} + +TEST_F(FSFileCacheLeakCleanerTest, estimate_file_count_handles_zero_total_inodes) { + ScopedInodeTestHooks hooks_guard; + config::file_cache_leak_scan_interval_seconds = 1; + hooks_guard.hooks.statvfs_override = [](const std::string&, struct statvfs* vfs) { + *vfs = {}; + vfs->f_files = 0; + return 0; + }; + hooks_guard.hooks.lstat_override = [](const std::string&, struct stat* st) { + *st = {}; + st->st_dev = 1; + return 0; + }; + + FSFileCacheStorage storage; + storage._cache_base_path = "/tmp/cache_zero_inodes"; + EXPECT_EQ(0u, 
storage.estimate_file_count_from_inode()); +} + +TEST_F(FSFileCacheLeakCleanerTest, estimate_file_count_handles_lstat_failure) { + ScopedInodeTestHooks hooks_guard; + config::file_cache_leak_scan_interval_seconds = 1; + hooks_guard.hooks.statvfs_override = [](const std::string&, struct statvfs* vfs) { + *vfs = {}; + vfs->f_files = 100; + vfs->f_ffree = 10; + return 0; + }; + hooks_guard.hooks.lstat_override = [](const std::string&, struct stat*) { + errno = ENOENT; + return -1; + }; + + FSFileCacheStorage storage; + storage._cache_base_path = "/tmp/cache_lstat_failure"; + EXPECT_EQ(0u, storage.estimate_file_count_from_inode()); +} + +TEST_F(FSFileCacheLeakCleanerTest, estimate_file_count_handles_underflow) { + ScopedInodeTestHooks hooks_guard; + config::file_cache_leak_scan_interval_seconds = 1; + hooks_guard.hooks.statvfs_override = [](const std::string&, struct statvfs* vfs) { + *vfs = {}; + vfs->f_files = 200; + vfs->f_ffree = 150; + return 0; + }; + hooks_guard.hooks.lstat_override = [](const std::string&, struct stat* st) { + *st = {}; + st->st_dev = 9; + return 0; + }; + hooks_guard.hooks.non_cache_override = [](const FSFileCacheStorage&) { return 80; }; + hooks_guard.hooks.cache_dir_override = [](const FSFileCacheStorage&) { return 90; }; + + FSFileCacheStorage storage; + storage._cache_base_path = "/tmp/cache_underflow"; + EXPECT_EQ(0u, storage.estimate_file_count_from_inode()); +} + +TEST_F(FSFileCacheLeakCleanerTest, estimate_file_count_combines_counts) { + ScopedInodeTestHooks hooks_guard; + config::file_cache_leak_scan_interval_seconds = 1; + hooks_guard.hooks.statvfs_override = [](const std::string&, struct statvfs* vfs) { + *vfs = {}; + vfs->f_files = 500; + vfs->f_ffree = 200; + return 0; + }; + hooks_guard.hooks.lstat_override = [](const std::string&, struct stat* st) { + *st = {}; + st->st_dev = 7; + return 0; + }; + hooks_guard.hooks.non_cache_override = [](const FSFileCacheStorage&) { return 50; }; + hooks_guard.hooks.cache_dir_override = 
[](const FSFileCacheStorage&) { return 30; }; + + FSFileCacheStorage storage; + storage._cache_base_path = "/tmp/cache_estimation"; + EXPECT_EQ(220u, storage.estimate_file_count_from_inode()); +} + +TEST_F(FSFileCacheLeakCleanerTest, estimate_non_cache_inode_usage_counts_other_paths) { + auto root = prepare_test_dir("inode_non_cache_root"); + auto cache_dir = root / "cache"; + auto other_dir = root / "others"; + auto nested_dir = other_dir / "nested"; + fs::create_directories(cache_dir); + fs::create_directories(nested_dir); + create_regular_file((root / "root_file.bin").string()); + create_regular_file((other_dir / "leaf.txt").string()); + create_regular_file((nested_dir / "inner.bin").string()); + + FSFileCacheStorage storage; + storage._cache_base_path = cache_dir.string(); + + ScopedInodeTestHooks hooks_guard; + hooks_guard.hooks.find_mount_root_override = [root](const FSFileCacheStorage&, dev_t) { + return root; + }; + + EXPECT_EQ(6u, storage.estimate_non_cache_inode_usage()); +} + +TEST_F(FSFileCacheLeakCleanerTest, estimate_cache_directory_inode_usage_samples_prefixes) { + auto base = prepare_test_dir("inode_cache_directory"); + auto prefix_a = base / "abc"; + auto prefix_b = base / "def"; + fs::create_directories(prefix_a); + fs::create_directories(prefix_b); + fs::create_directories(base / FSFileCacheStorage::META_DIR_NAME); + fs::create_directories(base / "abcd"); + fs::create_directories(prefix_a / "deadbeef_0"); + fs::create_directories(prefix_b / "feed000_0"); + fs::create_directories(prefix_b / "feed000_1"); + fs::create_directories(prefix_b / "feed000_2"); + + FSFileCacheStorage storage; + storage._cache_base_path = base.string(); + + EXPECT_EQ(6u, storage.estimate_cache_directory_inode_usage()); +} + +TEST_F(FSFileCacheLeakCleanerTest, count_inodes_for_path_respects_exclusions) { + auto base = prepare_test_dir("inode_counting"); + auto include_dir = base / "include"; + auto exclude_dir = base / "exclude"; + fs::create_directories(include_dir); + 
fs::create_directories(exclude_dir); + create_regular_file((base / "root.bin").string()); + create_regular_file((include_dir / "child.bin").string()); + create_regular_file((exclude_dir / "skip.bin").string()); + + FSFileCacheStorage storage; + struct stat st {}; + ASSERT_EQ(0, lstat(base.c_str(), &st)); + std::unordered_set visited; + size_t count = storage.count_inodes_for_path(base, st.st_dev, exclude_dir, visited); + EXPECT_EQ(4u, count); +} + +TEST_F(FSFileCacheLeakCleanerTest, is_cache_prefix_directory_filters_entries) { + auto base = prepare_test_dir("inode_prefix_filter"); + auto valid = base / "abc"; + auto invalid = base / "abcd"; + auto meta_dir = base / FSFileCacheStorage::META_DIR_NAME; + fs::create_directories(valid); + fs::create_directories(invalid); + fs::create_directories(meta_dir); + create_regular_file((base / "plain_file").string()); + + FSFileCacheStorage storage; + storage._cache_base_path = base.string(); + + EXPECT_TRUE(storage.is_cache_prefix_directory(fs::directory_entry(valid))); + EXPECT_FALSE(storage.is_cache_prefix_directory(fs::directory_entry(invalid))); + EXPECT_FALSE(storage.is_cache_prefix_directory(fs::directory_entry(meta_dir))); + EXPECT_FALSE(storage.is_cache_prefix_directory(fs::directory_entry(base / "plain_file"))); +} + +} // namespace doris::io