@@ -106,34 +106,33 @@ class MarkLoader
106106 col_mark_fname);
107107
108108 const auto & merged_file_info = info_iter->second ;
109- auto file_path = dmfile_meta->mergedPath (merged_file_info.number );
110- auto encrypt_path = dmfile_meta->encryptionMergedPath (merged_file_info.number );
111- auto offset = merged_file_info.offset ;
112- auto data_size = merged_file_info.size ;
109+ const auto file_path = dmfile_meta->mergedPath (merged_file_info.number );
110+ const auto offset = merged_file_info.offset ;
111+ const auto data_size = merged_file_info.size ;
113112
114113 if (data_size == 0 )
115114 return res;
116115
117116 // First, read from merged file to get the raw data(contains the header)
117+ // Note that we use min(`data_size`, checksum_frame_size) as the buffer size in order
118+ // to minimize read amplification in the merged file.
118119 auto buffer = ReadBufferFromRandomAccessFileBuilder::build (
119120 reader.file_provider ,
120121 file_path,
121- encrypt_path ,
122- reader.dmfile ->getConfiguration ()->getChecksumFrameLength (),
122+ dmfile_meta-> encryptionMergedPath (merged_file_info. number ) ,
123+ std::min (data_size, reader.dmfile ->getConfiguration ()->getChecksumFrameLength () ),
123124 read_limiter);
124125 buffer.seek (offset);
125126
126127 // Read the raw data into memory. It is OK because the mark merged into
127128 // merged_file is small enough.
128- String raw_data;
129- raw_data.resize (data_size);
129+ String raw_data (data_size, ' \0 ' );
130130 buffer.read (reinterpret_cast <char *>(raw_data.data ()), data_size);
131131
132- // Then read from the buffer based on the raw data
132+ // Then read from the buffer based on the raw data. The buffer size is min(data.size(), checksum_frame_size)
133133 auto buf = ChecksumReadBufferBuilder::build (
134134 std::move (raw_data),
135135 file_path, // just for debug, the buffer is part of the merged file
136- reader.dmfile ->getConfiguration ()->getChecksumFrameLength (),
137136 reader.dmfile ->getConfiguration ()->getChecksumAlgorithm (),
138137 reader.dmfile ->getConfiguration ()->getChecksumFrameLength ());
139138 buf->readBig (reinterpret_cast <char *>(res->data ()), bytes_size);
@@ -234,9 +233,10 @@ std::unique_ptr<CompressedSeekableReaderBuffer> ColumnReadStream::buildColDataRe
234233{
235234 const auto * dmfile_meta = typeid_cast<const DMFileMetaV2 *>(reader.dmfile ->meta .get ());
236235 assert (dmfile_meta != nullptr );
237- const auto & info = dmfile_meta->merged_sub_file_infos .find (colDataFileName (file_name_base));
238- if (info == dmfile_meta->merged_sub_file_infos .end ())
236+ const auto & info_iter = dmfile_meta->merged_sub_file_infos .find (colDataFileName (file_name_base));
237+ if (info_iter == dmfile_meta->merged_sub_file_infos .end ())
239238 {
239+ // Not merged into merged file, read from the original data file.
240240 return CompressedReadBufferFromFileBuilder::build (
241241 reader.file_provider ,
242242 reader.dmfile ->colDataPath (file_name_base),
@@ -247,32 +247,31 @@ std::unique_ptr<CompressedSeekableReaderBuffer> ColumnReadStream::buildColDataRe
247247 reader.dmfile ->getConfiguration ()->getChecksumFrameLength ());
248248 }
249249
250- assert (info != dmfile_meta->merged_sub_file_infos .end ());
251- auto file_path = dmfile_meta->mergedPath (info->second .number );
252- auto encrypt_path = dmfile_meta->encryptionMergedPath (info->second .number );
253- auto offset = info->second .offset ;
254- auto size = info->second .size ;
250+ assert (info_iter != dmfile_meta->merged_sub_file_infos .end ());
251+ auto file_path = dmfile_meta->mergedPath (info_iter->second .number );
252+ const auto offset = info_iter->second .offset ;
253+ const auto data_size = info_iter->second .size ;
255254
256255 // First, read from merged file to get the raw data(contains the header)
256+ // Note that we use min(`data_size`, checksum_frame_size) as the buffer size in order
257+ // to minimize read amplification in the merged file.
257258 auto buffer = ReadBufferFromRandomAccessFileBuilder::build (
258259 reader.file_provider ,
259260 file_path,
260- encrypt_path ,
261- reader.dmfile ->getConfiguration ()->getChecksumFrameLength (),
261+ dmfile_meta-> encryptionMergedPath (info_iter-> second . number ) ,
262+ std::min (data_size, reader.dmfile ->getConfiguration ()->getChecksumFrameLength () ),
262263 read_limiter);
263264 buffer.seek (offset);
264265
265266 // Read the raw data into memory. It is OK because the column data merged into
266267 // merged_file is small enough.
267- String raw_data;
268- raw_data.resize (size);
269- buffer.read (reinterpret_cast <char *>(raw_data.data ()), size);
268+ String raw_data (data_size, ' \0 ' );
269+ buffer.read (reinterpret_cast <char *>(raw_data.data ()), data_size);
270270
271- // Then read from the buffer based on the raw data
271+ // Then read from the buffer based on the raw data. The buffer size is min(data.size(), checksum_frame_size)
272272 return CompressedReadBufferFromFileBuilder::build (
273273 std::move (raw_data),
274274 file_path,
275- reader.dmfile ->getConfiguration ()->getChecksumFrameLength (),
276275 reader.dmfile ->getConfiguration ()->getChecksumAlgorithm (),
277276 reader.dmfile ->getConfiguration ()->getChecksumFrameLength ());
278277}
0 commit comments