From e4cdc6576a135b872f62ee4b275f95dc30b5a494 Mon Sep 17 00:00:00 2001
From: Jason Hu
Date: Sat, 21 Mar 2026 22:20:47 +0800
Subject: [PATCH 1/4] Update MaxTime for GetActualDocumentCount to keep
 aligned with GetDocumentCount method

---
 OnlineMongoMigrationProcessor/Helpers/Mongo/MongoHelper.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/OnlineMongoMigrationProcessor/Helpers/Mongo/MongoHelper.cs b/OnlineMongoMigrationProcessor/Helpers/Mongo/MongoHelper.cs
index 7373b94..54adbb3 100644
--- a/OnlineMongoMigrationProcessor/Helpers/Mongo/MongoHelper.cs
+++ b/OnlineMongoMigrationProcessor/Helpers/Mongo/MongoHelper.cs
@@ -118,7 +118,7 @@ public static long GetActualDocumentCount(IMongoCollection<BsonDocument> collect
         {
             FilterDefinition<BsonDocument>? userFilter = GetFilterDoc(mu.UserFilter);
             var filter = userFilter ?? Builders<BsonDocument>.Filter.Empty;
-            return collection.CountDocuments(filter);
+            return collection.CountDocuments(filter, new CountOptions { MaxTime = TimeSpan.FromMinutes(10) });
         }
 
         /// <summary>
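The 10-minute MaxTime means an accurate count can now fail with a server-side timeout instead of hanging indefinitely. A minimal caller-side sketch of what that implies, using the .NET driver's MongoExecutionTimeoutException; the fallback to the metadata-based estimate is a hypothetical illustration, not part of this patch:

```csharp
using System;
using MongoDB.Bson;
using MongoDB.Driver;

public static class CountWithTimeoutSketch
{
    public static long CountOrEstimate(
        IMongoCollection<BsonDocument> collection,
        FilterDefinition<BsonDocument> filter)
    {
        try
        {
            // Same options as the patched GetActualDocumentCount: bound the
            // server-side execution time so a huge filtered count cannot hang.
            return collection.CountDocuments(filter, new CountOptions { MaxTime = TimeSpan.FromMinutes(10) });
        }
        catch (MongoExecutionTimeoutException)
        {
            // Hypothetical fallback (not in the patch): use the cheap
            // metadata-based estimate when the accurate count times out.
            return collection.EstimatedDocumentCount();
        }
    }
}
```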
From e1b06255a0381ba64cdfd0c2a13e07c04f1e9d16 Mon Sep 17 00:00:00 2001
From: Jason Hu
Date: Sat, 21 Mar 2026 22:59:10 +0800
Subject: [PATCH 2/4] Refactor SamplePartitioner to optimize chunk and sample
 count calculations for MongoDB dump scenarios

---
 .../Partitioner/SamplePartitioner.cs          | 89 +++++++++----------
 1 file changed, 43 insertions(+), 46 deletions(-)

diff --git a/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs b/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs
index c41e8b4..61c84f0 100644
--- a/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs
+++ b/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs
@@ -143,62 +143,48 @@ public static int GetMaxSamples()
                 }
             }
 
-            if (chunkCount > adjustedMaxSamples)
+            bool usePaginationPartitioner = dataType != DataType.ObjectId && config.NonObjectIdPartitioner == PartitionerType.UsePagination;
+
+            if (optimizeForMongoDump)
             {
-                int count = 2;
-                long newCount = docCountByType / ((long)minDocsPerSegment * count);
-                while (newCount > adjustedMaxSamples)
-                {
-                    count++;
+                // DumpAndRestore: oversample then pick equidistant quantile boundaries
+                chunkCount = Math.Max(1, (int)Math.Ceiling((double)docCountByType / minDocsPerChunk));
+                segmentCount = 1;
 
-                    // Check for potential overflow before multiplication
-                    long multiplier = (long)minDocsPerSegment * count;
-                    if (multiplier <= 0 || multiplier > docCountByType)
-                    {
-                        break;
-                    }
-                    newCount = docCountByType / multiplier;
-                    MigrationJobContext.AddVerboseLog($"SamplePartitioner.Calculating adjustedMaxSamples: collection={collection.CollectionNamespace}, newCount={newCount}, dataType={dataType}, docCountByType={docCountByType}, multiplier={multiplier}");
-                }
-
+                // Oversample ~200 per chunk, cap at 4% of doc count and hard cap
+                long estimatedAvgObjSize = docCountByType > 0 ? Math.Max(1, minDocsPerChunk > 0 ? minDocsPerChunk : 1024) : 1024;
+                sampleCount = (int)Math.Min(
+                    (long)chunkCount * 200,
+                    Math.Min((long)Math.Floor(docCountByType * 0.04), adjustedMaxSamples));
+                sampleCount = Math.Max(sampleCount, chunkCount); // at least chunkCount samples
 
-                log.WriteLine($"Requested chunk count {chunkCount} exceeds maximum samples {adjustedMaxSamples} for {collection.CollectionNamespace}. Adjusting to {newCount}", LogType.Warning);
-                chunkCount = (int)newCount;
+                MigrationJobContext.AddVerboseLog($"SamplePartitioner DumpAndRestore: collection={collection.CollectionNamespace}, dataType={dataType}, chunkCount={chunkCount}, sampleCount={sampleCount}");
             }
+            else
+            {
+                // MongoDriver path: compute segments for parallel writes within each chunk
+                chunkCount = Math.Max(1, (int)Math.Ceiling((double)docCountByType / minDocsPerChunk));
+                docsInChunk = docCountByType / chunkCount;
+                segmentCount = Math.Min(
+                    Math.Max(1, (int)Math.Ceiling((double)docsInChunk / minDocsPerSegment)),
+                    GetMaxSegments()
+                );
 
-            // Ensure minimum documents per chunk
-            chunkCount = Math.Max(1, (int)Math.Ceiling((double)docCountByType / minDocsPerChunk));
-            docsInChunk = docCountByType / chunkCount;
-
-            // Calculate segments based on documents per chunk
-            segmentCount = Math.Min(
-                Math.Max(1, (int)Math.Ceiling((double)docsInChunk / minDocsPerSegment)),
-                GetMaxSegments()
-            );
-
-
-            // Calculate the total sample count
-            sampleCount = Math.Min(chunkCount * segmentCount, adjustedMaxSamples);
-
-
-            MigrationJobContext.AddVerboseLog($"SamplePartitioner.Calculating sampleCount: collection={collection.CollectionNamespace}, dataType={dataType}, segmentCount={segmentCount}, sampleCount{sampleCount}");
-
-            // Adjust segments per chunk based on the new sample count
-            segmentCount = Math.Max(1, sampleCount / chunkCount);
-
-            bool usePaginationPartitioner = dataType != DataType.ObjectId && config.NonObjectIdPartitioner == PartitionerType.UsePagination;
+                sampleCount = Math.Min(chunkCount * segmentCount, adjustedMaxSamples);
+                segmentCount = Math.Max(1, sampleCount / chunkCount);
 
-            // Optimize for non-dump scenarios
-            if (!optimizeForMongoDump && !usePaginationPartitioner)
-            {
-                while (chunkCount > segmentCount && segmentCount < GetMaxSegments())
+                if (!usePaginationPartitioner)
                 {
-                    chunkCount--;
-                    segmentCount++;
+                    while (chunkCount > segmentCount && segmentCount < GetMaxSegments())
+                    {
+                        chunkCount--;
+                        segmentCount++;
+                    }
+                    chunkCount = sampleCount / segmentCount;
                 }
-                chunkCount = sampleCount / segmentCount;
+
+                MigrationJobContext.AddVerboseLog($"SamplePartitioner MongoDriver: collection={collection.CollectionNamespace}, dataType={dataType}, chunkCount={chunkCount}, segmentCount={segmentCount}, sampleCount={sampleCount}");
             }
 
             MigrationJobContext.AddVerboseLog($"SamplePartitioner.Calculating chunkCount: collection={collection.CollectionNamespace}, dataType={dataType}, chunkCount={chunkCount}");
@@ -355,6 +341,17 @@ public static int GetMaxSamples()
                 return null;
             }
 
             // Step 3: Calculate partition boundaries
+            // For DumpAndRestore: pick equidistant quantile boundaries from oversampled sorted list
+            if (optimizeForMongoDump && partitionValues.Count > chunkCount)
+            {
+                var quantileBoundaries = new List<BsonValue>();
+                for (int k = 1; k < chunkCount; k++)
+                {
+                    int idx = (int)((long)k * partitionValues.Count / chunkCount);
+                    quantileBoundaries.Add(partitionValues[idx]);
+                }
+                partitionValues = quantileBoundaries;
+            }
             chunkBoundaries = ConvertToBoundaries(partitionValues, segmentCount);
 
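The core idea of the DumpAndRestore path: take many more samples than chunks, sort them, then keep every (count / chunkCount)-th value as a chunk boundary. A self-contained sketch of that selection with illustrative names (PickBoundaries is hypothetical); the starting index is exactly what PATCH 3/4 below corrects from 1 to 0:

```csharp
using System;
using System.Collections.Generic;
using System.Linq;

static class QuantileBoundarySketch
{
    // Pick chunkCount equidistant boundaries from an oversampled, sorted list.
    // With startK = 0 the first sample becomes the first chunk's lower bound;
    // with startK = 1 (the pre-fix behavior) only chunkCount - 1 boundaries
    // are produced, leaving the first chunk without an explicit lower bound.
    static List<T> PickBoundaries<T>(List<T> sortedSamples, int chunkCount, int startK)
    {
        var boundaries = new List<T>();
        for (int k = startK; k < chunkCount; k++)
        {
            int idx = (int)((long)k * sortedSamples.Count / chunkCount);
            boundaries.Add(sortedSamples[idx]);
        }
        return boundaries;
    }

    static void Main()
    {
        var samples = Enumerable.Range(0, 1000).ToList(); // stand-in for sorted _id samples
        Console.WriteLine(string.Join(", ", PickBoundaries(samples, 4, 0))); // 0, 250, 500, 750
        Console.WriteLine(string.Join(", ", PickBoundaries(samples, 4, 1))); // 250, 500, 750
    }
}
```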
From 96c9bce4e34cd3d68e3d6d9f668d7ed402c69bc7 Mon Sep 17 00:00:00 2001
From: Jason Hu
Date: Sat, 21 Mar 2026 23:42:21 +0800
Subject: [PATCH 3/4] Fix partition quantile calculation to include all chunks
 in MongoDB dump optimization

---
 OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs b/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs
index 61c84f0..fad8e45 100644
--- a/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs
+++ b/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs
@@ -345,7 +345,7 @@ public static int GetMaxSamples()
             if (optimizeForMongoDump && partitionValues.Count > chunkCount)
             {
                 var quantileBoundaries = new List<BsonValue>();
-                for (int k = 1; k < chunkCount; k++)
+                for (int k = 0; k < chunkCount; k++)
                 {
                     int idx = (int)((long)k * partitionValues.Count / chunkCount);
                     quantileBoundaries.Add(partitionValues[idx]);

From 370b361d36dd6752392ee36e3708cf844c38f950 Mon Sep 17 00:00:00 2001
From: Jason Hu
Date: Sun, 22 Mar 2026 00:11:48 +0800
Subject: [PATCH 4/4] Optimize sample count calculation in CreatePartitions
 for MongoDB dump scenarios

---
 OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs b/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs
index fad8e45..c8db246 100644
--- a/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs
+++ b/OnlineMongoMigrationProcessor/Partitioner/SamplePartitioner.cs
@@ -150,12 +150,10 @@ public static int GetMaxSamples()
                 // DumpAndRestore: oversample then pick equidistant quantile boundaries
                 chunkCount = Math.Max(1, (int)Math.Ceiling((double)docCountByType / minDocsPerChunk));
                 segmentCount = 1;
+                adjustedMaxSamples = (int)Math.Min((long)Math.Floor(docCountByType * 0.04), 500000);
 
-                // Oversample ~200 per chunk, cap at 4% of doc count and hard cap
-                long estimatedAvgObjSize = docCountByType > 0 ? Math.Max(1, minDocsPerChunk > 0 ? minDocsPerChunk : 1024) : 1024;
-                sampleCount = (int)Math.Min(
-                    (long)chunkCount * 200,
-                    Math.Min((long)Math.Floor(docCountByType * 0.04), adjustedMaxSamples));
+                // Oversample ~200 per chunk, capped by adjustedMaxSamples
+                sampleCount = (int)Math.Min((long)chunkCount * 200, adjustedMaxSamples);
                 sampleCount = Math.Max(sampleCount, chunkCount); // at least chunkCount samples
 
                 MigrationJobContext.AddVerboseLog($"SamplePartitioner DumpAndRestore: collection={collection.CollectionNamespace}, dataType={dataType}, chunkCount={chunkCount}, sampleCount={sampleCount}");
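To see how the PATCH 4/4 sizing rule behaves, a worked example with assumed inputs (the docCountByType and minDocsPerChunk values are illustrative only, not defaults from the repository):

```csharp
using System;

static class SampleSizingSketch
{
    static void Main()
    {
        long docCountByType = 50_000_000; // assumed collection size
        int minDocsPerChunk = 10_000;     // assumed chunk floor

        // One chunk per minDocsPerChunk documents: 50,000,000 / 10,000 = 5,000.
        int chunkCount = Math.Max(1, (int)Math.Ceiling((double)docCountByType / minDocsPerChunk));

        // Cap: min(4% of docs, 500,000) = min(2,000,000, 500,000) = 500,000.
        int adjustedMaxSamples = (int)Math.Min((long)Math.Floor(docCountByType * 0.04), 500_000);

        // Oversample ~200 per chunk: min(5,000 * 200, 500,000) = 500,000,
        // but never fewer than one sample per chunk.
        int sampleCount = (int)Math.Min((long)chunkCount * 200, adjustedMaxSamples);
        sampleCount = Math.Max(sampleCount, chunkCount);

        Console.WriteLine($"chunks={chunkCount}, samples={sampleCount}"); // chunks=5000, samples=500000
    }
}
```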