108 changes: 61 additions & 47 deletions src/commands/checkpoint.rs
@@ -156,12 +156,24 @@ pub fn run(
storage_start.elapsed()
));

// Early exit for human only
// Read checkpoints once and reuse throughout the function.
// This eliminates multiple redundant read_all_checkpoints() calls that were
// previously the #1 cause of memory overflow (each read deserializes the entire
// JSONL file into memory).
let read_checkpoints_start = Instant::now();
let mut checkpoints = working_log.read_all_checkpoints()?;
debug_log(&format!(
"[BENCHMARK] Reading {} checkpoints took {:?}",
checkpoints.len(),
read_checkpoints_start.elapsed()
));

// Early exit for human only — must run BEFORE reset so we can inspect existing data.
if is_pre_commit {
let has_no_ai_edits = working_log
.all_ai_touched_files()
.map(|files| files.is_empty())
.unwrap_or(true);
let has_no_ai_edits = checkpoints.iter().all(|cp| {
cp.entries.is_empty()
|| (cp.kind != CheckpointKind::AiAgent && cp.kind != CheckpointKind::AiTab)
});

// Also check for INITIAL attributions - these are AI attributions from previous
// commits that weren't staged (e.g., after an amend). We must process these.
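The early-exit predicate above treats a checkpoint as contributing AI edits only when it both has entries and was produced by an AI agent or AI tab-completion. A minimal sketch of that behavior, using simplified stand-ins for the crate's `Checkpoint` and `CheckpointKind` types:

```rust
#[derive(PartialEq)]
enum CheckpointKind { Human, AiAgent, AiTab }

struct Checkpoint { kind: CheckpointKind, entries: Vec<String> }

// A checkpoint only counts as an AI edit when it has entries AND an AI kind.
fn has_no_ai_edits(checkpoints: &[Checkpoint]) -> bool {
    checkpoints.iter().all(|cp| {
        cp.entries.is_empty()
            || (cp.kind != CheckpointKind::AiAgent && cp.kind != CheckpointKind::AiTab)
    })
}

fn main() {
    let human = Checkpoint { kind: CheckpointKind::Human, entries: vec!["a.rs".into()] };
    let ai_empty = Checkpoint { kind: CheckpointKind::AiAgent, entries: vec![] };
    assert!(has_no_ai_edits(&[human, ai_empty])); // an empty AI checkpoint doesn't count

    let ai = Checkpoint { kind: CheckpointKind::AiAgent, entries: vec!["b.rs".into()] };
    assert!(!has_no_ai_edits(&[ai]));
}
```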
@@ -178,6 +190,12 @@ pub fn run(
}
}

// Reset working log after the early-exit check (so existing data is inspected first)
if reset {
working_log.reset_working_log()?;
checkpoints.clear();
}

// Set dirty files if available
if let Some(dirty_files) = agent_run_result
.as_ref()
@@ -262,6 +280,7 @@ pub fn run(
pathspec_start.elapsed()
));

// Pass pre-loaded checkpoints to avoid redundant reads inside get_all_tracked_files
let files_start = Instant::now();
let files = get_all_tracked_files(
repo,
@@ -270,27 +289,14 @@
pathspec_filter,
is_pre_commit,
&ignore_matcher,
Some(&checkpoints),
)?;
debug_log(&format!(
"[BENCHMARK] get_all_tracked_files found {} files, took {:?}",
files.len(),
files_start.elapsed()
));

let read_checkpoints_start = Instant::now();
let mut checkpoints = if reset {
// If reset flag is set, start with an empty working log
working_log.reset_working_log()?;
Vec::new()
} else {
working_log.read_all_checkpoints()?
};
debug_log(&format!(
"[BENCHMARK] Reading {} checkpoints took {:?}",
checkpoints.len(),
read_checkpoints_start.elapsed()
));

if show_working_log {
if checkpoints.is_empty() {
eprintln!("No working log entries found.");
Expand Down Expand Up @@ -607,6 +613,7 @@ fn get_all_tracked_files(
edited_filepaths: Option<&Vec<String>>,
is_pre_commit: bool,
ignore_matcher: &IgnoreMatcher,
preloaded_checkpoints: Option<&[Checkpoint]>,
) -> Result<Vec<String>, GitAiError> {
let mut files: HashSet<String> = edited_filepaths
.map(|paths| {
@@ -659,28 +666,38 @@
initial_read_start.elapsed()
));

// Use pre-loaded checkpoints if available, otherwise read from disk.
// This eliminates redundant read_all_checkpoints() calls when the caller
// already has the data loaded (e.g., checkpoint::run reads once and passes it through).
let owned_checkpoints;
let checkpoint_data: &[Checkpoint] = match preloaded_checkpoints {
Some(data) => data,
None => {
owned_checkpoints = working_log.read_all_checkpoints().unwrap_or_default();
&owned_checkpoints
}
};

let checkpoints_read_start = Instant::now();
if let Ok(working_log_data) = working_log.read_all_checkpoints() {
for checkpoint in &working_log_data {
for entry in &checkpoint.entries {
// Normalize path separators to forward slashes
let normalized_path = normalize_to_posix(&entry.file);
// Filter out paths outside the repository to prevent git command failures
if !is_path_in_repo(&normalized_path) {
debug_log(&format!(
"Skipping checkpoint file outside repository: {}",
normalized_path
));
continue;
}
if should_ignore_file_with_matcher(&normalized_path, ignore_matcher) {
continue;
}
if !files.contains(&normalized_path) {
// Check if it's a text file before adding
if is_text_file(working_log, &normalized_path) {
files.insert(normalized_path);
}
for checkpoint in checkpoint_data {
for entry in &checkpoint.entries {
// Normalize path separators to forward slashes
let normalized_path = normalize_to_posix(&entry.file);
// Filter out paths outside the repository to prevent git command failures
if !is_path_in_repo(&normalized_path) {
debug_log(&format!(
"Skipping checkpoint file outside repository: {}",
normalized_path
));
continue;
}
if should_ignore_file_with_matcher(&normalized_path, ignore_matcher) {
continue;
}
if !files.contains(&normalized_path) {
// Check if it's a text file before adding
if is_text_file(working_log, &normalized_path) {
files.insert(normalized_path);
}
}
}
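The `owned_checkpoints` binding is the standard borrow-or-own idiom for this kind of `Option<&[T]>` parameter: declare an uninitialized local, assign it only on the fallback branch, and let both branches produce a `&[T]`. A self-contained sketch under simplified types (`expensive_load` is a hypothetical stand-in, not a crate function):

```rust
fn sum(preloaded: Option<&[i32]>) -> i32 {
    // Declared but not yet initialized; assigned only on the fallback path.
    // The borrow checker proves it is initialized before it is borrowed.
    let owned;
    let data: &[i32] = match preloaded {
        Some(data) => data, // zero-copy fast path
        None => {
            owned = expensive_load(); // ownership lives long enough to borrow
            &owned
        }
    };
    data.iter().sum()
}

fn expensive_load() -> Vec<i32> {
    vec![1, 2, 3]
}

fn main() {
    assert_eq!(sum(Some(&[4, 5])), 9); // no load performed
    assert_eq!(sum(None), 6);          // fallback loads once
}
```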
@@ -690,13 +707,10 @@ fn get_all_tracked_files(
checkpoints_read_start.elapsed()
));

let has_ai_checkpoints = if let Ok(working_log_data) = working_log.read_all_checkpoints() {
working_log_data.iter().any(|checkpoint| {
checkpoint.kind == CheckpointKind::AiAgent || checkpoint.kind == CheckpointKind::AiTab
})
} else {
false
};
// Reuse the same checkpoint data to check for AI checkpoints (no extra read)
let has_ai_checkpoints = checkpoint_data.iter().any(|checkpoint| {
checkpoint.kind == CheckpointKind::AiAgent || checkpoint.kind == CheckpointKind::AiTab
});

let status_files_start = Instant::now();
let mut results_for_tracked_files = if is_pre_commit && !has_ai_checkpoints {
129 changes: 75 additions & 54 deletions src/git/repo_storage.rs
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::{HashMap, HashSet};
use std::fs;
use std::io::{BufRead, BufReader, Write};
use std::path::{Path, PathBuf};

/// Initial attributions data structure stored in the INITIAL file
Expand Down Expand Up @@ -321,8 +322,7 @@ impl PersistedWorkingLog {

/* append checkpoint */
pub fn append_checkpoint(&self, checkpoint: &Checkpoint) -> Result<(), GitAiError> {
// Read existing checkpoints
let mut checkpoints = self.read_all_checkpoints().unwrap_or_default();
let checkpoints_file = self.dir.join("checkpoints.jsonl");

// Create a copy, potentially without transcript to reduce storage size.
// Transcripts are refetched in update_prompts_to_latest() before post-commit
Expand Down Expand Up @@ -366,15 +366,68 @@ impl PersistedWorkingLog {
storage_checkpoint.transcript = None;
}

// Add the new checkpoint
checkpoints.push(storage_checkpoint);
// Collect files from the new checkpoint to know which older attributions to prune
let new_files: HashSet<String> = storage_checkpoint
.entries
.iter()
.map(|e| e.file.clone())
.collect();

// If the checkpoints file doesn't exist yet, or the new checkpoint has no
// file entries, there is nothing to prune and a plain append suffices.
// Otherwise we stream-rewrite with pruning to keep the file small during long agent loops.
let needs_prune = !new_files.is_empty() && checkpoints_file.exists();

if needs_prune {
// Streaming prune-and-rewrite: read existing checkpoints one at a time,
// prune char attributions for files that the new checkpoint supersedes,
// write to a temp file, then append the new checkpoint and rename.
// Peak memory: one checkpoint at a time + the new checkpoint.
let tmp_file = self.dir.join("checkpoints.jsonl.tmp");
{
let existing = fs::File::open(&checkpoints_file)?;
let reader = BufReader::new(existing);
let out = fs::File::create(&tmp_file)?;
let mut writer = std::io::BufWriter::new(out);

for line_result in reader.lines() {
let line = line_result?;
if line.trim().is_empty() {
continue;
}
let mut cp: Checkpoint = serde_json::from_str(&line)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;

// Prune char-level attributions from entries whose file is superseded
// by the new checkpoint. Only the newest checkpoint per file needs
// char-level precision.
for entry in &mut cp.entries {
if new_files.contains(&entry.file) {
entry.attributions.clear();
}
}

let json_line = serde_json::to_string(&cp)?;
writeln!(writer, "{}", json_line)?;
}

// Prune char-level attributions from older checkpoints for the same files
// Only the most recent checkpoint per file needs char-level precision
self.prune_old_char_attributions(&mut checkpoints);
// Append the new checkpoint
let json_line = serde_json::to_string(&storage_checkpoint)?;
writeln!(writer, "{}", json_line)?;
writer.flush()?;
}
fs::rename(&tmp_file, &checkpoints_file)?;
} else {
// No existing file, or the new checkpoint has no file entries: simple append
let json_line = serde_json::to_string(&storage_checkpoint)?;
let mut file = fs::OpenOptions::new()
.create(true)
.append(true)
.open(&checkpoints_file)?;
writeln!(file, "{}", json_line)?;
}

// Write all checkpoints back
self.write_all_checkpoints(&checkpoints)
Ok(())
}
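The rewrite path above follows the write-temp-then-rename shape: stream the old file through a transform into `checkpoints.jsonl.tmp`, append the new record, and swap the files with `fs::rename`, which replaces the target in a single step on the same filesystem, so a crash mid-rewrite never leaves a truncated checkpoints file. A generic sketch of the pattern with a hypothetical line transform (not the crate's pruning logic):

```rust
use std::fs;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;

/// Rewrite a line-oriented file one line at a time, then atomically swap it in.
/// Peak memory stays at one line regardless of file size.
fn rewrite_lines(path: &Path, mut transform: impl FnMut(&str) -> String) -> std::io::Result<()> {
    let tmp = path.with_extension("tmp");
    {
        let reader = BufReader::new(fs::File::open(path)?);
        let mut writer = BufWriter::new(fs::File::create(&tmp)?);
        for line in reader.lines() {
            let line = line?;
            if line.trim().is_empty() {
                continue; // skip blank lines, as the real reader does
            }
            writeln!(writer, "{}", transform(&line))?;
        }
        writer.flush()?;
    } // both handles close here, before the rename
    fs::rename(&tmp, path)
}
```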

pub fn read_all_checkpoints(&self) -> Result<Vec<Checkpoint>, GitAiError> {
@@ -384,16 +437,19 @@ impl PersistedWorkingLog {
return Ok(Vec::new());
}

let content = fs::read_to_string(&checkpoints_file)?;
// Use BufReader to stream line-by-line instead of loading entire file into memory.
// This avoids holding both the full file string AND the parsed structs simultaneously.
let file = fs::File::open(&checkpoints_file)?;
let reader = BufReader::new(file);
let mut checkpoints = Vec::new();

// Parse JSONL file - each line is a separate JSON object
for line in content.lines() {
for line_result in reader.lines() {
let line = line_result?;
if line.trim().is_empty() {
continue;
}

let checkpoint: Checkpoint = serde_json::from_str(line)
let checkpoint: Checkpoint = serde_json::from_str(&line)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;

if checkpoint.api_version != CHECKPOINT_API_VERSION {
Expand Down Expand Up @@ -454,55 +510,20 @@ impl PersistedWorkingLog {
Ok(migrated_checkpoints)
}
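The gain from the `BufReader` loop is in the peak: `read_to_string` holds the entire file as one `String` alongside every parsed `Checkpoint`, while the streaming loop holds one raw line plus the growing `Vec`. A sketch of the same shape with a minimal record type (assuming `serde` and `serde_json` as dependencies; `Record` is a stand-in, not the crate's `Checkpoint`):

```rust
use serde::Deserialize;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

#[derive(Deserialize)]
struct Record {
    file: String,
}

/// Parse a JSONL file line by line; only one raw line is buffered at a time.
fn read_jsonl(path: &Path) -> std::io::Result<Vec<Record>> {
    let reader = BufReader::new(File::open(path)?);
    let mut records = Vec::new();
    for line in reader.lines() {
        let line = line?; // allocates just this line, not the whole file
        if line.trim().is_empty() {
            continue;
        }
        let record: Record = serde_json::from_str(&line)
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
        records.push(record);
    }
    Ok(records)
}
```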

/// Remove char-level attributions from all but the most recent checkpoint per file.
/// This reduces storage size while preserving precision for the entries that matter.
/// Only the most recent checkpoint entry for each file is used when computing new entries.
fn prune_old_char_attributions(&self, checkpoints: &mut [Checkpoint]) {
// Track which checkpoint index has the most recent entry for each file
// Iterate from newest to oldest
let mut newest_for_file: HashMap<String, usize> = HashMap::new();

for (checkpoint_idx, checkpoint) in checkpoints.iter().enumerate().rev() {
for entry in &checkpoint.entries {
newest_for_file
.entry(entry.file.clone())
.or_insert(checkpoint_idx);
}
}

// Clear attributions from entries that aren't the most recent for their file
for (checkpoint_idx, checkpoint) in checkpoints.iter_mut().enumerate() {
for entry in &mut checkpoint.entries {
if let Some(&newest_idx) = newest_for_file.get(&entry.file)
&& checkpoint_idx != newest_idx
{
entry.attributions.clear();
}
}
}
}
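Both the removed in-memory pass and the new streaming prune in append_checkpoint maintain the same invariant: for each file, only the entry in the newest checkpoint touching it keeps char-level attributions. A sketch of that invariant as a checkable property, with simplified stand-in types:

```rust
use std::collections::HashMap;

struct Entry { file: String, attributions: Vec<u32> } // attributions simplified
struct Checkpoint { entries: Vec<Entry> }

/// True when only the newest checkpoint per file retains attributions.
fn prune_invariant_holds(checkpoints: &[Checkpoint]) -> bool {
    // Later checkpoints overwrite earlier ones, so the map ends up
    // holding the newest index for each file.
    let mut newest_for_file: HashMap<&str, usize> = HashMap::new();
    for (idx, cp) in checkpoints.iter().enumerate() {
        for entry in &cp.entries {
            newest_for_file.insert(entry.file.as_str(), idx);
        }
    }
    checkpoints.iter().enumerate().all(|(idx, cp)| {
        cp.entries
            .iter()
            .all(|e| e.attributions.is_empty() || newest_for_file[e.file.as_str()] == idx)
    })
}
```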

/// Write all checkpoints to the JSONL file, replacing any existing content
/// Write all checkpoints to the JSONL file, replacing any existing content.
/// Note: Unlike append_checkpoint(), this preserves transcripts because it's used
/// by post-commit after transcripts have been refetched and need to be preserved
/// for from_just_working_log() to read them.
pub fn write_all_checkpoints(&self, checkpoints: &[Checkpoint]) -> Result<(), GitAiError> {
let checkpoints_file = self.dir.join("checkpoints.jsonl");

// Serialize all checkpoints to JSONL
let mut lines = Vec::new();
for checkpoint in checkpoints {
let file = fs::File::create(&checkpoints_file)?;
let mut writer = std::io::BufWriter::new(file);
for checkpoint in checkpoints.iter() {
let json_line = serde_json::to_string(checkpoint)?;
lines.push(json_line);
}

// Write all lines to file
let content = lines.join("\n");
if !content.is_empty() {
fs::write(&checkpoints_file, format!("{}\n", content))?;
} else {
fs::write(&checkpoints_file, "")?;
writeln!(writer, "{}", json_line)?;
}
writer.flush()?;

Ok(())
}