diff --git a/diskann-benchmark-runner/.clippy.toml b/diskann-benchmark-runner/.clippy.toml new file mode 100644 index 000000000..7bada5473 --- /dev/null +++ b/diskann-benchmark-runner/.clippy.toml @@ -0,0 +1,3 @@ +allow-unwrap-in-tests = true +allow-expect-in-tests = true +allow-panic-in-tests = true diff --git a/diskann-benchmark-runner/Cargo.toml b/diskann-benchmark-runner/Cargo.toml index f4c8b438c..33cb63d0d 100644 --- a/diskann-benchmark-runner/Cargo.toml +++ b/diskann-benchmark-runner/Cargo.toml @@ -16,8 +16,12 @@ serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } thiserror.workspace = true -[lints] -workspace = true +[lints.clippy] +undocumented_unsafe_blocks = "warn" +uninlined_format_args = "allow" +unwrap_used = "warn" +expect_used = "warn" +panic = "warn" [dev-dependencies] tempfile.workspace = true diff --git a/diskann-benchmark-runner/src/app.rs b/diskann-benchmark-runner/src/app.rs index 6dcda408a..77f7e95bb 100644 --- a/diskann-benchmark-runner/src/app.rs +++ b/diskann-benchmark-runner/src/app.rs @@ -3,11 +3,69 @@ * Licensed under the MIT license. */ +//! The CLI frontend for benchmark applications built with this crate. +//! +//! [`App`] provides a [`clap`]-based command line interface that handles input parsing, +//! benchmark dispatch, and regression checking. Consumers build a binary by registering +//! [`Input`](crate::Input)s and [`Benchmark`](crate::Benchmark)s, then forwarding to +//! [`App::parse`] and [`App::run`]. +//! +//! # Subcommands +//! +//! ## Standard Workflow +//! +//! * `inputs [NAME]`: List available input kinds, or describe one by name. +//! * `benchmarks`: List registered benchmarks and their descriptions. +//! * `skeleton`: Print a skeleton input JSON file. +//! * `run --input-file --output-file [--dry-run]`: Run benchmarks. +//! +//! ## Regression Checks +//! +//! These are accessed via `check `: +//! +//! * `check skeleton`: Print a skeleton tolerance JSON file. +//! 
* `check tolerances [NAME]`: List tolerance kinds, or describe one by name. +//! * `check verify --tolerances --input-file `: Validate a tolerance file +//! against an input file. +//! * `check run --tolerances --input-file --before --after [--output-file ]`: +//! Run regression checks. +//! +//! # Example +//! +//! A typical binary using this crate: +//! +//! ```rust,no_run +//! use diskann_benchmark_runner::{App, registry}; +//! +//! fn main() -> anyhow::Result<()> { +//! let mut inputs = registry::Inputs::new(); +//! // inputs.register::()?; +//! +//! let mut benchmarks = registry::Benchmarks::new(); +//! // benchmarks.register::("my-bench"); +//! // benchmarks.register_regression::("my-regression"); +//! +//! let app = App::parse(); +//! let mut output = diskann_benchmark_runner::output::default(); +//! app.run(&inputs, &benchmarks, &mut output) +//! } +//! ``` +//! +//! # Regression Workflow +//! +//! 1. Run benchmarks twice (e.g. before and after a code change) with `run`, producing +//! two output files. +//! 2. Author a tolerance file describing acceptable variation (use `check skeleton` and +//! `check tolerances` for guidance). +//! 3. Validate the tolerance file with `check verify`. +//! 4. Compare the two output files with `check run`. + use std::{io::Write, path::PathBuf}; use clap::{Parser, Subcommand}; use crate::{ + internal, jobs::{self, Jobs}, output::Output, registry, @@ -40,6 +98,47 @@ pub enum Commands { #[arg(long, action)] dry_run: bool, }, + #[command(subcommand)] + Check(Check), +} + +/// Subcommands for regression check operations. +#[derive(Debug, Subcommand)] +pub enum Check { + /// Provide a skeleton of the overall tolerance files. + Skeleton, + /// List all the tolerance inputs accepted by the benchmark executable. + Tolerances { + /// Describe the layout for the named tolerance kind. + describe: Option, + }, + /// Verify the tolerance file with the accompanying input file. + Verify { + /// The tolerance file to check. 
+ #[arg(long = "tolerances")] + tolerances: PathBuf, + /// The benchmark input file used to generate the data that will be compared. + #[arg(long = "input-file")] + input_file: PathBuf, + }, + /// Run regression checks against before/after output files. + Run { + /// The tolerance file to check. + #[arg(long = "tolerances")] + tolerances: PathBuf, + /// The benchmark input file used to generate the data that will be compared. + #[arg(long = "input-file")] + input_file: PathBuf, + /// The `--output-file` from a benchmark to use as a baseline. + #[arg(long = "before")] + before: PathBuf, + /// The `--output-file` that will be checked for regression against `before`. + #[arg(long = "after")] + after: PathBuf, + /// Optional path to write the JSON check results. + #[arg(long = "output-file")] + output_file: Option, + }, } /// The CLI used to drive a benchmark application. @@ -50,7 +149,7 @@ pub struct App { } impl App { - /// Construct [`Self`] by parsing commandline arguments from [`std::env::args]`. + /// Construct [`Self`] by parsing commandline arguments from [`std::env::args`]. /// /// This simply redirects to [`clap::Parser::parse`] and is provided to allow parsing /// without the [`clap::Parser`] trait in scope. @@ -75,7 +174,7 @@ impl App { Self { command } } - /// Run the application using the registered `inputs` and `outputs`. + /// Run the application using the registered `inputs` and `benchmarks`. 
pub fn run( &self, inputs: ®istry::Inputs, @@ -210,9 +309,112 @@ impl App { Checkpoint::new(&serialized, &results, output_file)?.save()?; } } + // Extensions + Commands::Check(check) => return self.check(check, inputs, benchmarks, output), }; Ok(()) } + + // Extensions + fn check( + &self, + check: &Check, + inputs: ®istry::Inputs, + benchmarks: ®istry::Benchmarks, + mut output: &mut dyn Output, + ) -> anyhow::Result<()> { + match check { + Check::Skeleton => { + let message = "Skeleton tolerance file.\n\n\ + Each tolerance is paired with an input that is structurally\n\ + matched with an entry in the corresponding `--input-file`.\n\n\ + This allow a single tolerance entry to be applied to multiple\n\ + benchmark runs as long as this structural mapping is unambiguous.\n"; + + writeln!(output, "{}", message)?; + writeln!(output, "{}", internal::regression::Raw::example())?; + Ok(()) + } + Check::Tolerances { describe } => { + let tolerances = benchmarks.tolerances(); + + match describe { + Some(name) => match tolerances.get(&**name) { + Some(registered) => { + let repr = internal::regression::RawInner::new( + jobs::Unprocessed::new( + "".to_string(), + serde_json::Value::Object(Default::default()), + ), + jobs::Unprocessed::format_input(registered.tolerance)?, + ); + + write!( + output, + "The example JSON representation for \"{}\" is shown below.\n\ + Populate the \"input\" field with a compatible benchmark input.\n\ + Matching will be performed by partial structural map on the input.\n\n", + name + )?; + writeln!(output, "{}", serde_json::to_string_pretty(&repr)?)?; + Ok(()) + } + None => { + writeln!(output, "No tolerance input found for \"{}\"", name)?; + Ok(()) + } + }, + None => { + writeln!(output, "Available tolerance kinds are listed below.")?; + + // Print the registered tolerance files in alphabetical order. 
+ let mut keys: Vec<_> = tolerances.keys().collect(); + keys.sort(); + for k in keys { + // This access should not panic - we just obtained all the keys. + let registered = &tolerances[k]; + writeln!(output, " {}", registered.tolerance.tag())?; + for pair in registered.regressions.iter() { + writeln!( + output, + " - \"{}\" => \"{}\"", + pair.input_tag(), + pair.name(), + )?; + } + } + Ok(()) + } + } + } + Check::Verify { + tolerances, + input_file, + } => { + // For verification - we merely check that we can successfully construct + // the regression `Checks` struct. It performs all the necessary preflight + // checks. + let benchmarks = benchmarks.tolerances(); + let _ = + internal::regression::Checks::new(tolerances, input_file, inputs, &benchmarks)?; + Ok(()) + } + Check::Run { + tolerances, + input_file, + before, + after, + output_file, + } => { + let registered = benchmarks.tolerances(); + let checks = + internal::regression::Checks::new(tolerances, input_file, inputs, ®istered)?; + let jobs = checks.jobs(before, after)?; + jobs.run(output, output_file.as_deref())?; + Ok(()) + } + } + } } /////////// @@ -228,8 +430,11 @@ impl App { /// /// Within the `stdin.txt` command line, there are several special symbols: /// -/// * $INPUT - Resolves to `input.json` in the same directory as the `stdin.txt` file. -/// * $OUTPUT - Resolves to `output.json` in a temporary directory. +/// * $INPUT_FILE - Resolves to `input.json` in the same directory as the `stdin.txt` file. +/// * $OUTPUT_FILE - Resolves to `output.json` in a temporary directory. +/// * $TOLERANCES_FILE - Resolves to `tolerances.json` in the test directory. +/// * $REGRESSION_INPUT_FILE - Resolves to `regression_input.json` test directory. +/// * $CHECK_OUTPUT_FILE - Resolves to `checks.json` in a temporary directory. /// /// As mentioned - an input JSON file can be included and must be named "input.json" to be /// discoverable. 
@@ -237,7 +442,7 @@ impl App { /// ## Output Files /// /// Tests should have at least a `stdout.txt` file with the expected outputs for running the -/// command in `stdin.txt`. If an output JSON file is expected, it should be name `output.json`. +/// command in `stdin.txt`. If an output JSON file is expected, it should be named `output.json`. /// /// ## Test Discovery and Running /// @@ -277,6 +482,13 @@ mod tests { const INPUT_FILE: &str = "input.json"; const OUTPUT_FILE: &str = "output.json"; + // Regression Extension + const TOLERANCES_FILE: &str = "tolerances.json"; + const REGRESSION_INPUT_FILE: &str = "regression_input.json"; + const CHECK_OUTPUT_FILE: &str = "checks.json"; + + const ALL_GENERATED_OUTPUTS: [&str; 2] = [OUTPUT_FILE, CHECK_OUTPUT_FILE]; + // Read the entire contents of a file to a string. fn read_to_string>(path: P, ctx: &str) -> String { match std::fs::read_to_string(path.as_ref()) { @@ -327,33 +539,57 @@ mod tests { } } - fn parse_stdin(&self, tempdir: &Path) -> App { + fn parse_stdin(&self, tempdir: &Path) -> Vec { let path = self.dir.join(STDIN); // Read the standard input file to a string. 
let stdin = read_to_string(&path, "standard input"); - let args: Vec = stdin + let output: Vec = stdin + .lines() + .filter_map(|line| { + if line.starts_with('#') || line.is_empty() { + None + } else { + Some(self.parse_line(line, tempdir)) + } + }) + .collect(); + + if output.is_empty() { + panic!("File \"{}/stdin.txt\" has no command!", self.dir.display()); + } + + output + } + + fn parse_line(&self, line: &str, tempdir: &Path) -> App { + // Split and resolve special symbols + let args: Vec = line .split_whitespace() .map(|v| -> OsString { self.resolve(v, tempdir).into() }) .collect(); - // Split and resolve special symbols App::try_parse_from(std::iter::once(OsString::from("test-app")).chain(args)).unwrap() } fn resolve(&self, s: &str, tempdir: &Path) -> PathBuf { - if s == "$INPUT" { - self.dir.join(INPUT_FILE) - } else if s == "$OUTPUT" { - tempdir.join(OUTPUT_FILE) - } else { - s.into() + match s { + // Standard workflow + "$INPUT" => self.dir.join(INPUT_FILE), + "$OUTPUT" => tempdir.join(OUTPUT_FILE), + // Regression extension + "$TOLERANCES" => self.dir.join(TOLERANCES_FILE), + "$REGRESSION_INPUT" => self.dir.join(REGRESSION_INPUT_FILE), + "$CHECK_OUTPUT" => tempdir.join(CHECK_OUTPUT_FILE), + + // Catch-all: no interpolation + _ => s.into(), } } fn run(&self, tempdir: &Path) { - let app = self.parse_stdin(tempdir); + let apps = self.parse_stdin(tempdir); // Register inputs let mut inputs = registry::Inputs::new(); @@ -363,19 +599,42 @@ mod tests { let mut benchmarks = registry::Benchmarks::new(); crate::test::register_benchmarks(&mut benchmarks); - // Run app - collecting output into a buffer. + // Run each app invocation - collecting the last output into a buffer. // - // If the app returns an error - format the error to the output buffer as well - // using the debug formatting option. + // Only the last run is allowed to return an error - if it does, format the + // error to the output buffer as well using the debug formatting option. 
let mut buffer = crate::output::Memory::new(); - if let Err(err) = app.run(&inputs, &benchmarks, &mut buffer) { - let mut b: &mut dyn crate::Output = &mut buffer; - write!(b, "{:?}", err).unwrap(); + for (i, app) in apps.iter().enumerate() { + let is_last = i + 1 == apps.len(); + + // Select where to route the test output. + // + // Only the last run gets saved. Setup output is discarded — if a setup + // command fails, the panic message includes the error. + let mut b: &mut dyn crate::Output = if is_last { + &mut buffer + } else { + &mut crate::output::Sink::new() + }; + + if let Err(err) = app.run(&inputs, &benchmarks, b) { + if is_last { + write!(b, "{:?}", err).unwrap(); + } else { + panic!( + "App {} of {} failed with error: {:?}", + i + 1, + apps.len(), + err + ); + } + } } // Check that `stdout` matches let stdout: String = ux::normalize(ux::strip_backtrace(buffer.into_inner().try_into().unwrap())); + let stdout = ux::scrub_path(stdout, tempdir, "$TEMPDIR"); let output = self.dir.join(STDOUT); if self.overwrite { std::fs::write(output, stdout).unwrap(); @@ -387,60 +646,62 @@ mod tests { } // Check that the output files match. - let output_path = tempdir.join(OUTPUT_FILE); - let was_output_generated = output_path.is_file(); + for file in ALL_GENERATED_OUTPUTS { + self.check_output_file(tempdir, file); + } + } - let expected_output_path = self.dir.join(OUTPUT_FILE); - let is_output_expected = expected_output_path.is_file(); + fn check_output_file(&self, tempdir: &Path, filename: &str) { + let generated_path = tempdir.join(filename); + let was_generated = generated_path.is_file(); + + let expected_path = self.dir.join(filename); + let is_expected = expected_path.is_file(); if self.overwrite { // Copy the output file to the destination. 
- if was_output_generated { + if was_generated { println!( - "Moving generated output file {:?} to {:?}", - output_path, expected_output_path + "Moving generated file {:?} to {:?}", + generated_path, expected_path ); - if let Err(err) = std::fs::rename(&output_path, &expected_output_path) { + if let Err(err) = std::fs::rename(&generated_path, &expected_path) { panic!( - "Moving generated output file {:?} to expected location {:?} failed: {}", - output_path, expected_output_path, err + "Moving generated file {:?} to expected location {:?} failed: {}", + generated_path, expected_path, err ); } - } else if is_output_expected { - println!("Removing outdated output file {:?}", expected_output_path); - if let Err(err) = std::fs::remove_file(&expected_output_path) { - panic!( - "Failed removing outdated output file {:?}: {}", - expected_output_path, err - ); + } else if is_expected { + println!("Removing outdated file {:?}", expected_path); + if let Err(err) = std::fs::remove_file(&expected_path) { + panic!("Failed removing outdated file {:?}: {}", expected_path, err); } } } else { - match (was_output_generated, is_output_expected) { + match (was_generated, is_expected) { (true, true) => { - let output_contents = read_to_string(output_path, "generated output JSON"); + let output_contents = read_to_string(generated_path, "generated"); - let expected_contents = - read_to_string(expected_output_path, "expected output JSON"); + let expected_contents = read_to_string(expected_path, "expected"); if output_contents != expected_contents { panic!( - "Got:\n\n{}\n\nExpected:\n\n{}\n", - output_contents, expected_contents + "{}: Got:\n\n{}\n\nExpected:\n\n{}\n", + filename, output_contents, expected_contents ); } } (true, false) => { - let output_contents = read_to_string(output_path, "generated output JSON"); + let output_contents = read_to_string(generated_path, "generated"); panic!( - "An output JSON was generated when none was expected. 
Contents:\n\n{}", - output_contents + "{} was generated when none was expected. Contents:\n\n{}", + filename, output_contents ); } (false, true) => { - panic!("No output JSON was generated when one was expected"); + panic!("{} was not generated when it was expected", filename); } (false, false) => { /* this is okay */ } } @@ -469,7 +730,12 @@ mod tests { } #[test] - fn top_level_tests() { - run_all_tests_in(""); + fn benchmark_tests() { + run_all_tests_in("benchmark"); + } + + #[test] + fn regression_tests() { + run_all_tests_in("regression"); } } diff --git a/diskann-benchmark-runner/src/benchmark.rs b/diskann-benchmark-runner/src/benchmark.rs index 30eb28de3..6c196d8af 100644 --- a/diskann-benchmark-runner/src/benchmark.rs +++ b/diskann-benchmark-runner/src/benchmark.rs @@ -3,7 +3,7 @@ * Licensed under the MIT license. */ -use serde::Serialize; +use serde::{Deserialize, Serialize}; use crate::{ dispatcher::{FailureScore, MatchScore}, @@ -58,83 +58,330 @@ pub trait Benchmark { ) -> anyhow::Result; } +/// A refinement of [`Benchmark`], that supports before/after comparison of generated results. +/// +/// Benchmarks are associated with a "tolerance" input, which may contain runtime values +/// controlling the amount of slack a benchmark is allowed to have between runs before failing. +/// +/// The semantics of pass or failure are left solely to the discretion of the [`Regression`] +/// implementation. +/// +/// See: [`register_regression`](crate::registry::Benchmarks::register_regression). +pub trait Regression: Benchmark Deserialize<'a>> { + /// The tolerance [`Input`] associated with this regression check. + type Tolerances: Input + 'static; + + /// The report summary used to describe a successful regression check. + type Pass: Serialize + std::fmt::Display + 'static; + + /// The report summary used to describe an unsuccessful regression check. 
+ type Fail: Serialize + std::fmt::Display + 'static; + + /// Run any regression checks necessary for two benchmark runs `before` and `after`. + /// Argument `tolerances` contain any tuned runtime tolerances to use when determining + /// whether or not a regression is detected. + /// + /// The `input` is the raw input that would have been provided to [`Benchmark::run`] + /// when generating the `before` and `after` outputs. + /// + /// Implementations of `check` should not attempt to print to `stdout` or any other + /// stream. Instead, all diagnostics should be encoded in the returned [`PassFail`] type + /// for reporting upstream. + fn check( + tolerances: &Self::Tolerances, + input: &Self::Input, + before: &Self::Output, + after: &Self::Output, + ) -> anyhow::Result>; +} + +/// Describe whether or not a [`Regression`] passed or failed. +#[derive(Debug, Clone, Copy)] +pub enum PassFail { + Pass(P), + Fail(F), +} + ////////////// // Internal // ////////////// -/// Object-safe trait for type-erased benchmarks stored in the registry. -pub(crate) trait DynBenchmark { - fn try_match(&self, input: &Any) -> Result; +pub(crate) mod internal { + use super::*; - fn description(&self, f: &mut std::fmt::Formatter<'_>, input: Option<&Any>) - -> std::fmt::Result; + use std::marker::PhantomData; - fn run( - &self, - input: &Any, - checkpoint: Checkpoint<'_>, - output: &mut dyn Output, - ) -> anyhow::Result; -} + use anyhow::Context; + use thiserror::Error; -#[derive(Debug, Clone, Copy)] -pub(crate) struct Wrapper(std::marker::PhantomData); + /// Object-safe trait for type-erased benchmarks stored in the registry. 
+ pub(crate) trait Benchmark { + fn try_match(&self, input: &Any) -> Result; + + fn description( + &self, + f: &mut std::fmt::Formatter<'_>, + input: Option<&Any>, + ) -> std::fmt::Result; + + fn run( + &self, + input: &Any, + checkpoint: Checkpoint<'_>, + output: &mut dyn Output, + ) -> anyhow::Result; -impl Wrapper { - pub(crate) fn new() -> Self { - Self(std::marker::PhantomData) + /// If supported, return an object capable of running regression checks on this benchmark. + fn as_regression(&self) -> Option<&dyn Regression>; } -} -/// The score given to unsuccessful downcasts in [`DynBenchmark::try_match`]. -const MATCH_FAIL: FailureScore = FailureScore(10_000); + pub(crate) struct Checked { + pub(crate) json: serde_json::Value, + pub(crate) display: Box, + } -impl DynBenchmark for Wrapper -where - T: Benchmark, -{ - fn try_match(&self, input: &Any) -> Result { - if let Some(cast) = input.downcast_ref::() { - T::try_match(cast) - } else { - Err(MATCH_FAIL) + impl Checked { + /// Serialize `value` to `serde_json::Value` and box it for future display. 
+ fn new(value: T) -> Result + where + T: Serialize + std::fmt::Display + 'static, + { + Ok(Self { + json: serde_json::to_value(&value)?, + display: Box::new(value), + }) } } - fn description( - &self, - f: &mut std::fmt::Formatter<'_>, - input: Option<&Any>, - ) -> std::fmt::Result { - match input { - Some(input) => match input.downcast_ref::() { - Some(cast) => T::description(f, Some(cast)), - None => write!( - f, - "expected tag \"{}\" - instead got \"{}\"", - T::Input::tag(), - input.tag(), - ), - }, - None => { - writeln!(f, "tag \"{}\"", ::tag())?; - T::description(f, None) + pub(crate) type CheckedPassFail = PassFail; + + pub(crate) trait Regression { + fn tolerance(&self) -> &dyn crate::input::DynInput; + fn input_tag(&self) -> &'static str; + fn check( + &self, + tolerances: &Any, + input: &Any, + before: &serde_json::Value, + after: &serde_json::Value, + ) -> anyhow::Result; + } + + impl std::fmt::Debug for dyn Regression + '_ { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("dyn Regression") + .field("tolerance", &self.tolerance().tag()) + .field("input_tag", &self.input_tag()) + .finish() + } + } + + pub(crate) trait AsRegression { + fn as_regression(&self) -> Option<&dyn Regression>; + } + + #[derive(Debug, Clone)] + pub(crate) struct NoRegression; + + impl AsRegression for NoRegression { + fn as_regression(&self) -> Option<&dyn Regression> { + None + } + } + + #[derive(Debug, Clone, Copy)] + pub(crate) struct WithRegression(PhantomData); + + impl WithRegression { + pub(crate) const fn new() -> Self { + Self(PhantomData) + } + } + + impl AsRegression for WithRegression + where + T: super::Regression, + { + fn as_regression(&self) -> Option<&dyn Regression> { + Some(self) + } + } + + impl Regression for WithRegression + where + T: super::Regression, + { + fn tolerance(&self) -> &dyn crate::input::DynInput { + &crate::input::Wrapper::::INSTANCE + } + + fn input_tag(&self) -> &'static str { + T::Input::tag() + } + 
+ fn check( + &self, + tolerance: &Any, + input: &Any, + before: &serde_json::Value, + after: &serde_json::Value, + ) -> anyhow::Result { + let tolerance = tolerance + .downcast_ref::() + .ok_or_else(|| BadDownCast::new(T::Tolerances::tag(), tolerance.tag())) + .context("failed to obtain tolerance")?; + + let input = input + .downcast_ref::() + .ok_or_else(|| BadDownCast::new(T::Input::tag(), input.tag())) + .context("failed to obtain input")?; + + let before = T::Output::deserialize(before) + .map_err(|err| DeserializationError::new(Kind::Before, err))?; + + let after = T::Output::deserialize(after) + .map_err(|err| DeserializationError::new(Kind::After, err))?; + + let passfail = match T::check(tolerance, input, &before, &after)? { + PassFail::Pass(pass) => PassFail::Pass(Checked::new(pass)?), + PassFail::Fail(fail) => PassFail::Fail(Checked::new(fail)?), + }; + + Ok(passfail) + } + } + + #[derive(Debug, Clone, Copy)] + pub(crate) struct Wrapper { + regression: R, + _type: PhantomData, + } + + impl Wrapper { + pub(crate) const fn new() -> Self { + Self::new_with(NoRegression) + } + } + + impl Wrapper { + pub(crate) const fn new_with(regression: R) -> Self { + Self { + regression, + _type: PhantomData, } } } - fn run( - &self, - input: &Any, - checkpoint: Checkpoint<'_>, - output: &mut dyn Output, - ) -> anyhow::Result { - match input.downcast_ref::() { - Some(input) => { - let result = T::run(input, checkpoint, output)?; - Ok(serde_json::to_value(result)?) + /// The score given to unsuccessful downcasts in [`Benchmark::try_match`]. 
+ const MATCH_FAIL: FailureScore = FailureScore(10_000); + + impl Benchmark for Wrapper + where + T: super::Benchmark, + R: AsRegression, + { + fn try_match(&self, input: &Any) -> Result { + if let Some(cast) = input.downcast_ref::() { + T::try_match(cast) + } else { + Err(MATCH_FAIL) + } + } + + fn description( + &self, + f: &mut std::fmt::Formatter<'_>, + input: Option<&Any>, + ) -> std::fmt::Result { + match input { + Some(input) => match input.downcast_ref::() { + Some(cast) => T::description(f, Some(cast)), + None => write!( + f, + "expected tag \"{}\" - instead got \"{}\"", + T::Input::tag(), + input.tag(), + ), + }, + None => { + writeln!(f, "tag \"{}\"", ::tag())?; + T::description(f, None) + } } - None => Err(anyhow::anyhow!("INTERNAL ERROR: invalid downcast!")), + } + + fn run( + &self, + input: &Any, + checkpoint: Checkpoint<'_>, + output: &mut dyn Output, + ) -> anyhow::Result { + match input.downcast_ref::() { + Some(input) => { + let result = T::run(input, checkpoint, output)?; + Ok(serde_json::to_value(result)?) 
+ } + None => Err(BadDownCast::new(T::Input::tag(), input.tag()).into()), + } + } + + // Extensions + fn as_regression(&self) -> Option<&dyn Regression> { + self.regression.as_regression() + } + } + + //--------// + // Errors // + //--------// + + #[derive(Debug, Clone, Copy, Error)] + #[error( + "INTERNAL ERROR: bad downcast - expected \"{}\" but got \"{}\"", + self.expected, + self.got + )] + struct BadDownCast { + expected: &'static str, + got: &'static str, + } + + impl BadDownCast { + fn new(expected: &'static str, got: &'static str) -> Self { + Self { expected, got } + } + } + + #[derive(Debug, Error)] + #[error( + "the \"{}\" results do not match the output schema expected by this benchmark", + self.kind + )] + struct DeserializationError { + kind: Kind, + source: serde_json::Error, + } + + impl DeserializationError { + fn new(kind: Kind, source: serde_json::Error) -> Self { + Self { kind, source } + } + } + + #[derive(Debug, Clone, Copy)] + enum Kind { + Before, + After, + } + + impl std::fmt::Display for Kind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let as_str = match self { + Self::Before => "before", + Self::After => "after", + }; + + write!(f, "{}", as_str) } } } diff --git a/diskann-benchmark-runner/src/checker.rs b/diskann-benchmark-runner/src/checker.rs index c2dcaeb65..4b3dda556 100644 --- a/diskann-benchmark-runner/src/checker.rs +++ b/diskann-benchmark-runner/src/checker.rs @@ -54,6 +54,10 @@ impl Checker { T: serde::Serialize + CheckDeserialization + std::fmt::Debug + 'static, { value.check_deserialization(self)?; + #[expect( + clippy::expect_used, + reason = "crate infrastructure ensures an untagged Checker is not leaked" + )] Ok(Any::new(value, self.tag.expect("tag must be set"))) } diff --git a/diskann-benchmark-runner/src/input.rs b/diskann-benchmark-runner/src/input.rs index 5f4a0dc80..4f8b1523e 100644 --- a/diskann-benchmark-runner/src/input.rs +++ b/diskann-benchmark-runner/src/input.rs @@ -87,7 +87,9 @@ 
impl std::fmt::Debug for Registered<'_> { pub(crate) struct Wrapper(std::marker::PhantomData); impl Wrapper { - pub(crate) fn new() -> Self { + pub(crate) const INSTANCE: Self = Self::new(); + + pub(crate) const fn new() -> Self { Self(std::marker::PhantomData) } } diff --git a/diskann-benchmark-runner/src/internal/mod.rs b/diskann-benchmark-runner/src/internal/mod.rs new file mode 100644 index 000000000..5cc494114 --- /dev/null +++ b/diskann-benchmark-runner/src/internal/mod.rs @@ -0,0 +1,20 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +use anyhow::Context; + +pub(crate) mod regression; + +/// Attempt to load and deserialize from a JSON file on disk identified with `path`. +pub(crate) fn load_from_disk(path: &std::path::Path) -> anyhow::Result +where + T: for<'a> serde::Deserialize<'a>, +{ + let file = std::fs::File::open(path) + .with_context(|| format!("while trying to open {}", path.display()))?; + + let reader = std::io::BufReader::new(file); + Ok(serde_json::from_reader(reader)?) +} diff --git a/diskann-benchmark-runner/src/internal/regression.rs b/diskann-benchmark-runner/src/internal/regression.rs new file mode 100644 index 000000000..f9bc12061 --- /dev/null +++ b/diskann-benchmark-runner/src/internal/regression.rs @@ -0,0 +1,860 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! This module contains the tolerance parsing, matching, and running logic. +//! +//! ## Loading/Parsing/Matching +//! +//! There are a whole host of things that can go wrong during this process, and we're +//! obligated to provide at least somewhat reasonable error messages at each stage. +//! +//! The logic here follows process of continually refining the state of parsed inputs, +//! tolerances, and benchmarks. This is outlined as follows: +//! +//! 1. Deserialize a raw tolerances JSON file into [`Raw`] via [`Raw::load`]. This parses +//! the tolerance skeleton into a sequence of [`RawInner`]. 
+//!
+//!    At this stage, we've:
+//!    * Verified the structure of the tolerances file.
+//!
+//! 2. Parse each [`RawInner`] into a [`ParsedInner`] via [`Raw::parse`].
+//!    This attempts to match each [`RawInner`] with a [`registry::RegisteredTolerance`]
+//!    and uses said regression to attempt to parse the raw value into a concrete type.
+//!
+//!    After this stage, each [`ParsedInner`] has the following invariants:
+//!    * The tolerance input has been properly deserialized into a concrete struct.
+//!    * It has been matched with a [`registry::RegisteredTolerance`], which contains the
+//!      collection of inputs and benchmarks that are compatible with the parsed tolerance.
+//!    * Verified that the `input` associated with the tolerance has a proper association
+//!      in the registry.
+//!
+//! 3. Convert each [`ParsedInner`] into a [`Check`]. This works by matching the raw input
+//!    associated with each [`ParsedInner`] to an actual registered input, and then finding
+//!    the registered benchmark that is the best match for the input.
+//!
+//!    For ergonomics, we allow an "input/tolerance" pair to match multiple positional
+//!    "inputs" in the input JSON. A "tolerance input" matches with an "actual input" if its
+//!    raw JSON passes [`is_subset`] of the actual input's raw JSON. At this step, we need
+//!    to work on raw JSON because a parsed input will have deserialization checks run and
+//!    can thus look different.
+//!
+//!    However, matching only succeeds if the above process is complete and unambiguous:
+//!    1. Each "input/tolerance" pair gets matched with at least one "actual input".
+//!    2. All "actual inputs" have exactly one "input/tolerance" pair that matches them.
+//!
+//!    At this step, we have the invariants:
+//!    * The tolerance is parsed to a concrete type.
+//!    * Its associated input has been verified to be consistent with the registry and has
+//!      been unambiguously selected from the "actual inputs".
+//!
* The selected "actual input" has then been successfully matched with a valid +//! regression benchmark using the normal matching flow. +//! +//! 4. Finally, [`Checks`] gets converted into [`Jobs`]. During this process, we also verify +//! the structure of the before/after JSON files and ensure that the number of results mostly +//! lines up. At this stage, each [`Job`] has the invariants associated with a [`Check`] +//! with the addition: +//! +//! * We've been paired with raw before/after JSON that we expect to have the dynamic type +//! of the output of the associated [`registry::RegisteredBenchmark`]. +//! +//! This gets verified during the actual check runs. +//! +//! The entry points here are: +//! +//! * [`Checks::new`]: Do everything up to step 3. This enables preflight validation checks. +//! * [`Checks::jobs`]: Perform step 4. This prepares us to run all the checks. +//! +//! ## Running Checks +//! +//! Running checks simply involves running each [`Job`] and aggregating the results. +//! Each executed job can end up in one of three states: +//! +//! * Success (yay). +//! * Graceful Failure: Everything looked right in terms of deserialization, but the actual +//! check failed. +//! * Error: Something went wrong. This could either be because the output JSON could not be +//! deserialized properly, or for another critical reason. +//! +//! To provide better diagnostics, we wait until all checks have run before beginning a report. +//! The report is triaged in reverse order: +//! +//! * If any check fails with an error, we report all such errors and propagate an error to the top. +//! * If any check gracefully fails, report all such failures and propagate an error to the top. +//! * Otherwise, report the diagnostics from all successes and propagate `Ok(())`. +//! +//! The entry point here is: +//! +//! * [`Jobs::run`]: Run each job, prepare the report, and return a meaningful `Result`. +//! +//! ## Testing +//! +//! 
Testing is largely facilitated by the crate level UX framework. + +use std::{collections::HashMap, io::Write, path::Path, rc::Rc}; + +use anyhow::Context; +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use crate::{ + benchmark::{internal::CheckedPassFail, PassFail}, + internal::load_from_disk, + jobs, registry, result, Any, Checker, +}; + +//////////// +// Checks // +//////////// + +/// See module level documentation for invariants. +/// +/// A `tolerance` can be mapped to multiple inputs and is thus shared behind an [`Rc`]. +struct Check<'a> { + regression: registry::RegressionBenchmark<'a>, + tolerance: Rc, + input: Any, +} + +/// See module level documentation for invariants. +pub(crate) struct Checks<'a> { + checks: Vec>, +} + +impl<'a> Checks<'a> { + pub(crate) fn new( + tolerances: &Path, + input_file: &Path, + inputs: ®istry::Inputs, + entries: &'a HashMap<&'static str, registry::RegisteredTolerance<'a>>, + ) -> anyhow::Result { + // Load the raw input file. + let partial = jobs::Partial::load(input_file)?; + + // Parse and validate the raw jobs against the registered inputs. + // + // This preserves the ordering of the jobs. + let inputs = jobs::Jobs::parse(&partial, inputs)?; + + // Now that the inputs have been fully parsed and validated, we then check that we + // can load the raw tolerance file. 
+ let parsed = Raw::load(tolerances)?.parse(entries)?; + Self::match_all(parsed, partial, inputs) + } + + pub(crate) fn jobs(self, before: &Path, after: &Path) -> anyhow::Result> { + let (before_path, after_path) = (before, after); + + let before = result::RawResult::load(before_path)?; + let after = result::RawResult::load(after_path)?; + + let expected = self.checks.len(); + anyhow::ensure!( + before.len() == expected, + "\"before\" file \"{}\" has {} entries but expected {}", + before_path.display(), + before.len(), + expected, + ); + + anyhow::ensure!( + after.len() == expected, + "\"after\" file \"{}\" has {} entries but expected {}", + after_path.display(), + after.len(), + expected, + ); + + // At this point, `before` and `after` have been deserialized (though not parsed) + // and we know that the lengths of everything are consistent. We can finally + // formulate the final list of jobs. + let jobs = std::iter::zip(self.checks, std::iter::zip(before, after)) + .map(|(check, (before, after))| { + let Check { + regression, + tolerance, + input, + } = check; + Job { + regression, + tolerance, + input, + before, + after, + } + }) + .collect(); + + Ok(Jobs { jobs }) + } + + fn match_all( + parsed: Parsed<'a>, + partial: jobs::Partial, + inputs: jobs::Jobs, + ) -> anyhow::Result { + debug_assert_eq!( + partial.jobs().len(), + inputs.jobs().len(), + "expected \"inputs\" to be the parsed representation of \"partial\"" + ); + + // Map each `ParsedInner` entry to all `partial` inputs they map to. + // + // Each `ParsedInner` unfortunately needs to get compared with every `partial` so we can + // detect overlapping matches and reject them. 
+ let mut parsed_to_input: Vec> = vec![Vec::default(); parsed.inner.len()]; + let mut input_to_parsed: Vec> = vec![Vec::default(); inputs.jobs().len()]; + + parsed.inner.iter().enumerate().for_each(|(i, t)| { + partial.jobs().iter().enumerate().for_each(|(j, raw)| { + if raw.tag == t.input.tag && is_subset(&raw.content, &t.input.content) { + parsed_to_input[i].push(j); + input_to_parsed[j].push(i); + } + }) + }); + + // Validate the whole matching process. + let input_to_parsed = check_matches(parsed_to_input, input_to_parsed)?; + + // At this point: + // + // - `parsed` is known to contain parsed tolerances. + // - `inputs` is known to contain parsed benchmark inputs. + // - We've verified that all the parsed tolerances unambiguously match with a + // tolerance input. + // + // We can now package everything together! + debug_assert_eq!(input_to_parsed.len(), inputs.jobs().len()); + + let checks = std::iter::zip(inputs.into_inner(), input_to_parsed.into_iter()) + .map(|(input, index)| { + // This index should always be inbounds. + let inner = &parsed.inner[index]; + assert_eq!(inner.input.tag, input.tag()); + + // Within the parsed tolerance, we should be able to find the best-matching + // regression benchmark for this concrete input. This benchmark should exist, + // but it's possible that code changes between when the results were generated + // and now has led to the input no longer being matchable with anything. + let regression = inner + .entry + .regressions + .iter() + .filter_map(|r| r.try_match(&input).ok().map(|score| (*r, score))) + .min_by_key(|(_, score)| *score) + .map(|(r, _)| r) + .ok_or_else(|| { + anyhow::anyhow!( + "Could not match input tag \"{}\" and tolerance tag \"{}\" to \ + a valid benchmark. This likely means file or code changes \ + between when the input file was last used. 
If the normal \ + benchmark flow succeeds, please report this issue.", + inner.input.tag, + inner.tolerance.tag(), + ) + })?; + + Ok(Check { + regression, + tolerance: inner.tolerance.clone(), + input, + }) + }) + .collect::>>()?; + + Ok(Self { checks }) + } +} + +//---------// +// Helpers // +//---------// + +/// A raw unprocessed tolerance job. +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct RawInner { + input: jobs::Unprocessed, + tolerance: jobs::Unprocessed, +} + +impl RawInner { + pub(crate) fn new(input: jobs::Unprocessed, tolerance: jobs::Unprocessed) -> Self { + Self { input, tolerance } + } +} + +#[derive(Debug, Default, Serialize, Deserialize)] +pub(crate) struct Raw { + checks: Vec, +} + +impl Raw { + pub(crate) fn load(path: &Path) -> anyhow::Result { + load_from_disk(path) + } + + fn parse<'a>( + self, + entries: &'a HashMap<&'static str, registry::RegisteredTolerance<'a>>, + ) -> anyhow::Result> { + // Attempt to parse raw tolerances into registered tolerance inputs. + let num_checks = self.checks.len(); + let mut checker = Checker::new(vec![], None); + let inner = self + .checks + .into_iter() + .enumerate() + .map(|(i, unprocessed)| { + let context = || { + format!( + "while processing tolerance input {} of {}", + i.wrapping_add(1), + num_checks, + ) + }; + + // Does this tolerance tag matched a registered tolerance? + let entry = entries + .get(&*unprocessed.tolerance.tag) + .ok_or_else(|| { + anyhow::anyhow!( + "Unrecognized tolerance tag: \"{}\"", + unprocessed.tolerance.tag + ) + }) + .with_context(context)?; + + // Verify that the accompanying input tag is accepted by at least one + // benchmark registered under this tolerance. + if !entry + .regressions + .iter() + .any(|r| r.input_tag() == unprocessed.input.tag) + { + let valid: Vec<_> = entry + .regressions + .iter() + .map(|pair| pair.input_tag()) + .collect(); + return Err(anyhow::anyhow!( + "input tag \"{}\" is not compatible with tolerance tag \"{}\". 
\ + Valid input tags are: {:?}", + unprocessed.input.tag, + unprocessed.tolerance.tag, + valid, + )) + .with_context(context); + } + + checker.set_tag(entry.tolerance.tag()); + let tolerance = entry + .tolerance + .try_deserialize(&unprocessed.tolerance.content, &mut checker) + .with_context(context)?; + + Ok(ParsedInner { + entry, + tolerance: Rc::new(tolerance), + input: unprocessed.input, + }) + }) + .collect::>()?; + + Ok(Parsed { inner }) + } + + pub(crate) fn example() -> String { + #[expect( + clippy::expect_used, + reason = "we control the concrete struct and its serialization implementation" + )] + serde_json::to_string_pretty(&Self::default()) + .expect("built-in serialization should succeed") + } +} + +/// Invariants: +/// +/// * `tolerance` is parsed to the dynamic type of the associated tolerance in `entry`. +/// * The tag in `input` exists within at least one of the regressions in `entry`. +#[derive(Debug)] +struct ParsedInner<'a> { + entry: &'a registry::RegisteredTolerance<'a>, + tolerance: Rc, + input: jobs::Unprocessed, +} + +#[derive(Debug)] +struct Parsed<'a> { + inner: Vec>, +} + +/// Return `true` only `needle` is a structural subset of `haystack`. This is defined as: +/// +/// 1. All flattened paths of `needle` are flattened paths of `haystack`. +/// 2. The values at the end of all flattened paths are equal. +/// +/// When matching arrays, `needle` is matched as a potential prefix of the corresponding +/// entry in `haystack`. +#[must_use] +pub(crate) fn is_subset(mut haystack: &Value, mut needle: &Value) -> bool { + macro_rules! false_if { + ($expr:expr) => { + if $expr { + return false; + } + }; + } + + // Note that we use a `do-while` style loop to short-circuit situations where we + // match/mismatch immediately, saving an allocation. + // + // If we exit on the first iteration, the vector stays empty and thus doesn't allocate. 
+ let mut stack = Vec::new(); + loop { + match (haystack, needle) { + (Value::Null, Value::Null) => { + // Null always matches + } + (Value::Bool(h), Value::Bool(n)) => false_if!(h != n), + (Value::Number(h), Value::Number(n)) => false_if!(h != n), + (Value::String(h), Value::String(n)) => false_if!(h != n), + (Value::Array(h), Value::Array(n)) => { + // If `n` is longer, then it cannot possibly be a subset of `h`. + // On the flip side, if `n` is shorter, then we can at least try to match + // the prefix. + false_if!(h.len() < n.len()); + std::iter::zip(h.iter(), n.iter()).for_each(|(h, n)| stack.push((h, n))); + } + (Value::Object(h), Value::Object(n)) => { + for (k, v) in n.iter() { + match h.get(k) { + Some(h) => stack.push((h, v)), + None => return false, + } + } + } + // If the two enums are not the same, then we have a fundamental mismatch. + _ => return false, + } + + if let Some((h, n)) = stack.pop() { + (haystack, needle) = (h, n); + } else { + break; + } + } + + true +} + +/// A single problem detected during bipartite tolerance-to-input matching. +#[derive(Debug, PartialEq)] +enum MatchProblem { + /// Tolerance at this index matched no inputs. + OrphanedTolerance(usize), + /// Input at this index matched no tolerances. + UncoveredInput(usize), + /// Input at this index matched multiple tolerances. + AmbiguousInput(usize, Vec), +} + +/// Error returned when the bipartite matching between tolerance entries and inputs is +/// invalid. 
+#[derive(Debug)] +struct AmbiguousMatch(Vec); + +impl std::fmt::Display for AmbiguousMatch { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "tolerance matching failed:")?; + for problem in &self.0 { + match problem { + MatchProblem::OrphanedTolerance(i) => { + write!(f, "\n tolerance {} matched no inputs", i + 1)?; + } + MatchProblem::UncoveredInput(i) => { + write!(f, "\n input {} matched no tolerances", i + 1)?; + } + MatchProblem::AmbiguousInput(i, tolerances) => { + write!(f, "\n input {} matched tolerances ", i + 1)?; + for (j, &t) in tolerances.iter().enumerate() { + if j > 0 { + write!(f, ", ")?; + } + write!(f, "{}", t + 1)?; + } + } + } + } + Ok(()) + } +} + +impl std::error::Error for AmbiguousMatch {} + +/// Validate that every entry in `parsed_to_input` has at least one match and that all +/// entries in `input_to_parsed` have exactly one match. +/// +/// Return unique matches from `input_to_parsed` on success. Otherwise, return a +/// descriptive error. +fn check_matches( + parsed_to_input: Vec>, + input_to_parsed: Vec>, +) -> Result, AmbiguousMatch> { + let mut problems = Vec::new(); + + for (i, matches) in parsed_to_input.iter().enumerate() { + if matches.is_empty() { + problems.push(MatchProblem::OrphanedTolerance(i)); + } + } + + let mut result = Vec::with_capacity(input_to_parsed.len()); + for (i, matches) in input_to_parsed.into_iter().enumerate() { + match matches.len() { + 0 => problems.push(MatchProblem::UncoveredInput(i)), + 1 => result.push(matches[0]), + _ => problems.push(MatchProblem::AmbiguousInput(i, matches)), + } + } + + if problems.is_empty() { + Ok(result) + } else { + Err(AmbiguousMatch(problems)) + } +} + +////////// +// Jobs // +////////// + +/// A fully parsed and (hopefully) ready to run regression check. +#[derive(Debug)] +pub(crate) struct Job<'a> { + /// The executor for the actual check we wish to run. 
+ regression: registry::RegressionBenchmark<'a>, + + /// The [`crate::benchmark::Regression::Tolerance`] associated with `regression`. + tolerance: Rc, + + /// The [`crate::Benchmark::Input`] associated with `benchmark`. + input: Any, + + /// The [`result::RawResult`] from the "before" comparison. + /// + /// Payload should be deserializable to [`crate::Benchmark::Output`]. + before: result::RawResult, + + /// The [`result::RawResult`] from the "after" comparison. + /// + /// Payload should be deserializable to [`crate::Benchmark::Output`]. + after: result::RawResult, +} + +impl Job<'_> { + /// Actually run the jobs. + /// + /// As long as the chain of custody throughout this module is correct, at least the + /// `tolerance` and `input` fields should match the associated regression capable + /// `benchmark`. + /// + /// The associated outputs may still fail to deserialize properly and the check could + /// still fail. This is why the [`Jobs`] struct aggregates together all results before + /// deciding how they should be displayed. + fn run(&self) -> anyhow::Result { + self.regression.check( + &self.tolerance, + &self.input, + &self.before.results, + &self.after.results, + ) + } +} + +#[derive(Debug)] +pub(crate) struct Jobs<'a> { + jobs: Vec>, +} + +impl Jobs<'_> { + /// Run regression checks by comparing before/after output files against the matched + /// tolerances. + /// + /// The priority cascade for terminal output is: + /// + /// 1. If any checks produce an infrastructure error, report **all** errors and return + /// `Err`. Pass/fail results are suppressed so errors stay front-and-center. + /// 2. Otherwise, if any checks fail, report **all** failures and return `Err`. + /// Successes are suppressed for the same reason. + /// 3. Otherwise, all checks passed — report them and return `Ok`. + /// + /// The JSON output (if `output_file` is provided) is always written regardless of + /// outcome, so downstream tooling can inspect all results. 
+ /// + /// TODO: We could consider a `--verbose` flag to record all outcomes regardless of + /// priority, but for now the hierarchy of reporting seems the most pragmatic. + pub(crate) fn run( + &self, + mut output: &mut dyn crate::output::Output, + output_file: Option<&Path>, + ) -> anyhow::Result<()> { + // Step 1: Run all checks, collecting results. + let results: Vec<_> = self.jobs.iter().map(|job| job.run()).collect(); + + // Step 2: Build the JSON output array (always, even on errors). + let check_outputs: Vec> = std::iter::zip(self.jobs.iter(), results.iter()) + .map(|(job, result)| -> anyhow::Result<_> { + let tolerance = job.tolerance.serialize()?; + let o = match result { + Ok(PassFail::Pass(checked)) => CheckOutput::pass(tolerance, &checked.json), + Ok(PassFail::Fail(checked)) => CheckOutput::fail(tolerance, &checked.json), + Err(err) => CheckOutput::error(tolerance, err), + }; + + Ok(o) + }) + .collect::>()?; + + // Write JSON output before the cascade so it's available even on failure. + if let Some(path) = output_file { + let json = serde_json::to_string_pretty(&check_outputs)?; + std::fs::write(path, json) + .with_context(|| format!("failed to write output to \"{}\"", path.display()))?; + } + + // Step 3: If any errors, report all of them and bail. + let mut has_errors = false; + for (i, result) in results.iter().enumerate() { + if let Err(err) = result { + let job = &self.jobs[i]; + writeln!( + output, + "Check {} of {} ({:?}) encountered an error:\n{:?}\n", + i + 1, + self.jobs.len(), + job.regression.name(), + err, + )?; + has_errors = true; + } + } + if has_errors { + return Err(anyhow::anyhow!("one or more checks failed with errors")); + } + + // Step 4: All checks completed. Report any failures. + // (Safe to unwrap since we've handled all Err cases above.) 
+        let mut has_failures = false;
+        for (i, result) in results.iter().enumerate() {
+            #[expect(
+                clippy::expect_used,
+                reason = "we would have already returned if errors were present"
+            )]
+            let outcome = result
+                .as_ref()
+                .expect("no errors should be present any more");
+            if let PassFail::Fail(checked) = outcome {
+                let job = &self.jobs[i];
+                writeln!(
+                    output,
+                    "Check {} of {} ({:?}) FAILED:",
+                    i + 1,
+                    self.jobs.len(),
+                    job.regression.name(),
+                )?;
+                writeln!(output, "{}", checked.display)?;
+                writeln!(output)?;
+                has_failures = true;
+            }
+        }
+        if has_failures {
+            return Err(anyhow::anyhow!("one or more regression checks failed"));
+        }
+
+        // Step 5: Everything passed.
+        for (i, result) in results.iter().enumerate() {
+            #[expect(
+                clippy::expect_used,
+                reason = "we would have returned if errors were present"
+            )]
+            let outcome = result
+                .as_ref()
+                .expect("no errors should be present any more");
+            let PassFail::Pass(checked) = outcome else {
+                unreachable!("all failures handled above");
+            };
+            let job = &self.jobs[i];
+            writeln!(
+                output,
+                "Check {} of {} ({:?}) PASSED:",
+                i + 1,
+                self.jobs.len(),
+                job.regression.name(),
+            )?;
+            writeln!(output, "{}", checked.display)?;
+            writeln!(output)?;
+        }
+
+        Ok(())
+    }
+}
+
+/// Serialized output for a single regression check, suitable for downstream tooling.
+///
+/// Positional index in the output array corresponds to the input/tolerance files.
+#[derive(Serialize)] +struct CheckOutput<'a> { + status: &'static str, + tolerance: Value, + #[serde(skip_serializing_if = "Option::is_none")] + result: Option<&'a Value>, + #[serde(skip_serializing_if = "Option::is_none")] + error: Option, +} + +impl<'a> CheckOutput<'a> { + fn pass(tolerance: Value, result: &'a Value) -> Self { + Self { + status: "pass", + tolerance, + result: Some(result), + error: None, + } + } + + fn fail(tolerance: Value, result: &'a Value) -> Self { + Self { + status: "fail", + tolerance, + result: Some(result), + error: None, + } + } + + fn error(tolerance: Value, err: &anyhow::Error) -> Self { + let error = err + .chain() + .map(|e| e.to_string()) + .collect::>() + .join(": "); + Self { + status: "error", + tolerance, + result: None, + error: Some(error), + } + } +} + +/////////// +// Tests // +/////////// + +// Note: much of the functionality in this file is related to error handling and relies on +// having a fully functional registry. +// +// To that end, the UX tests are the primary test vessel for much of parsing code. +// The unit tests here stay focused on the bits that are actually feasibly unit testable. +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + /// Construct a vector of `serde_json::Values` with all possible variants. + /// + /// Aggregates `Array` and `Map` are empty. + fn empty_values() -> Vec { + vec![ + Value::Null, + Value::Bool(false), + Value::Number(serde_json::Number::from_f64(0.0).unwrap()), + Value::String(String::new()), + Value::Array(Vec::new()), + Value::Object(serde_json::Map::new()), + ] + } + + #[test] + fn test_is_subset() { + // Null + for v in empty_values() { + if matches!(v, Value::Null) { + assert!(is_subset(&Value::Null, &v)); + } else { + assert!(!is_subset(&Value::Null, &v)); + } + } + + // Bool / Number / String require exact equality and type matches. 
+ assert!(is_subset(&json!(true), &json!(true))); + assert!(!is_subset(&json!(true), &json!(false))); + assert!(!is_subset(&json!(true), &json!(0))); + + assert!(is_subset(&json!(7), &json!(7))); + assert!(!is_subset(&json!(7), &json!(8))); + assert!(!is_subset(&json!(7), &json!("7"))); + + assert!(is_subset(&json!("abc"), &json!("abc"))); + assert!(!is_subset(&json!("abc"), &json!("def"))); + + // Arrays match by prefix. + assert!(is_subset(&json!([1, 2, 3]), &json!([]))); + assert!(is_subset(&json!([1, 2, 3]), &json!([1]))); + assert!(is_subset(&json!([1, 2, 3]), &json!([1, 2]))); + assert!(is_subset(&json!([1, 2, 3]), &json!([1, 2, 3]))); + assert!(!is_subset(&json!([1, 2]), &json!([1, 2, 3]))); + assert!(!is_subset(&json!([1, 2, 3]), &json!([1, 3]))); + + // Objects match by recursive structural subset. + assert!(is_subset(&json!({"a": 1, "b": 2}), &json!({"a": 1}))); + assert!(is_subset(&json!({"a": 1, "b": 2}), &json!({}))); + assert!(is_subset( + &json!({"a": {"b": 1, "c": 2}, "d": 3}), + &json!({"a": {"b": 1}}), + )); + assert!(!is_subset(&json!({"a": 1}), &json!({"a": 1, "b": 2}),)); + assert!(!is_subset(&json!({"a": {"b": 1}}), &json!({"a": {"b": 2}}),)); + + // Nested array/object combinations use the same recursive rules. 
+ assert!(is_subset( + &json!({"ops": [{"kind": "l2", "dim": 128}, {"kind": "cosine", "dim": 256}]}), + &json!({"ops": [{"kind": "l2"}]}), + )); + assert!(is_subset( + &json!({"ops": [{"kind": "l2", "dim": 128}, {"kind": "cosine", "dim": 256}]}), + &json!({"ops": [{"kind": "l2", "dim": 128}, {"kind": "cosine"}]}), + )); + assert!(!is_subset( + &json!({"ops": [{"kind": "l2", "dim": 128}, {"kind": "cosine", "dim": 256}]}), + &json!({"ops": [{"kind": "cosine"}]}), + )); + } + + #[test] + fn test_check_matches_success() { + let result = check_matches(vec![vec![0], vec![1]], vec![vec![0], vec![1]]).unwrap(); + assert_eq!(result, vec![0, 1]); + } + + #[test] + fn test_check_matches_reports_problems_in_stable_order() { + let err = check_matches( + vec![vec![0], vec![], vec![2, 3]], + vec![vec![0], vec![], vec![2, 3]], + ) + .unwrap_err(); + + assert_eq!( + &err.0, + &[ + MatchProblem::OrphanedTolerance(1), + MatchProblem::UncoveredInput(1), + MatchProblem::AmbiguousInput(2, vec![2, 3]), + ] + ) + } +} diff --git a/diskann-benchmark-runner/src/jobs.rs b/diskann-benchmark-runner/src/jobs.rs index 64cdf5f50..185f310ff 100644 --- a/diskann-benchmark-runner/src/jobs.rs +++ b/diskann-benchmark-runner/src/jobs.rs @@ -22,6 +22,11 @@ impl Jobs { &self.jobs } + /// Consume `self`, returning the contained list of jobs. + pub(crate) fn into_inner(self) -> Vec { + self.jobs + } + /// Load `self` from a serialized JSON representation at `path`. /// /// In addition to deserializing the on-disk representation, the method also runs @@ -29,14 +34,18 @@ impl Jobs { /// /// * Resolution of input files. pub(crate) fn load(path: &Path, registry: ®istry::Inputs) -> anyhow::Result { - // Load the raw input. - let partial = Partial::load(path)?; + Self::parse(&Partial::load(path)?, registry) + } + /// Parse `self` from a [`Partial`]. + /// + /// This method also perform deserialization checks on the parsed inputs. 
+ pub(crate) fn parse(partial: &Partial, registry: ®istry::Inputs) -> anyhow::Result { let mut checker = Checker::new( partial .search_directories .iter() - .map(|i| PathBuf::from(&i)) + .map(PathBuf::from) .collect(), partial.output_directory.as_ref().map(PathBuf::from), ); @@ -58,7 +67,7 @@ impl Jobs { let input = registry .get(&unprocessed.tag) .ok_or_else(|| { - anyhow::anyhow!("Un-recognized input tag: \"{}\"", unprocessed.tag) + anyhow::anyhow!("Unrecognized input tag: \"{}\"", unprocessed.tag) }) .with_context(context)?; @@ -87,8 +96,8 @@ impl Jobs { #[derive(Debug, Serialize, Deserialize)] pub(crate) struct Unprocessed { #[serde(rename = "type")] - tag: String, - content: serde_json::Value, + pub(crate) tag: String, + pub(crate) content: serde_json::Value, } impl Unprocessed { @@ -105,6 +114,10 @@ impl Unprocessed { } } +/// A partially loaded input file. +/// +/// To reach this point, we at least the structure of the input JSON to be correct and +/// parseable. However, we have not yet mapped the raw JSON of any of the registered inputs. #[derive(Debug, Serialize, Deserialize)] pub(crate) struct Partial { /// Directories to search for input files. @@ -118,8 +131,10 @@ impl Partial { /// Load `self` from a serialized JSON representation at `path` without post-load /// validation. pub(crate) fn load(path: &Path) -> anyhow::Result { - let file = std::fs::File::open(path)?; - let reader = std::io::BufReader::new(file); - Ok(serde_json::from_reader(reader)?) + crate::internal::load_from_disk(path) + } + + pub(crate) fn jobs(&self) -> &[Unprocessed] { + &self.jobs } } diff --git a/diskann-benchmark-runner/src/lib.rs b/diskann-benchmark-runner/src/lib.rs index ab3f0b676..9b8cf3cdb 100644 --- a/diskann-benchmark-runner/src/lib.rs +++ b/diskann-benchmark-runner/src/lib.rs @@ -5,8 +5,9 @@ //! A moderately functional utility for making simple benchmarking CLI applications. 
-mod benchmark; +pub mod benchmark; mod checker; +mod internal; mod jobs; mod result; diff --git a/diskann-benchmark-runner/src/registry.rs b/diskann-benchmark-runner/src/registry.rs index 3eab77fcf..73e10c605 100644 --- a/diskann-benchmark-runner/src/registry.rs +++ b/diskann-benchmark-runner/src/registry.rs @@ -3,13 +3,13 @@ * Licensed under the MIT license. */ -use std::collections::HashMap; +use std::collections::{hash_map::Entry, HashMap}; use thiserror::Error; use crate::{ - benchmark::{self, Benchmark, DynBenchmark}, - dispatcher::FailureScore, + benchmark::{self, Benchmark, Regression}, + dispatcher::{FailureScore, MatchScore}, input, Any, Checkpoint, Input, Output, }; @@ -40,8 +40,6 @@ impl Inputs { where T: Input + 'static, { - use std::collections::hash_map::Entry; - let tag = T::tag(); match self.inputs.entry(tag) { Entry::Vacant(entry) => { @@ -71,9 +69,29 @@ impl Default for Inputs { } /// A registered benchmark entry: a name paired with a type-erased benchmark. -struct RegisteredBenchmark { +pub(crate) struct RegisteredBenchmark { name: String, - benchmark: Box, + benchmark: Box, +} + +impl std::fmt::Debug for RegisteredBenchmark { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let benchmark = Capture(&*self.benchmark, None); + f.debug_struct("RegisteredBenchmark") + .field("name", &self.name) + .field("benchmark", &benchmark) + .finish() + } +} + +impl RegisteredBenchmark { + pub(crate) fn name(&self) -> &str { + &self.name + } + + pub(crate) fn benchmark(&self) -> &dyn benchmark::internal::Benchmark { + &*self.benchmark + } } /// A collection of registered benchmarks. 
@@ -96,7 +114,7 @@ impl Benchmarks { { self.benchmarks.push(RegisteredBenchmark { name: name.into(), - benchmark: Box::new(benchmark::Wrapper::<T>::new()), + benchmark: Box::new(benchmark::internal::Wrapper::<T>::new()), }); } @@ -185,6 +203,58 @@ impl Benchmarks { .min_by_key(|(_, score)| *score) .map(|(entry, _)| entry) } + + //-------------------// + // Regression Checks // + //-------------------// + + /// Register a regression-checkable benchmark with the associated name. + /// + /// Upon registration, the associated [`Regression::Tolerances`] input and the benchmark + /// itself will be reachable via [`Check`](crate::app::Check). + pub fn register_regression<T>(&mut self, name: impl Into<String>) + where + T: Regression + 'static, + { + let registered = benchmark::internal::Wrapper::<T>::new_with( + benchmark::internal::WithRegression::<T>::new(), + ); + self.benchmarks.push(RegisteredBenchmark { + name: name.into(), + benchmark: Box::new(registered), + }); + } + + /// Return a collection of all tolerance-related inputs, keyed by the input tag type + /// of the tolerance. + pub(crate) fn tolerances(&self) -> HashMap<&'static str, RegisteredTolerance<'_>> { + let mut tolerances = HashMap::<&'static str, RegisteredTolerance<'_>>::new(); + for b in self.benchmarks.iter() { + if let Some(regression) = b.benchmark.as_regression() { + // If a tolerance input already exists, then simply add this benchmark + // to the list of benchmarks associated with the tolerance. + // + // Otherwise, create a new entry.
+ let t = regression.tolerance(); + let packaged = RegressionBenchmark { + benchmark: b, + regression, + }; + + match tolerances.entry(t.tag()) { + Entry::Occupied(occupied) => occupied.into_mut().regressions.push(packaged), + Entry::Vacant(vacant) => { + vacant.insert(RegisteredTolerance { + tolerance: input::Registered(t), + regressions: vec![packaged], + }); + } + } + } + } + + tolerances + } } impl Default for Benchmarks { @@ -211,11 +281,61 @@ impl Mismatch { } } -/// Helper to capture a `DynBenchmark::description` call into a `String` via `Display`. -struct Capture<'a>(&'a dyn DynBenchmark, Option<&'a Any>); +//----------// +// Internal // +//----------// + +#[derive(Debug, Clone, Copy)] +pub(crate) struct RegressionBenchmark<'a> { + benchmark: &'a RegisteredBenchmark, + regression: &'a dyn benchmark::internal::Regression, +} + +impl RegressionBenchmark<'_> { + pub(crate) fn name(&self) -> &str { + self.benchmark.name() + } + + pub(crate) fn input_tag(&self) -> &'static str { + self.regression.input_tag() + } + + pub(crate) fn try_match(&self, input: &Any) -> Result<MatchScore, FailureScore> { + self.benchmark.benchmark().try_match(input) + } + + pub(crate) fn check( + &self, + tolerance: &Any, + input: &Any, + before: &serde_json::Value, + after: &serde_json::Value, + ) -> anyhow::Result { + self.regression.check(tolerance, input, before, after) + } +} + +#[derive(Debug)] +pub(crate) struct RegisteredTolerance<'a> { + /// The tolerance parser. + pub(crate) tolerance: input::Registered<'a>, + + /// A single tolerance input can apply to multiple benchmarks. This field records all + /// benchmarks available in the registry that use this tolerance. + pub(crate) regressions: Vec<RegressionBenchmark<'a>>, +} + +/// Helper to capture a `Benchmark::description` call into a `String` via `Display`.
+struct Capture<'a>(&'a dyn benchmark::internal::Benchmark, Option<&'a Any>); impl std::fmt::Display for Capture<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.description(f, self.1) } } + +impl std::fmt::Debug for Capture<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.description(f, self.1) + } +} diff --git a/diskann-benchmark-runner/src/result.rs b/diskann-benchmark-runner/src/result.rs index 5be95f8db..cd8e34bb8 100644 --- a/diskann-benchmark-runner/src/result.rs +++ b/diskann-benchmark-runner/src/result.rs @@ -7,7 +7,7 @@ use std::path::Path; -use serde::{ser::SerializeSeq, Serialize, Serializer}; +use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer}; /// A helper to generate incremental snapshots of data while a benchmark is progressing. /// @@ -113,6 +113,19 @@ where Ok(()) } +/// A utility for loading results previously saved via [`Checkpoint::checkpoint`]. +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct RawResult { + pub(crate) input: serde_json::Value, + pub(crate) results: serde_json::Value, +} + +impl RawResult { + pub(crate) fn load(path: &Path) -> anyhow::Result<Vec<Self>> { + crate::internal::load_from_disk(path) + } +} + //////////////////////////// // Implementation Details // //////////////////////////// diff --git a/diskann-benchmark-runner/src/test/dim.rs b/diskann-benchmark-runner/src/test/dim.rs new file mode 100644 index 000000000..07e73e2a6 --- /dev/null +++ b/diskann-benchmark-runner/src/test/dim.rs @@ -0,0 +1,193 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license.
+ */ + +use std::io::Write; + +use serde::{Deserialize, Serialize}; + +use crate::{ + benchmark::{PassFail, Regression}, + dispatcher::{FailureScore, MatchScore}, + Any, Benchmark, CheckDeserialization, Checker, Checkpoint, Input, Output, +}; + +/////////// +// Input // +/////////// + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(super) struct DimInput { + dim: Option, +} + +impl DimInput { + fn new(dim: Option) -> Self { + Self { dim } + } + + fn run(&self) -> usize { + self.dim.unwrap_or(usize::MAX) + } +} + +impl Input for DimInput { + fn tag() -> &'static str { + "test-input-dim" + } + + fn try_deserialize( + serialized: &serde_json::Value, + checker: &mut Checker, + ) -> anyhow::Result { + checker.any(DimInput::deserialize(serialized)?) + } + + fn example() -> anyhow::Result { + Ok(serde_json::to_value(DimInput::new(Some(128)))?) + } +} + +impl CheckDeserialization for DimInput { + fn check_deserialization(&mut self, _checker: &mut Checker) -> anyhow::Result<()> { + Ok(()) + } +} + +/////////////// +// Tolerance // +/////////////// + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(super) struct Tolerance { + succeed: bool, + error_in_check: bool, +} + +impl Input for Tolerance { + fn tag() -> &'static str { + "test-input-dim-tolerance" + } + + fn try_deserialize( + serialized: &serde_json::Value, + _checker: &mut Checker, + ) -> anyhow::Result { + Ok(Any::new(Self::deserialize(serialized)?, Self::tag())) + } + + fn example() -> anyhow::Result { + let this = Self { + succeed: true, + error_in_check: false, + }; + Ok(serde_json::to_value(this)?) + } +} + +//////////////// +// Benchmarks // +//////////////// + +// A simple benchmark that doesn't implement [`Regression`] and only matches `None` variants +// of `DimInput`. 
+#[derive(Debug)] +pub(super) struct SimpleBench; + +impl Benchmark for SimpleBench { + type Input = DimInput; + type Output = usize; + + fn try_match(input: &DimInput) -> Result { + if input.dim.is_none() { + Ok(MatchScore(0)) + } else { + Err(FailureScore(1000)) + } + } + + fn description(f: &mut std::fmt::Formatter<'_>, input: Option<&DimInput>) -> std::fmt::Result { + match input { + Some(input) if input.dim.is_none() => write!(f, "successful match"), + Some(_) => write!(f, "expected dim=None"), + None => write!(f, "dim=None only"), + } + } + + fn run( + input: &DimInput, + _checkpoint: Checkpoint<'_>, + mut output: &mut dyn Output, + ) -> anyhow::Result { + write!(output, "simple bench: {:?}", input.dim)?; + Ok(input.run()) + } +} + +// A more general version of `SimpleBench` that matches all flavors of `dim`. +#[derive(Debug)] +pub(super) struct DimBench; + +impl Benchmark for DimBench { + type Input = DimInput; + type Output = usize; + + fn try_match(_input: &DimInput) -> Result { + Ok(MatchScore(0)) + } + + fn description(f: &mut std::fmt::Formatter<'_>, input: Option<&DimInput>) -> std::fmt::Result { + if input.is_some() { + write!(f, "perfect match") + } else { + write!(f, "matches all") + } + } + + fn run( + input: &DimInput, + _checkpoint: Checkpoint<'_>, + mut output: &mut dyn Output, + ) -> anyhow::Result { + write!(output, "dim bench: {:?}", input.dim)?; + Ok(input.run()) + } +} + +impl Regression for DimBench { + type Tolerances = Tolerance; + type Pass = &'static str; + type Fail = &'static str; + + fn check( + tolerance: &Tolerance, + input: &DimInput, + before: &usize, + after: &usize, + ) -> anyhow::Result> { + let Tolerance { + succeed, + error_in_check, + } = tolerance; + if *error_in_check { + anyhow::bail!("simulated check error"); + } + + // This check here mainly serves to verify that the before and after results were + // propagated correctly. + // + // Really, this is a unit test masquerading behind an integration test. 
+ let expected = input.run(); + assert_eq!(*before, expected); + assert_eq!(*after, expected); + + // The success or failure of the benchmark depends on the configuration of the + // tolerance. + if *succeed { + Ok(PassFail::Pass("we did it!")) + } else { + Ok(PassFail::Fail("we didn't do it!")) + } + } +} diff --git a/diskann-benchmark-runner/src/test/mod.rs b/diskann-benchmark-runner/src/test/mod.rs new file mode 100644 index 000000000..540842d4f --- /dev/null +++ b/diskann-benchmark-runner/src/test/mod.rs @@ -0,0 +1,31 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +use crate::registry; + +// submodules +mod dim; +mod typed; + +pub(crate) use typed::TypeInput; + +///////// +// API // +///////// + +pub fn register_inputs(inputs: &mut registry::Inputs) -> anyhow::Result<()> { + inputs.register::()?; + inputs.register::()?; + Ok(()) +} + +pub fn register_benchmarks(benchmarks: &mut registry::Benchmarks) { + benchmarks.register_regression::>("type-bench-f32"); + benchmarks.register_regression::>("type-bench-i8"); + benchmarks.register_regression::>("exact-type-bench-f32-1000"); + + benchmarks.register::("simple-bench"); + benchmarks.register_regression::("dim-bench"); +} diff --git a/diskann-benchmark-runner/src/test.rs b/diskann-benchmark-runner/src/test/typed.rs similarity index 60% rename from diskann-benchmark-runner/src/test.rs rename to diskann-benchmark-runner/src/test/typed.rs index 11a08e850..ed49b8b22 100644 --- a/diskann-benchmark-runner/src/test.rs +++ b/diskann-benchmark-runner/src/test/typed.rs @@ -8,39 +8,22 @@ use std::io::Write; use serde::{Deserialize, Serialize}; use crate::{ + benchmark::{PassFail, Regression}, dispatcher::{Description, DispatchRule, FailureScore, MatchScore}, - registry, utils::datatype::{DataType, Type}, Any, Benchmark, CheckDeserialization, Checker, Checkpoint, Input, Output, }; -///////// -// API // -///////// - -pub fn register_inputs(inputs: &mut registry::Inputs) -> 
anyhow::Result<()> { - inputs.register::()?; - inputs.register::()?; - Ok(()) -} - -pub fn register_benchmarks(benchmarks: &mut registry::Benchmarks) { - benchmarks.register::>("type-bench-f32"); - benchmarks.register::>("type-bench-i8"); - benchmarks.register::>("exact-type-bench-f32-1000"); - benchmarks.register::("dim-bench"); -} - -//////////// -// Inputs // -//////////// +/////////// +// Input // +/////////// #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub(crate) struct TypeInput { - pub(crate) data_type: DataType, - pub(crate) dim: usize, + pub(super) data_type: DataType, + pub(super) dim: usize, // Should we return an error when `check_deserialization` is called? - pub(crate) error_when_checked: bool, + pub(super) error_when_checked: bool, // A flag to verify that [`CheckDeserialization`] has run. #[serde(skip)] pub(crate) checked: bool, @@ -55,6 +38,10 @@ impl TypeInput { checked: false, } } + + fn run(&self) -> &'static str { + self.data_type.as_str() + } } impl Input for TypeInput { @@ -89,37 +76,49 @@ impl CheckDeserialization for TypeInput { } } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub(crate) struct DimInput { - pub(crate) dim: Option, -} +/////////////// +// Tolerance // +/////////////// -impl DimInput { - pub(crate) fn new(dim: Option) -> Self { - Self { dim } - } +#[derive(Debug, Serialize, Deserialize)] +pub(super) struct Tolerance { + // Should we return an error when `check_deserialization` is called? + pub(super) error_when_checked: bool, + + // A flag to verify that [`CheckDeserialization`] has run. + #[serde(skip)] + pub(crate) checked: bool, } -impl Input for DimInput { +impl Input for Tolerance { fn tag() -> &'static str { - "test-input-dim" + "test-input-types-tolerance" } fn try_deserialize( serialized: &serde_json::Value, checker: &mut Checker, ) -> anyhow::Result { - checker.any(DimInput::deserialize(serialized)?) + checker.any(Self::deserialize(serialized)?) 
} fn example() -> anyhow::Result { - Ok(serde_json::to_value(DimInput::new(Some(128)))?) + let this = Self { + error_when_checked: false, + checked: false, + }; + Ok(serde_json::to_value(this)?) } } -impl CheckDeserialization for DimInput { +impl CheckDeserialization for Tolerance { fn check_deserialization(&mut self, _checker: &mut Checker) -> anyhow::Result<()> { - Ok(()) + if self.error_when_checked { + Err(anyhow::anyhow!("test input erroring when checked")) + } else { + self.checked = true; + Ok(()) + } } } @@ -128,7 +127,7 @@ impl CheckDeserialization for DimInput { //////////////// #[derive(Debug)] -struct TypeBench(std::marker::PhantomData); +pub(super) struct TypeBench(std::marker::PhantomData); impl Benchmark for TypeBench where @@ -136,7 +135,7 @@ where Type: DispatchRule, { type Input = TypeInput; - type Output = &'static str; + type Output = String; fn try_match(input: &TypeInput) -> Result { // Try to match based on data type. @@ -153,14 +152,42 @@ where checkpoint: Checkpoint<'_>, mut output: &mut dyn Output, ) -> anyhow::Result { - write!(output, "hello: {}", input.data_type.as_str())?; - checkpoint.checkpoint(input.data_type.as_str())?; - Ok(input.data_type.as_str()) + let result = input.run().to_string(); + write!(output, "hello: {}", result)?; + checkpoint.checkpoint(&result)?; + Ok(result) + } +} + +impl Regression for TypeBench +where + T: 'static, + Type: DispatchRule, +{ + type Tolerances = Tolerance; + type Pass = DataType; + type Fail = DataType; + + fn check( + _tolerance: &Tolerance, + input: &TypeInput, + before: &String, + after: &String, + ) -> anyhow::Result> { + // This check here mainly serves to verify that the before and after results were + // propagated correctly. + // + // Really, this is a unit test masquerading behind an integration test. 
+ let expected = input.run(); + assert_eq!(*before, expected); + assert_eq!(*after, expected); + + Ok(PassFail::Pass(input.data_type)) } } #[derive(Debug)] -struct ExactTypeBench(std::marker::PhantomData); +pub(super) struct ExactTypeBench(std::marker::PhantomData); impl Benchmark for ExactTypeBench where @@ -216,31 +243,31 @@ where } } -#[derive(Debug)] -struct DimBench; - -impl Benchmark for DimBench { - type Input = DimInput; - type Output = usize; - - fn try_match(_input: &DimInput) -> Result { - Ok(MatchScore(0)) - } - - fn description(f: &mut std::fmt::Formatter<'_>, input: Option<&DimInput>) -> std::fmt::Result { - if input.is_some() { - write!(f, "perfect match") - } else { - write!(f, "matches all") - } - } +impl Regression for ExactTypeBench +where + T: 'static, + Type: DispatchRule, +{ + type Tolerances = Tolerance; + type Pass = String; + type Fail = String; - fn run( - input: &DimInput, - _checkpoint: Checkpoint<'_>, - mut output: &mut dyn Output, - ) -> anyhow::Result { - write!(output, "dim bench: {:?}", input.dim)?; - Ok(input.dim.unwrap_or(usize::MAX)) + fn check( + _tolerance: &Tolerance, + input: &TypeInput, + before: &String, + after: &String, + ) -> anyhow::Result> { + // Verify correct dispatch: ExactTypeBench produces a different output format than + // TypeBench. If the wrong benchmark was dispatched, the assertion below will catch + // it. 
+ let expected = format!("hello<{}>: {}", N, input.data_type.as_str()); + assert_eq!(*before, expected); + assert_eq!(*after, expected); + + Ok(PassFail::Pass(format!( + "exact match dim={} type={}", + N, input.data_type + ))) } } diff --git a/diskann-benchmark-runner/src/utils/fmt.rs b/diskann-benchmark-runner/src/utils/fmt.rs index e6557871d..e00ec9275 100644 --- a/diskann-benchmark-runner/src/utils/fmt.rs +++ b/diskann-benchmark-runner/src/utils/fmt.rs @@ -59,6 +59,7 @@ impl Table { Row::new(self, row) } + #[expect(clippy::panic, reason = "table interfaces are bounds checked")] fn check_bounds(&self, row: usize, col: usize) { if row >= self.nrows() { panic!("row {} is out of bounds (max {})", row, self.nrows()); diff --git a/diskann-benchmark-runner/src/utils/mod.rs b/diskann-benchmark-runner/src/utils/mod.rs index d6a780330..9f48f9413 100644 --- a/diskann-benchmark-runner/src/utils/mod.rs +++ b/diskann-benchmark-runner/src/utils/mod.rs @@ -6,6 +6,7 @@ pub mod datatype; pub mod fmt; pub mod microseconds; +pub mod num; pub mod percentiles; pub use microseconds::MicroSeconds; diff --git a/diskann-benchmark-runner/src/utils/num.rs b/diskann-benchmark-runner/src/utils/num.rs new file mode 100644 index 000000000..049c2800f --- /dev/null +++ b/diskann-benchmark-runner/src/utils/num.rs @@ -0,0 +1,223 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Number utilities for enforcing deserialization constraints and computing relative errors. + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use thiserror::Error; + +/// Compute the relative change from `before` to `after`. +/// +/// This helper is intentionally opinionated for benchmark-style metrics: +/// +/// - `before` must be finite and strictly positive. +/// - `after` must be finite and non-negative. 
+/// +/// In other words, this computes: +/// ```text +/// (after - before) / before +/// ``` +/// +/// Negative values indicate improvements while positive values indicate regressions. +pub fn relative_change(before: f64, after: f64) -> Result { + if !before.is_finite() { + return Err(RelativeChangeError::NonFiniteBefore); + } + if before <= 0.0 { + return Err(RelativeChangeError::NonPositiveBefore); + } + + let after = NonNegativeFinite::new(after).map_err(RelativeChangeError::InvalidAfter)?; + let after = after.get(); + + let change = (after - before) / before; + if !change.is_finite() { + return Err(RelativeChangeError::NonFiniteComputedChange); + } + + Ok(change) +} + +/// Error returned when attempting to compute a relative change. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Error)] +pub enum RelativeChangeError { + #[error("expected \"before\" to be a finite number")] + NonFiniteBefore, + #[error("expected \"before\" to be greater than zero")] + NonPositiveBefore, + #[error("invalid \"after\" value: {0}")] + InvalidAfter(InvalidNonNegativeFinite), + #[error("computed relative change is not finite")] + NonFiniteComputedChange, +} + +/// A finite floating-point value that is greater than or equal to zero. +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] +pub struct NonNegativeFinite(f64); + +impl NonNegativeFinite { + /// Attempt to construct `Self` from `value`. + pub const fn new(value: f64) -> Result { + if !value.is_finite() { + Err(InvalidNonNegativeFinite::NonFinite) + } else if value < 0.0 { + Err(InvalidNonNegativeFinite::Negative) + } else if value == 0.0 { + Ok(Self(0.0)) + } else { + Ok(Self(value)) + } + } + + /// Return the underlying floating-point value. 
+ pub const fn get(self) -> f64 { + self.0 + } +} + +impl std::fmt::Display for NonNegativeFinite { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl TryFrom for NonNegativeFinite { + type Error = InvalidNonNegativeFinite; + + fn try_from(value: f64) -> Result { + Self::new(value) + } +} + +impl From for f64 { + fn from(value: NonNegativeFinite) -> Self { + value.get() + } +} + +impl Serialize for NonNegativeFinite { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_f64(self.0) + } +} + +impl<'de> Deserialize<'de> for NonNegativeFinite { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let value = f64::deserialize(deserializer)?; + Self::new(value).map_err(serde::de::Error::custom) + } +} + +/// Error returned when attempting to construct a [`NonNegativeFinite`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Error)] +pub enum InvalidNonNegativeFinite { + #[error("expected a finite number")] + NonFinite, + #[error("expected a non-negative number")] + Negative, +} + +/////////// +// Tests // +/////////// + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_new() { + let to_non_negative = + |x: f64| -> Result { x.try_into() }; + let to_f64 = |x: NonNegativeFinite| -> f64 { x.into() }; + + assert_eq!(NonNegativeFinite::new(0.0).unwrap().get(), 0.0); + assert_eq!(NonNegativeFinite::new(-0.0).unwrap().get(), 0.0); + assert_eq!(NonNegativeFinite::new(0.25).unwrap().get(), 0.25); + + assert_eq!(to_f64(to_non_negative(0.0).unwrap()), 0.0); + assert_eq!(to_f64(to_non_negative(-0.0).unwrap()), 0.0); + assert_eq!(to_f64(to_non_negative(0.25).unwrap()), 0.25); + + assert_eq!(to_non_negative(0.25).unwrap().to_string(), 0.25.to_string()); + + assert_eq!( + NonNegativeFinite::new(-1.0).unwrap_err(), + InvalidNonNegativeFinite::Negative + ); + assert_eq!( + to_non_negative(-1.0).unwrap_err(), + 
InvalidNonNegativeFinite::Negative + ); + + assert_eq!( + NonNegativeFinite::new(f64::INFINITY).unwrap_err(), + InvalidNonNegativeFinite::NonFinite + ); + assert_eq!( + NonNegativeFinite::new(f64::NEG_INFINITY).unwrap_err(), + InvalidNonNegativeFinite::NonFinite + ); + assert_eq!( + NonNegativeFinite::new(f64::NAN).unwrap_err(), + InvalidNonNegativeFinite::NonFinite + ); + } + + #[test] + fn test_serde() { + let value: NonNegativeFinite = serde_json::from_str("0.1").unwrap(); + assert_eq!(value.get(), 0.1); + + let serialized = serde_json::to_string(&value).unwrap(); + assert_eq!(serialized, "0.1"); + + let err = serde_json::from_str::("-0.5").unwrap_err(); + assert!(err.to_string().contains("expected a non-negative number")); + } + + #[test] + fn test_relative_change() { + assert_eq!(relative_change(10.0, 10.0).unwrap(), 0.0); + assert_eq!(relative_change(10.0, 12.5).unwrap(), 0.25); + assert_eq!(relative_change(10.0, 8.0).unwrap(), -0.2); + assert_eq!(relative_change(10.0, -0.0).unwrap(), -1.0); + + assert_eq!( + relative_change(0.0, 1.0).unwrap_err(), + RelativeChangeError::NonPositiveBefore + ); + assert_eq!( + relative_change(-1.0, 1.0).unwrap_err(), + RelativeChangeError::NonPositiveBefore + ); + assert_eq!( + relative_change(f64::NAN, 1.0).unwrap_err(), + RelativeChangeError::NonFiniteBefore + ); + assert_eq!( + relative_change(f64::INFINITY, 1.0).unwrap_err(), + RelativeChangeError::NonFiniteBefore + ); + assert_eq!( + relative_change(1.0, -1.0).unwrap_err(), + RelativeChangeError::InvalidAfter(InvalidNonNegativeFinite::Negative) + ); + assert_eq!( + relative_change(1.0, f64::NAN).unwrap_err(), + RelativeChangeError::InvalidAfter(InvalidNonNegativeFinite::NonFinite) + ); + assert_eq!( + relative_change(f64::MIN_POSITIVE, f64::MAX).unwrap_err(), + RelativeChangeError::NonFiniteComputedChange + ); + } +} diff --git a/diskann-benchmark-runner/src/utils/percentiles.rs b/diskann-benchmark-runner/src/utils/percentiles.rs index f551837ae..7b519e5f2 100644 --- 
a/diskann-benchmark-runner/src/utils/percentiles.rs +++ b/diskann-benchmark-runner/src/utils/percentiles.rs @@ -11,7 +11,9 @@ use thiserror::Error; pub struct CannotBeEmpty; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)] +#[non_exhaustive] pub struct Percentiles { + pub minimum: T, pub mean: f64, pub median: f64, pub p90: T, @@ -77,6 +79,7 @@ where let p99 = x[((99 * len) / 100).min(len - 1)]; Ok(Percentiles { + minimum: x[0], mean, median, p90, @@ -155,6 +158,7 @@ mod tests { let v: &mut [u64] = &mut [10]; let p = compute_percentiles(v).unwrap(); let e = Percentiles { + minimum: 10, mean: 10.0, median: 10.0, p90: 10, @@ -168,6 +172,7 @@ mod tests { let v: &mut [u64] = &mut [2, 1]; let p = compute_percentiles(v).unwrap(); let e = Percentiles { + minimum: 1, mean: 1.5, median: 1.5, p90: 2, @@ -181,6 +186,7 @@ mod tests { let v: &mut [u64] = &mut [2, 1, 3]; let p = compute_percentiles(v).unwrap(); let e = Percentiles { + minimum: 1, mean: 2.0, median: 2.0, p90: 3, @@ -194,6 +200,7 @@ mod tests { let v: &mut [u64] = &mut [2, 1, 3, 4, 9, 6, 7, 5, 8]; let p = compute_percentiles(v).unwrap(); let e = Percentiles { + minimum: 1, mean: 5.0, median: 5.0, p90: 9, @@ -207,6 +214,7 @@ mod tests { let v: &mut [u64] = &mut [2, 10, 1, 3, 4, 9, 6, 7, 5, 8]; let p = compute_percentiles(v).unwrap(); let e = Percentiles { + minimum: 1, mean: 5.5, median: 5.5, p90: 10, @@ -220,6 +228,7 @@ mod tests { let v: &mut [u64] = &mut [2, 10, 1, 3, 4, 9, 6, 11, 7, 5, 8]; let p = compute_percentiles(v).unwrap(); let e = Percentiles { + minimum: 1, mean: 6.0, median: 6.0, p90: 10, diff --git a/diskann-benchmark-runner/src/ux.rs b/diskann-benchmark-runner/src/ux.rs index c29c9617a..1fa221a54 100644 --- a/diskann-benchmark-runner/src/ux.rs +++ b/diskann-benchmark-runner/src/ux.rs @@ -17,6 +17,15 @@ pub fn normalize(s: String) -> String { trimmed.replace("\r\n", "\n") } +/// Replace all occurrences of `path` in `s` with `replacement`. 
+/// +/// This is useful for scrubbing non-deterministic paths (e.g. temp directories) from test +/// output before comparison. +#[doc(hidden)] +pub fn scrub_path(s: String, path: &std::path::Path, replacement: &str) -> String { + s.replace(&path.display().to_string(), replacement) +} + // There does not appear to be a supported was of checking whether backtraces are // enabled without first actually capturing a backtrace. static BACKTRACE_ENABLED: LazyLock = LazyLock::new(|| { @@ -26,14 +35,14 @@ static BACKTRACE_ENABLED: LazyLock = LazyLock::new(|| { /// Strip the backtrace from the string representation of an [`anyhow::Error`] debug /// diagnostic if running with backtraces enabled. +/// +/// This works even if multiple [`anyhow::Error`]s are present. #[doc(hidden)] pub fn strip_backtrace(s: String) -> String { if !*BACKTRACE_ENABLED { return s; } - // Split into lines until we see `Stack backtrace`, then drop the empty - // // Prints with stack traces will looks something like // ``` // while processing input 2 of 2 @@ -42,29 +51,34 @@ pub fn strip_backtrace(s: String) -> String { // unknown variant `f32`, expected one of `float64`, `float32`, // // Stack backtrace: - // 0: + // 0: somestuff + // more stuff + // maybe a note + // // ``` - // This works by splitting the output into lines - looking for the keyword - // `Stack backtrace` and taking all lines up to that point. - let mut stacktrace_found = false; + // Importantly, there is an empty line after the stacktrace is finished. + // + // The loop simply looks for the `Stack backtrace:` line and then ignores lines from + // that point on until an empty line is observed. + // + // This seems to handle cases where printouts have multiple errors just fine. 
+ let mut in_stacktrace = false; let lines: Vec<_> = s .lines() - .take_while(|l| { - stacktrace_found = *l == "Stack backtrace:"; - !stacktrace_found + .filter(|l| { + if in_stacktrace { + if l.is_empty() { + in_stacktrace = false; + } + false + } else if *l == "Stack backtrace:" { + in_stacktrace = true; + false + } else { + true + } }) .collect(); - if lines.is_empty() { - String::new() - } else if stacktrace_found { - // When `anyhow` inserts a backtrace - it separates the body of the error from - // the stack trace with a newline. This strips that newline. - // - // Indexing is okay because we've already handled the empty case. - lines[..lines.len() - 1].join("\n") - } else { - // No stacktrace found - do not strip a trailing empty line. - lines.join("\n") - } + lines.join("\n") } diff --git a/diskann-benchmark-runner/tests/test-0/README.md b/diskann-benchmark-runner/tests/benchmark/test-0/README.md similarity index 100% rename from diskann-benchmark-runner/tests/test-0/README.md rename to diskann-benchmark-runner/tests/benchmark/test-0/README.md diff --git a/diskann-benchmark-runner/tests/test-0/stdin.txt b/diskann-benchmark-runner/tests/benchmark/test-0/stdin.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-0/stdin.txt rename to diskann-benchmark-runner/tests/benchmark/test-0/stdin.txt diff --git a/diskann-benchmark-runner/tests/test-0/stdout.txt b/diskann-benchmark-runner/tests/benchmark/test-0/stdout.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-0/stdout.txt rename to diskann-benchmark-runner/tests/benchmark/test-0/stdout.txt diff --git a/diskann-benchmark-runner/tests/test-1/README.md b/diskann-benchmark-runner/tests/benchmark/test-1/README.md similarity index 100% rename from diskann-benchmark-runner/tests/test-1/README.md rename to diskann-benchmark-runner/tests/benchmark/test-1/README.md diff --git a/diskann-benchmark-runner/tests/test-1/stdin.txt 
b/diskann-benchmark-runner/tests/benchmark/test-1/stdin.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-1/stdin.txt rename to diskann-benchmark-runner/tests/benchmark/test-1/stdin.txt diff --git a/diskann-benchmark-runner/tests/test-1/stdout.txt b/diskann-benchmark-runner/tests/benchmark/test-1/stdout.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-1/stdout.txt rename to diskann-benchmark-runner/tests/benchmark/test-1/stdout.txt diff --git a/diskann-benchmark-runner/tests/test-2/README.md b/diskann-benchmark-runner/tests/benchmark/test-2/README.md similarity index 100% rename from diskann-benchmark-runner/tests/test-2/README.md rename to diskann-benchmark-runner/tests/benchmark/test-2/README.md diff --git a/diskann-benchmark-runner/tests/test-2/stdin.txt b/diskann-benchmark-runner/tests/benchmark/test-2/stdin.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-2/stdin.txt rename to diskann-benchmark-runner/tests/benchmark/test-2/stdin.txt diff --git a/diskann-benchmark-runner/tests/test-2/stdout.txt b/diskann-benchmark-runner/tests/benchmark/test-2/stdout.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-2/stdout.txt rename to diskann-benchmark-runner/tests/benchmark/test-2/stdout.txt diff --git a/diskann-benchmark-runner/tests/test-3/README.md b/diskann-benchmark-runner/tests/benchmark/test-3/README.md similarity index 100% rename from diskann-benchmark-runner/tests/test-3/README.md rename to diskann-benchmark-runner/tests/benchmark/test-3/README.md diff --git a/diskann-benchmark-runner/tests/test-3/stdin.txt b/diskann-benchmark-runner/tests/benchmark/test-3/stdin.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-3/stdin.txt rename to diskann-benchmark-runner/tests/benchmark/test-3/stdin.txt diff --git a/diskann-benchmark-runner/tests/test-3/stdout.txt b/diskann-benchmark-runner/tests/benchmark/test-3/stdout.txt similarity index 100% rename 
from diskann-benchmark-runner/tests/test-3/stdout.txt rename to diskann-benchmark-runner/tests/benchmark/test-3/stdout.txt diff --git a/diskann-benchmark-runner/tests/test-4/README.md b/diskann-benchmark-runner/tests/benchmark/test-4/README.md similarity index 100% rename from diskann-benchmark-runner/tests/test-4/README.md rename to diskann-benchmark-runner/tests/benchmark/test-4/README.md diff --git a/diskann-benchmark-runner/tests/test-4/stdin.txt b/diskann-benchmark-runner/tests/benchmark/test-4/stdin.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-4/stdin.txt rename to diskann-benchmark-runner/tests/benchmark/test-4/stdin.txt diff --git a/diskann-benchmark-runner/tests/test-4/stdout.txt b/diskann-benchmark-runner/tests/benchmark/test-4/stdout.txt similarity index 81% rename from diskann-benchmark-runner/tests/test-4/stdout.txt rename to diskann-benchmark-runner/tests/benchmark/test-4/stdout.txt index edb313c6b..0a37ae5a5 100644 --- a/diskann-benchmark-runner/tests/test-4/stdout.txt +++ b/diskann-benchmark-runner/tests/benchmark/test-4/stdout.txt @@ -5,5 +5,7 @@ Registered Benchmarks: int8 exact-type-bench-f32-1000: tag "test-input-types" float32, dim=1000 + simple-bench: tag "test-input-dim" + dim=None only dim-bench: tag "test-input-dim" matches all \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/test-deserialization-error-0/README.md b/diskann-benchmark-runner/tests/benchmark/test-deserialization-error-0/README.md similarity index 100% rename from diskann-benchmark-runner/tests/test-deserialization-error-0/README.md rename to diskann-benchmark-runner/tests/benchmark/test-deserialization-error-0/README.md diff --git a/diskann-benchmark-runner/tests/test-deserialization-error-0/input.json b/diskann-benchmark-runner/tests/benchmark/test-deserialization-error-0/input.json similarity index 100% rename from diskann-benchmark-runner/tests/test-deserialization-error-0/input.json rename to 
diskann-benchmark-runner/tests/benchmark/test-deserialization-error-0/input.json diff --git a/diskann-benchmark-runner/tests/test-deserialization-error-0/stdin.txt b/diskann-benchmark-runner/tests/benchmark/test-deserialization-error-0/stdin.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-deserialization-error-0/stdin.txt rename to diskann-benchmark-runner/tests/benchmark/test-deserialization-error-0/stdin.txt diff --git a/diskann-benchmark-runner/tests/test-deserialization-error-0/stdout.txt b/diskann-benchmark-runner/tests/benchmark/test-deserialization-error-0/stdout.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-deserialization-error-0/stdout.txt rename to diskann-benchmark-runner/tests/benchmark/test-deserialization-error-0/stdout.txt diff --git a/diskann-benchmark-runner/tests/test-mismatch-0/README.md b/diskann-benchmark-runner/tests/benchmark/test-mismatch-0/README.md similarity index 100% rename from diskann-benchmark-runner/tests/test-mismatch-0/README.md rename to diskann-benchmark-runner/tests/benchmark/test-mismatch-0/README.md diff --git a/diskann-benchmark-runner/tests/test-mismatch-0/input.json b/diskann-benchmark-runner/tests/benchmark/test-mismatch-0/input.json similarity index 100% rename from diskann-benchmark-runner/tests/test-mismatch-0/input.json rename to diskann-benchmark-runner/tests/benchmark/test-mismatch-0/input.json diff --git a/diskann-benchmark-runner/tests/test-mismatch-0/stdin.txt b/diskann-benchmark-runner/tests/benchmark/test-mismatch-0/stdin.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-mismatch-0/stdin.txt rename to diskann-benchmark-runner/tests/benchmark/test-mismatch-0/stdin.txt diff --git a/diskann-benchmark-runner/tests/test-mismatch-0/stdout.txt b/diskann-benchmark-runner/tests/benchmark/test-mismatch-0/stdout.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-mismatch-0/stdout.txt rename to 
diskann-benchmark-runner/tests/benchmark/test-mismatch-0/stdout.txt diff --git a/diskann-benchmark-runner/tests/test-mismatch-1/README.md b/diskann-benchmark-runner/tests/benchmark/test-mismatch-1/README.md similarity index 100% rename from diskann-benchmark-runner/tests/test-mismatch-1/README.md rename to diskann-benchmark-runner/tests/benchmark/test-mismatch-1/README.md diff --git a/diskann-benchmark-runner/tests/test-mismatch-1/input.json b/diskann-benchmark-runner/tests/benchmark/test-mismatch-1/input.json similarity index 100% rename from diskann-benchmark-runner/tests/test-mismatch-1/input.json rename to diskann-benchmark-runner/tests/benchmark/test-mismatch-1/input.json diff --git a/diskann-benchmark-runner/tests/test-mismatch-1/stdin.txt b/diskann-benchmark-runner/tests/benchmark/test-mismatch-1/stdin.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-mismatch-1/stdin.txt rename to diskann-benchmark-runner/tests/benchmark/test-mismatch-1/stdin.txt diff --git a/diskann-benchmark-runner/tests/test-mismatch-1/stdout.txt b/diskann-benchmark-runner/tests/benchmark/test-mismatch-1/stdout.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-mismatch-1/stdout.txt rename to diskann-benchmark-runner/tests/benchmark/test-mismatch-1/stdout.txt diff --git a/diskann-benchmark-runner/tests/test-overload-0/README.md b/diskann-benchmark-runner/tests/benchmark/test-overload-0/README.md similarity index 100% rename from diskann-benchmark-runner/tests/test-overload-0/README.md rename to diskann-benchmark-runner/tests/benchmark/test-overload-0/README.md diff --git a/diskann-benchmark-runner/tests/test-overload-0/input.json b/diskann-benchmark-runner/tests/benchmark/test-overload-0/input.json similarity index 100% rename from diskann-benchmark-runner/tests/test-overload-0/input.json rename to diskann-benchmark-runner/tests/benchmark/test-overload-0/input.json diff --git a/diskann-benchmark-runner/tests/test-overload-0/output.json 
b/diskann-benchmark-runner/tests/benchmark/test-overload-0/output.json similarity index 100% rename from diskann-benchmark-runner/tests/test-overload-0/output.json rename to diskann-benchmark-runner/tests/benchmark/test-overload-0/output.json diff --git a/diskann-benchmark-runner/tests/test-overload-0/stdin.txt b/diskann-benchmark-runner/tests/benchmark/test-overload-0/stdin.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-overload-0/stdin.txt rename to diskann-benchmark-runner/tests/benchmark/test-overload-0/stdin.txt diff --git a/diskann-benchmark-runner/tests/test-overload-0/stdout.txt b/diskann-benchmark-runner/tests/benchmark/test-overload-0/stdout.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-overload-0/stdout.txt rename to diskann-benchmark-runner/tests/benchmark/test-overload-0/stdout.txt diff --git a/diskann-benchmark-runner/tests/test-success-0/README.md b/diskann-benchmark-runner/tests/benchmark/test-success-0/README.md similarity index 100% rename from diskann-benchmark-runner/tests/test-success-0/README.md rename to diskann-benchmark-runner/tests/benchmark/test-success-0/README.md diff --git a/diskann-benchmark-runner/tests/test-success-0/input.json b/diskann-benchmark-runner/tests/benchmark/test-success-0/input.json similarity index 100% rename from diskann-benchmark-runner/tests/test-success-0/input.json rename to diskann-benchmark-runner/tests/benchmark/test-success-0/input.json diff --git a/diskann-benchmark-runner/tests/test-success-0/output.json b/diskann-benchmark-runner/tests/benchmark/test-success-0/output.json similarity index 100% rename from diskann-benchmark-runner/tests/test-success-0/output.json rename to diskann-benchmark-runner/tests/benchmark/test-success-0/output.json diff --git a/diskann-benchmark-runner/tests/test-success-0/stdin.txt b/diskann-benchmark-runner/tests/benchmark/test-success-0/stdin.txt similarity index 100% rename from 
diskann-benchmark-runner/tests/test-success-0/stdin.txt rename to diskann-benchmark-runner/tests/benchmark/test-success-0/stdin.txt diff --git a/diskann-benchmark-runner/tests/test-success-0/stdout.txt b/diskann-benchmark-runner/tests/benchmark/test-success-0/stdout.txt similarity index 94% rename from diskann-benchmark-runner/tests/test-success-0/stdout.txt rename to diskann-benchmark-runner/tests/benchmark/test-success-0/stdout.txt index fca27c0ad..ecc7b4aff 100644 --- a/diskann-benchmark-runner/tests/test-success-0/stdout.txt +++ b/diskann-benchmark-runner/tests/benchmark/test-success-0/stdout.txt @@ -2,7 +2,7 @@ # Running Job 1 of 4 # ###################### -dim bench: None +simple bench: None ###################### # Running Job 2 of 4 # diff --git a/diskann-benchmark-runner/tests/test-success-1/README.md b/diskann-benchmark-runner/tests/benchmark/test-success-1/README.md similarity index 100% rename from diskann-benchmark-runner/tests/test-success-1/README.md rename to diskann-benchmark-runner/tests/benchmark/test-success-1/README.md diff --git a/diskann-benchmark-runner/tests/test-success-1/input.json b/diskann-benchmark-runner/tests/benchmark/test-success-1/input.json similarity index 100% rename from diskann-benchmark-runner/tests/test-success-1/input.json rename to diskann-benchmark-runner/tests/benchmark/test-success-1/input.json diff --git a/diskann-benchmark-runner/tests/test-success-1/stdin.txt b/diskann-benchmark-runner/tests/benchmark/test-success-1/stdin.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-success-1/stdin.txt rename to diskann-benchmark-runner/tests/benchmark/test-success-1/stdin.txt diff --git a/diskann-benchmark-runner/tests/test-success-1/stdout.txt b/diskann-benchmark-runner/tests/benchmark/test-success-1/stdout.txt similarity index 100% rename from diskann-benchmark-runner/tests/test-success-1/stdout.txt rename to diskann-benchmark-runner/tests/benchmark/test-success-1/stdout.txt diff --git 
a/diskann-benchmark-runner/tests/regression/check-run-error-0/README.md b/diskann-benchmark-runner/tests/regression/check-run-error-0/README.md new file mode 100644 index 000000000..023a5096b --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-0/README.md @@ -0,0 +1,5 @@ +In this test - we verify that if a check fails with an error, we + +- Only print the errors for the failing tests (to avoid spamming stdout) +- Record which of the entries failed (there can be multiple) +- Verify that errors get recorded in the output `checks.json` file. diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-0/checks.json b/diskann-benchmark-runner/tests/regression/check-run-error-0/checks.json new file mode 100644 index 000000000..a0819f280 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-0/checks.json @@ -0,0 +1,25 @@ +[ + { + "status": "error", + "tolerance": { + "error_in_check": true, + "succeed": true + }, + "error": "simulated check error" + }, + { + "status": "pass", + "tolerance": { + "error_when_checked": false + }, + "result": "int8" + }, + { + "status": "error", + "tolerance": { + "error_in_check": true, + "succeed": true + }, + "error": "simulated check error" + } +] \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-0/input.json b/diskann-benchmark-runner/tests/regression/check-run-error-0/input.json new file mode 100644 index 000000000..1ca140edb --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-0/input.json @@ -0,0 +1,26 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-dim", + "content": { + "dim": 128 + } + }, + { + "type": "test-input-types", + "content": { + "data_type": "int8", + "dim": 128, + "error_when_checked": false + } + }, + { + "type": "test-input-dim", + "content": { + "dim": 256 + } + } + ] +} diff --git 
a/diskann-benchmark-runner/tests/regression/check-run-error-0/output.json b/diskann-benchmark-runner/tests/regression/check-run-error-0/output.json new file mode 100644 index 000000000..9bad43329 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-0/output.json @@ -0,0 +1,31 @@ +[ + { + "input": { + "content": { + "dim": 128 + }, + "type": "test-input-dim" + }, + "results": 128 + }, + { + "input": { + "content": { + "data_type": "int8", + "dim": 128, + "error_when_checked": false + }, + "type": "test-input-types" + }, + "results": "int8" + }, + { + "input": { + "content": { + "dim": 256 + }, + "type": "test-input-dim" + }, + "results": 256 + } +] \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-0/stdin.txt b/diskann-benchmark-runner/tests/regression/check-run-error-0/stdin.txt new file mode 100644 index 000000000..b94f63553 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-0/stdin.txt @@ -0,0 +1,5 @@ +# Setup: generate the output file from the benchmark inputs. +run --input-file $INPUT --output-file $OUTPUT + +# Test: the dim tolerance has error_in_check=true, triggering an infrastructure error. 
+check run --tolerances $TOLERANCES --input-file $INPUT --before $OUTPUT --after $OUTPUT --output-file $CHECK_OUTPUT diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-0/stdout.txt b/diskann-benchmark-runner/tests/regression/check-run-error-0/stdout.txt new file mode 100644 index 000000000..9e606b9bc --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-0/stdout.txt @@ -0,0 +1,7 @@ +Check 1 of 3 ("dim-bench") encountered an error: +simulated check error + +Check 3 of 3 ("dim-bench") encountered an error: +simulated check error + +one or more checks failed with errors \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-0/tolerances.json b/diskann-benchmark-runner/tests/regression/check-run-error-0/tolerances.json new file mode 100644 index 000000000..c659f2eb5 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-0/tolerances.json @@ -0,0 +1,31 @@ +{ + "checks": [ + { + "input": { + "type": "test-input-dim", + "content": {} + }, + "tolerance": { + "type": "test-input-dim-tolerance", + "content": { + "succeed": true, + "error_in_check": true + } + } + }, + { + "input": { + "type": "test-input-types", + "content": { + "data_type": "int8" + } + }, + "tolerance": { + "type": "test-input-types-tolerance", + "content": { + "error_when_checked": false + } + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-1/README.md b/diskann-benchmark-runner/tests/regression/check-run-error-1/README.md new file mode 100644 index 000000000..f8c7659be --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-1/README.md @@ -0,0 +1,5 @@ +This test is a little complicated. What we do is use `input.json` to generate an `output.json`. +Then during the check run, we use a different `regression_input.json`. +This second file has a different number of entries. + +The test then verifies that we properly reject this situation.
diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-1/input.json b/diskann-benchmark-runner/tests/regression/check-run-error-1/input.json new file mode 100644 index 000000000..6ab0e3c81 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-1/input.json @@ -0,0 +1,18 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-dim", + "content": { + "dim": 128 + } + }, + { + "type": "test-input-dim", + "content": { + "dim": 512 + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-1/output.json b/diskann-benchmark-runner/tests/regression/check-run-error-1/output.json new file mode 100644 index 000000000..7fb4c50ec --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-1/output.json @@ -0,0 +1,20 @@ +[ + { + "input": { + "content": { + "dim": 128 + }, + "type": "test-input-dim" + }, + "results": 128 + }, + { + "input": { + "content": { + "dim": 512 + }, + "type": "test-input-dim" + }, + "results": 512 + } +] \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-1/regression_input.json b/diskann-benchmark-runner/tests/regression/check-run-error-1/regression_input.json new file mode 100644 index 000000000..2461536e2 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-1/regression_input.json @@ -0,0 +1,12 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-dim", + "content": { + "dim": 128 + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-1/stdin.txt b/diskann-benchmark-runner/tests/regression/check-run-error-1/stdin.txt new file mode 100644 index 000000000..72a30bc83 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-1/stdin.txt @@ -0,0 +1,5 @@ +# Setup: generate output from the 2-job input file. 
+run --input-file $INPUT --output-file $OUTPUT + +# Test: check run uses $REGRESSION_INPUT (1 job) but $OUTPUT has 2 entries → length mismatch. +check run --tolerances $TOLERANCES --input-file $REGRESSION_INPUT --before $OUTPUT --after $OUTPUT diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-1/stdout.txt b/diskann-benchmark-runner/tests/regression/check-run-error-1/stdout.txt new file mode 100644 index 000000000..58911fb45 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-1/stdout.txt @@ -0,0 +1 @@ +"before" file "$TEMPDIR/output.json" has 2 entries but expected 1 \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-1/tolerances.json b/diskann-benchmark-runner/tests/regression/check-run-error-1/tolerances.json new file mode 100644 index 000000000..325dc038c --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-1/tolerances.json @@ -0,0 +1,19 @@ +{ + "checks": [ + { + "input": { + "type": "test-input-dim", + "content": { + "dim": 128 + } + }, + "tolerance": { + "type": "test-input-dim-tolerance", + "content": { + "succeed": true, + "error_in_check": false + } + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-2/README.md b/diskann-benchmark-runner/tests/regression/check-run-error-2/README.md new file mode 100644 index 000000000..58f30f6bc --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-2/README.md @@ -0,0 +1,5 @@ +This test covers input drift after output generation. + +The tolerance/input tags are still compatible, but the regression input has changed to +`float64`, which no registered regression benchmark supports. `check run` should fail with +an explicit "no matching regression benchmark" diagnostic instead of panicking. 
diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-2/input.json b/diskann-benchmark-runner/tests/regression/check-run-error-2/input.json new file mode 100644 index 000000000..2637c1a73 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-2/input.json @@ -0,0 +1,14 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-types", + "content": { + "data_type": "float32", + "dim": 128, + "error_when_checked": false + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-2/output.json b/diskann-benchmark-runner/tests/regression/check-run-error-2/output.json new file mode 100644 index 000000000..869ae3cc8 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-2/output.json @@ -0,0 +1,13 @@ +[ + { + "input": { + "content": { + "data_type": "float32", + "dim": 128, + "error_when_checked": false + }, + "type": "test-input-types" + }, + "results": "float32" + } +] \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-2/regression_input.json b/diskann-benchmark-runner/tests/regression/check-run-error-2/regression_input.json new file mode 100644 index 000000000..fccec5639 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-2/regression_input.json @@ -0,0 +1,14 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-types", + "content": { + "data_type": "float64", + "dim": 128, + "error_when_checked": false + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-2/stdin.txt b/diskann-benchmark-runner/tests/regression/check-run-error-2/stdin.txt new file mode 100644 index 000000000..57d237fa7 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-2/stdin.txt @@ -0,0 +1,5 @@ +# Setup: generate output from a valid regression-capable float32 input. 
+run --input-file $INPUT --output-file $OUTPUT + +# Test: the regression input has drifted to float64, so no registered regression benchmark matches it. +check run --tolerances $TOLERANCES --input-file $REGRESSION_INPUT --before $OUTPUT --after $OUTPUT diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-2/stdout.txt b/diskann-benchmark-runner/tests/regression/check-run-error-2/stdout.txt new file mode 100644 index 000000000..d44c221c0 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-2/stdout.txt @@ -0,0 +1 @@ +Could not match input tag "test-input-types" and tolerance tag "test-input-types-tolerance" to a valid benchmark. This likely means file or code changes between when the input file was last used. If the normal benchmark flow succeeds, please report this issue. \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-2/tolerances.json b/diskann-benchmark-runner/tests/regression/check-run-error-2/tolerances.json new file mode 100644 index 000000000..e7ff6930a --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-2/tolerances.json @@ -0,0 +1,16 @@ +{ + "checks": [ + { + "input": { + "type": "test-input-types", + "content": {} + }, + "tolerance": { + "type": "test-input-types-tolerance", + "content": { + "error_when_checked": false + } + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-3/README.md b/diskann-benchmark-runner/tests/regression/check-run-error-3/README.md new file mode 100644 index 000000000..26b72cec1 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-3/README.md @@ -0,0 +1,6 @@ +This test covers before/after schema drift with matching entry counts. + +The setup output is generated from a `dim` benchmark, producing integer results. The +regression check is then run against a `test-input-types` benchmark, which expects string +results. 
The check should report a structured deserialization error and still write +`checks.json`. diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-3/checks.json b/diskann-benchmark-runner/tests/regression/check-run-error-3/checks.json new file mode 100644 index 000000000..c14e9835b --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-3/checks.json @@ -0,0 +1,9 @@ +[ + { + "status": "error", + "tolerance": { + "error_when_checked": false + }, + "error": "the \"before\" results do not match the output schema expected by this benchmark: invalid type: integer `128`, expected a string" + } +] \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-3/input.json b/diskann-benchmark-runner/tests/regression/check-run-error-3/input.json new file mode 100644 index 000000000..2461536e2 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-3/input.json @@ -0,0 +1,12 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-dim", + "content": { + "dim": 128 + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-3/output.json b/diskann-benchmark-runner/tests/regression/check-run-error-3/output.json new file mode 100644 index 000000000..84c2a38e1 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-3/output.json @@ -0,0 +1,11 @@ +[ + { + "input": { + "content": { + "dim": 128 + }, + "type": "test-input-dim" + }, + "results": 128 + } +] \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-3/regression_input.json b/diskann-benchmark-runner/tests/regression/check-run-error-3/regression_input.json new file mode 100644 index 000000000..2637c1a73 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-3/regression_input.json @@ -0,0 +1,14 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + 
"type": "test-input-types", + "content": { + "data_type": "float32", + "dim": 128, + "error_when_checked": false + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-3/stdin.txt b/diskann-benchmark-runner/tests/regression/check-run-error-3/stdin.txt new file mode 100644 index 000000000..7f5960c84 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-3/stdin.txt @@ -0,0 +1,5 @@ +# Setup: generate output with integer results from the dim benchmark. +run --input-file $INPUT --output-file $OUTPUT + +# Test: reuse that output for a type benchmark, so the result payload shape is wrong. +check run --tolerances $TOLERANCES --input-file $REGRESSION_INPUT --before $OUTPUT --after $OUTPUT --output-file $CHECK_OUTPUT diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-3/stdout.txt b/diskann-benchmark-runner/tests/regression/check-run-error-3/stdout.txt new file mode 100644 index 000000000..6b12351ae --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-3/stdout.txt @@ -0,0 +1,7 @@ +Check 1 of 1 ("type-bench-f32") encountered an error: +the "before" results do not match the output schema expected by this benchmark + +Caused by: + invalid type: integer `128`, expected a string + +one or more checks failed with errors \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-error-3/tolerances.json b/diskann-benchmark-runner/tests/regression/check-run-error-3/tolerances.json new file mode 100644 index 000000000..e2b8f704e --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-error-3/tolerances.json @@ -0,0 +1,18 @@ +{ + "checks": [ + { + "input": { + "type": "test-input-types", + "content": { + "data_type": "float32" + } + }, + "tolerance": { + "type": "test-input-types-tolerance", + "content": { + "error_when_checked": false + } + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-fail-0/README.md 
b/diskann-benchmark-runner/tests/regression/check-run-fail-0/README.md new file mode 100644 index 000000000..11b617a9a --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-fail-0/README.md @@ -0,0 +1,2 @@ +In this test, we check that if a check fails, only the failing output(s) are printed. +But even if there is a failure, we still generate the `checks.json` output file. diff --git a/diskann-benchmark-runner/tests/regression/check-run-fail-0/checks.json b/diskann-benchmark-runner/tests/regression/check-run-fail-0/checks.json new file mode 100644 index 000000000..181de2f8c --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-fail-0/checks.json @@ -0,0 +1,17 @@ +[ + { + "status": "fail", + "tolerance": { + "error_in_check": false, + "succeed": false + }, + "result": "we didn't do it!" + }, + { + "status": "pass", + "tolerance": { + "error_when_checked": false + }, + "result": "int8" + } +] \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-fail-0/input.json b/diskann-benchmark-runner/tests/regression/check-run-fail-0/input.json new file mode 100644 index 000000000..087444d2b --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-fail-0/input.json @@ -0,0 +1,20 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-dim", + "content": { + "dim": 128 + } + }, + { + "type": "test-input-types", + "content": { + "data_type": "int8", + "dim": 128, + "error_when_checked": false + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-fail-0/output.json b/diskann-benchmark-runner/tests/regression/check-run-fail-0/output.json new file mode 100644 index 000000000..5112717a9 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-fail-0/output.json @@ -0,0 +1,22 @@ +[ + { + "input": { + "content": { + "dim": 128 + }, + "type": "test-input-dim" + }, + "results": 128 + }, + { + "input": { + "content": {
+ "data_type": "int8", + "dim": 128, + "error_when_checked": false + }, + "type": "test-input-types" + }, + "results": "int8" + } +] \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-fail-0/stdin.txt b/diskann-benchmark-runner/tests/regression/check-run-fail-0/stdin.txt new file mode 100644 index 000000000..fe474b7c2 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-fail-0/stdin.txt @@ -0,0 +1,5 @@ +# Setup: generate the output file from the benchmark inputs. +run --input-file $INPUT --output-file $OUTPUT + +# Test: run regression checks — the dim tolerance has succeed=false so it should fail. +check run --tolerances $TOLERANCES --input-file $INPUT --before $OUTPUT --after $OUTPUT --output-file $CHECK_OUTPUT diff --git a/diskann-benchmark-runner/tests/regression/check-run-fail-0/stdout.txt b/diskann-benchmark-runner/tests/regression/check-run-fail-0/stdout.txt new file mode 100644 index 000000000..36247a654 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-fail-0/stdout.txt @@ -0,0 +1,4 @@ +Check 1 of 2 ("dim-bench") FAILED: +we didn't do it! 
+ +one or more regression checks failed \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-fail-0/tolerances.json b/diskann-benchmark-runner/tests/regression/check-run-fail-0/tolerances.json new file mode 100644 index 000000000..a3eeff14c --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-fail-0/tolerances.json @@ -0,0 +1,33 @@ +{ + "checks": [ + { + "input": { + "type": "test-input-dim", + "content": { + "dim": 128 + } + }, + "tolerance": { + "type": "test-input-dim-tolerance", + "content": { + "succeed": false, + "error_in_check": false + } + } + }, + { + "input": { + "type": "test-input-types", + "content": { + "data_type": "int8" + } + }, + "tolerance": { + "type": "test-input-types-tolerance", + "content": { + "error_when_checked": false + } + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-pass-0/README.md b/diskann-benchmark-runner/tests/regression/check-run-pass-0/README.md new file mode 100644 index 000000000..be9ed2938 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-pass-0/README.md @@ -0,0 +1,9 @@ +There are several things being tested here. + +First, we check that the "test-input-dim-tolerance" can match multiple instances of the +"test-input-dim" input. + +Next, we have two tests for "test-input-types" - one of which is more specific than the other. +This tests the situation where multiple regression tests have the same input and tolerance file types, but we still need to run matching on the input to find the best one. + +Finally, this tests the printing and output generation of success. 
diff --git a/diskann-benchmark-runner/tests/regression/check-run-pass-0/checks.json b/diskann-benchmark-runner/tests/regression/check-run-pass-0/checks.json new file mode 100644 index 000000000..d66eda403 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-pass-0/checks.json @@ -0,0 +1,32 @@ +[ + { + "status": "pass", + "tolerance": { + "error_in_check": false, + "succeed": true + }, + "result": "we did it!" + }, + { + "status": "pass", + "tolerance": { + "error_when_checked": false + }, + "result": "exact match dim=1000 type=float32" + }, + { + "status": "pass", + "tolerance": { + "error_when_checked": false + }, + "result": "int8" + }, + { + "status": "pass", + "tolerance": { + "error_in_check": false, + "succeed": true + }, + "result": "we did it!" + } +] \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-pass-0/input.json b/diskann-benchmark-runner/tests/regression/check-run-pass-0/input.json new file mode 100644 index 000000000..2be7ad05d --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-pass-0/input.json @@ -0,0 +1,34 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-dim", + "content": { + "dim": 128 + } + }, + { + "type": "test-input-types", + "content": { + "data_type": "float32", + "dim": 1000, + "error_when_checked": false + } + }, + { + "type": "test-input-types", + "content": { + "data_type": "int8", + "dim": 128, + "error_when_checked": false + } + }, + { + "type": "test-input-dim", + "content": { + "dim": 512 + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-run-pass-0/output.json b/diskann-benchmark-runner/tests/regression/check-run-pass-0/output.json new file mode 100644 index 000000000..5386e9dee --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-pass-0/output.json @@ -0,0 +1,42 @@ +[ + { + "input": { + "content": { + "dim": 128 + }, + "type": "test-input-dim" + }, + 
"results": 128 + }, + { + "input": { + "content": { + "data_type": "float32", + "dim": 1000, + "error_when_checked": false + }, + "type": "test-input-types" + }, + "results": "hello<1000>: float32" + }, + { + "input": { + "content": { + "data_type": "int8", + "dim": 128, + "error_when_checked": false + }, + "type": "test-input-types" + }, + "results": "int8" + }, + { + "input": { + "content": { + "dim": 512 + }, + "type": "test-input-dim" + }, + "results": 512 + } +] \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-pass-0/stdin.txt b/diskann-benchmark-runner/tests/regression/check-run-pass-0/stdin.txt new file mode 100644 index 000000000..3712e2ba8 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-pass-0/stdin.txt @@ -0,0 +1,5 @@ +# Setup: generate the output file from the benchmark inputs. +run --input-file $INPUT --output-file $OUTPUT + +# Test: run regression checks using the generated output as both before and after. +check run --tolerances $TOLERANCES --input-file $INPUT --before $OUTPUT --after $OUTPUT --output-file $CHECK_OUTPUT diff --git a/diskann-benchmark-runner/tests/regression/check-run-pass-0/stdout.txt b/diskann-benchmark-runner/tests/regression/check-run-pass-0/stdout.txt new file mode 100644 index 000000000..6ccd9b38b --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-pass-0/stdout.txt @@ -0,0 +1,11 @@ +Check 1 of 4 ("dim-bench") PASSED: +we did it! + +Check 2 of 4 ("exact-type-bench-f32-1000") PASSED: +exact match dim=1000 type=float32 + +Check 3 of 4 ("type-bench-i8") PASSED: +int8 + +Check 4 of 4 ("dim-bench") PASSED: +we did it! 
\ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-run-pass-0/tolerances.json b/diskann-benchmark-runner/tests/regression/check-run-pass-0/tolerances.json new file mode 100644 index 000000000..82fd4aa78 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-run-pass-0/tolerances.json @@ -0,0 +1,46 @@ +{ + "checks": [ + { + "input": { + "type": "test-input-dim", + "content": {} + }, + "tolerance": { + "type": "test-input-dim-tolerance", + "content": { + "succeed": true, + "error_in_check": false + } + } + }, + { + "input": { + "type": "test-input-types", + "content": { + "data_type": "float32", + "dim": 1000 + } + }, + "tolerance": { + "type": "test-input-types-tolerance", + "content": { + "error_when_checked": false + } + } + }, + { + "input": { + "type": "test-input-types", + "content": { + "data_type": "int8" + } + }, + "tolerance": { + "type": "test-input-types-tolerance", + "content": { + "error_when_checked": false + } + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-skeleton-0/README.md b/diskann-benchmark-runner/tests/regression/check-skeleton-0/README.md new file mode 100644 index 000000000..1df588bd2 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-skeleton-0/README.md @@ -0,0 +1 @@ +Test simple skeleton file printing. 
diff --git a/diskann-benchmark-runner/tests/regression/check-skeleton-0/stdin.txt b/diskann-benchmark-runner/tests/regression/check-skeleton-0/stdin.txt new file mode 100644 index 000000000..2b38fb36e --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-skeleton-0/stdin.txt @@ -0,0 +1 @@ +check skeleton diff --git a/diskann-benchmark-runner/tests/regression/check-skeleton-0/stdout.txt b/diskann-benchmark-runner/tests/regression/check-skeleton-0/stdout.txt new file mode 100644 index 000000000..e4ba0a79a --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-skeleton-0/stdout.txt @@ -0,0 +1,11 @@ +Skeleton tolerance file. + +Each tolerance is paired with an input that is structurally +matched with an entry in the corresponding `--input-file`. + +This allow a single tolerance entry to be applied to multiple +benchmark runs as long as this structural mapping is unambiguous. + +{ + "checks": [] +} \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-tolerances-0/README.md b/diskann-benchmark-runner/tests/regression/check-tolerances-0/README.md new file mode 100644 index 000000000..73bcfc486 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-tolerances-0/README.md @@ -0,0 +1 @@ +Test printing of all registered tolerances. 
diff --git a/diskann-benchmark-runner/tests/regression/check-tolerances-0/stdin.txt b/diskann-benchmark-runner/tests/regression/check-tolerances-0/stdin.txt new file mode 100644 index 000000000..c3feba3e9 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-tolerances-0/stdin.txt @@ -0,0 +1 @@ +check tolerances diff --git a/diskann-benchmark-runner/tests/regression/check-tolerances-0/stdout.txt b/diskann-benchmark-runner/tests/regression/check-tolerances-0/stdout.txt new file mode 100644 index 000000000..f090d1dc6 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-tolerances-0/stdout.txt @@ -0,0 +1,7 @@ +Available tolerance kinds are listed below. + test-input-dim-tolerance + - "test-input-dim" => "dim-bench" + test-input-types-tolerance + - "test-input-types" => "type-bench-f32" + - "test-input-types" => "type-bench-i8" + - "test-input-types" => "exact-type-bench-f32-1000" \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-tolerances-1/README.md b/diskann-benchmark-runner/tests/regression/check-tolerances-1/README.md new file mode 100644 index 000000000..4ae0d9f7b --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-tolerances-1/README.md @@ -0,0 +1 @@ +Checking the tolerance of a registered type prints its JSON representation. 
diff --git a/diskann-benchmark-runner/tests/regression/check-tolerances-1/stdin.txt b/diskann-benchmark-runner/tests/regression/check-tolerances-1/stdin.txt new file mode 100644 index 000000000..880a11299 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-tolerances-1/stdin.txt @@ -0,0 +1 @@ +check tolerances test-input-types-tolerance diff --git a/diskann-benchmark-runner/tests/regression/check-tolerances-1/stdout.txt b/diskann-benchmark-runner/tests/regression/check-tolerances-1/stdout.txt new file mode 100644 index 000000000..017365a8a --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-tolerances-1/stdout.txt @@ -0,0 +1,16 @@ +The example JSON representation for "test-input-types-tolerance" is shown below. +Populate the "input" field with a compatible benchmark input. +Matching will be performed by partial structural map on the input. + +{ + "input": { + "type": "", + "content": {} + }, + "tolerance": { + "type": "test-input-types-tolerance", + "content": { + "error_when_checked": false + } + } +} \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-tolerances-2/README.md b/diskann-benchmark-runner/tests/regression/check-tolerances-2/README.md new file mode 100644 index 000000000..66d8f8743 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-tolerances-2/README.md @@ -0,0 +1 @@ +Error message when requesting a tolerance input that does not exist. 
diff --git a/diskann-benchmark-runner/tests/regression/check-tolerances-2/stdin.txt b/diskann-benchmark-runner/tests/regression/check-tolerances-2/stdin.txt new file mode 100644 index 000000000..1c8f6d9b5 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-tolerances-2/stdin.txt @@ -0,0 +1 @@ +check tolerances nonexistent-tolerance diff --git a/diskann-benchmark-runner/tests/regression/check-tolerances-2/stdout.txt b/diskann-benchmark-runner/tests/regression/check-tolerances-2/stdout.txt new file mode 100644 index 000000000..3c12a8fc8 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-tolerances-2/stdout.txt @@ -0,0 +1 @@ +No tolerance input found for "nonexistent-tolerance" \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-verify-0/README.md b/diskann-benchmark-runner/tests/regression/check-verify-0/README.md new file mode 100644 index 000000000..62dc39df2 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-0/README.md @@ -0,0 +1 @@ +Check that verification doesn't print anything on success. 
diff --git a/diskann-benchmark-runner/tests/regression/check-verify-0/input.json b/diskann-benchmark-runner/tests/regression/check-verify-0/input.json new file mode 100644 index 000000000..9577db08f --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-0/input.json @@ -0,0 +1,28 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-dim", + "content": { + "dim": 128 + } + }, + { + "type": "test-input-types", + "content": { + "data_type": "float32", + "dim": 1000, + "error_when_checked": false + } + }, + { + "type": "test-input-types", + "content": { + "data_type": "int8", + "dim": 128, + "error_when_checked": false + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-verify-0/stdin.txt b/diskann-benchmark-runner/tests/regression/check-verify-0/stdin.txt new file mode 100644 index 000000000..8d79980a6 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-0/stdin.txt @@ -0,0 +1 @@ +check verify --tolerances $TOLERANCES --input-file $INPUT diff --git a/diskann-benchmark-runner/tests/regression/check-verify-0/stdout.txt b/diskann-benchmark-runner/tests/regression/check-verify-0/stdout.txt new file mode 100644 index 000000000..e69de29bb diff --git a/diskann-benchmark-runner/tests/regression/check-verify-0/tolerances.json b/diskann-benchmark-runner/tests/regression/check-verify-0/tolerances.json new file mode 100644 index 000000000..65e7092bd --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-0/tolerances.json @@ -0,0 +1,48 @@ +{ + "checks": [ + { + "input": { + "type": "test-input-dim", + "content": { + "dim": 128 + } + }, + "tolerance": { + "type": "test-input-dim-tolerance", + "content": { + "succeed": true, + "error_in_check": false + } + } + }, + { + "input": { + "type": "test-input-types", + "content": { + "data_type": "float32", + "dim": 1000 + } + }, + "tolerance": { + "type": "test-input-types-tolerance", + "content": { + 
"error_when_checked": false + } + } + }, + { + "input": { + "type": "test-input-types", + "content": { + "data_type": "int8" + } + }, + "tolerance": { + "type": "test-input-types-tolerance", + "content": { + "error_when_checked": false + } + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-verify-1/README.md b/diskann-benchmark-runner/tests/regression/check-verify-1/README.md new file mode 100644 index 000000000..6c2e4a4ad --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-1/README.md @@ -0,0 +1 @@ +Error when attempting to parse an tolerance that is not registered. diff --git a/diskann-benchmark-runner/tests/regression/check-verify-1/input.json b/diskann-benchmark-runner/tests/regression/check-verify-1/input.json new file mode 100644 index 000000000..2461536e2 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-1/input.json @@ -0,0 +1,12 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-dim", + "content": { + "dim": 128 + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-verify-1/stdin.txt b/diskann-benchmark-runner/tests/regression/check-verify-1/stdin.txt new file mode 100644 index 000000000..8d79980a6 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-1/stdin.txt @@ -0,0 +1 @@ +check verify --tolerances $TOLERANCES --input-file $INPUT diff --git a/diskann-benchmark-runner/tests/regression/check-verify-1/stdout.txt b/diskann-benchmark-runner/tests/regression/check-verify-1/stdout.txt new file mode 100644 index 000000000..bf62ca0de --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-1/stdout.txt @@ -0,0 +1,4 @@ +while processing tolerance input 1 of 1 + +Caused by: + Unrecognized tolerance tag: "nonexistent-tolerance" \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-verify-1/tolerances.json 
b/diskann-benchmark-runner/tests/regression/check-verify-1/tolerances.json new file mode 100644 index 000000000..e46b1409b --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-1/tolerances.json @@ -0,0 +1,16 @@ +{ + "checks": [ + { + "input": { + "type": "test-input-dim", + "content": { + "dim": 128 + } + }, + "tolerance": { + "type": "nonexistent-tolerance", + "content": {} + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-verify-2/README.md b/diskann-benchmark-runner/tests/regression/check-verify-2/README.md new file mode 100644 index 000000000..7b8dfda1a --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-2/README.md @@ -0,0 +1,2 @@ +This test exercises a scenario where the `input.json` is plausible (matches successfully with +a tolerance), but it doesn't actually match any backend benchmark. diff --git a/diskann-benchmark-runner/tests/regression/check-verify-2/input.json b/diskann-benchmark-runner/tests/regression/check-verify-2/input.json new file mode 100644 index 000000000..d02980026 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-2/input.json @@ -0,0 +1,14 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-types", + "content": { + "data_type": "float16", + "dim": 128, + "error_when_checked": false + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-verify-2/stdin.txt b/diskann-benchmark-runner/tests/regression/check-verify-2/stdin.txt new file mode 100644 index 000000000..3c3ea9a56 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-2/stdin.txt @@ -0,0 +1,3 @@ +# The tolerance needle matches the float16 input via is_subset, but no registered +# regression benchmark accepts float16 — so benchmark dispatch fails. 
+check verify --tolerances $TOLERANCES --input-file $INPUT diff --git a/diskann-benchmark-runner/tests/regression/check-verify-2/stdout.txt b/diskann-benchmark-runner/tests/regression/check-verify-2/stdout.txt new file mode 100644 index 000000000..d44c221c0 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-2/stdout.txt @@ -0,0 +1 @@ +Could not match input tag "test-input-types" and tolerance tag "test-input-types-tolerance" to a valid benchmark. This likely means file or code changes between when the input file was last used. If the normal benchmark flow succeeds, please report this issue. \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-verify-2/tolerances.json b/diskann-benchmark-runner/tests/regression/check-verify-2/tolerances.json new file mode 100644 index 000000000..e7ff6930a --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-2/tolerances.json @@ -0,0 +1,16 @@ +{ + "checks": [ + { + "input": { + "type": "test-input-types", + "content": {} + }, + "tolerance": { + "type": "test-input-types-tolerance", + "content": { + "error_when_checked": false + } + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-verify-3/input.json b/diskann-benchmark-runner/tests/regression/check-verify-3/input.json new file mode 100644 index 000000000..80d764186 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-3/input.json @@ -0,0 +1,26 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-dim", + "content": { + "dim": 128 + } + }, + { + "type": "test-input-types", + "content": { + "data_type": "int8", + "dim": 128, + "error_when_checked": false + } + }, + { + "type": "test-input-dim", + "content": { + "dim": 512 + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-verify-3/stdin.txt b/diskann-benchmark-runner/tests/regression/check-verify-3/stdin.txt new file mode 100644 index 
000000000..94a9456ca --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-3/stdin.txt @@ -0,0 +1,5 @@ +# All three MatchProblem variants in one test: +# - Tolerance 1 (dim=9999) matches no inputs → OrphanedTolerance +# - Input 2 (int8) has no dim tolerance → UncoveredInput +# - Inputs 1 and 3 (dim) match tolerances 2 and 3 → AmbiguousInput +check verify --tolerances $TOLERANCES --input-file $INPUT diff --git a/diskann-benchmark-runner/tests/regression/check-verify-3/stdout.txt b/diskann-benchmark-runner/tests/regression/check-verify-3/stdout.txt new file mode 100644 index 000000000..2efd1f725 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-3/stdout.txt @@ -0,0 +1,5 @@ +tolerance matching failed: + tolerance 1 matched no inputs + input 1 matched tolerances 2, 3 + input 2 matched no tolerances + input 3 matched tolerances 2, 3 \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-verify-3/tolerances.json b/diskann-benchmark-runner/tests/regression/check-verify-3/tolerances.json new file mode 100644 index 000000000..4b6577a09 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-3/tolerances.json @@ -0,0 +1,45 @@ +{ + "checks": [ + { + "input": { + "type": "test-input-dim", + "content": { + "dim": 9999 + } + }, + "tolerance": { + "type": "test-input-dim-tolerance", + "content": { + "succeed": true, + "error_in_check": false + } + } + }, + { + "input": { + "type": "test-input-dim", + "content": {} + }, + "tolerance": { + "type": "test-input-dim-tolerance", + "content": { + "succeed": true, + "error_in_check": false + } + } + }, + { + "input": { + "type": "test-input-dim", + "content": {} + }, + "tolerance": { + "type": "test-input-dim-tolerance", + "content": { + "succeed": true, + "error_in_check": false + } + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-verify-4/README.md 
b/diskann-benchmark-runner/tests/regression/check-verify-4/README.md new file mode 100644 index 000000000..f9ad4be44 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-4/README.md @@ -0,0 +1,2 @@ +Here, we test that if a valid "tolerance" is linked with a valid "input" in "tolerances.json" +but there is no registered benchmark where these two tags are linked, we get a reasonable error. diff --git a/diskann-benchmark-runner/tests/regression/check-verify-4/input.json b/diskann-benchmark-runner/tests/regression/check-verify-4/input.json new file mode 100644 index 000000000..2461536e2 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-4/input.json @@ -0,0 +1,12 @@ +{ + "search_directories": [], + "output_directory": null, + "jobs": [ + { + "type": "test-input-dim", + "content": { + "dim": 128 + } + } + ] +} diff --git a/diskann-benchmark-runner/tests/regression/check-verify-4/stdin.txt b/diskann-benchmark-runner/tests/regression/check-verify-4/stdin.txt new file mode 100644 index 000000000..b41ed7067 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-4/stdin.txt @@ -0,0 +1,3 @@ +# The tolerance is test-input-dim-tolerance but the input tag is test-input-types. +# These are incompatible — resolve() should reject this with a clear error. +check verify --tolerances $TOLERANCES --input-file $INPUT diff --git a/diskann-benchmark-runner/tests/regression/check-verify-4/stdout.txt b/diskann-benchmark-runner/tests/regression/check-verify-4/stdout.txt new file mode 100644 index 000000000..d496bcedb --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-4/stdout.txt @@ -0,0 +1,4 @@ +while processing tolerance input 1 of 1 + +Caused by: + input tag "test-input-types" is not compatible with tolerance tag "test-input-dim-tolerance". 
Valid input tags are: ["test-input-dim"] \ No newline at end of file diff --git a/diskann-benchmark-runner/tests/regression/check-verify-4/tolerances.json b/diskann-benchmark-runner/tests/regression/check-verify-4/tolerances.json new file mode 100644 index 000000000..995bd9848 --- /dev/null +++ b/diskann-benchmark-runner/tests/regression/check-verify-4/tolerances.json @@ -0,0 +1,17 @@ +{ + "checks": [ + { + "input": { + "type": "test-input-types", + "content": {} + }, + "tolerance": { + "type": "test-input-dim-tolerance", + "content": { + "succeed": true, + "error_in_check": false + } + } + } + ] +} diff --git a/diskann-benchmark-simd/examples/tolerance.json b/diskann-benchmark-simd/examples/tolerance.json new file mode 100644 index 000000000..7859e8760 --- /dev/null +++ b/diskann-benchmark-simd/examples/tolerance.json @@ -0,0 +1,16 @@ +{ + "checks": [ + { + "input": { + "type": "simd-op", + "content": {} + }, + "tolerance": { + "type": "simd-tolerance", + "content": { + "min_time_regression": 0.05 + } + } + } + ] +} diff --git a/diskann-benchmark-simd/src/bin.rs b/diskann-benchmark-simd/src/bin.rs index 50efff351..8179feb68 100644 --- a/diskann-benchmark-simd/src/bin.rs +++ b/diskann-benchmark-simd/src/bin.rs @@ -34,7 +34,7 @@ mod tests { use std::path::{Path, PathBuf}; - use diskann_benchmark_runner::app::Commands; + use diskann_benchmark_runner::app::{Check, Commands}; fn run_integration_test(input_file: &Path, output_file: &Path) { let commands = Commands::Run { @@ -55,6 +55,19 @@ mod tests { assert!(output_file.exists()); } + fn run_check_test(input_file: &Path, tolerances: &Path) -> String { + let commands = Commands::Check(Check::Verify { + tolerances: tolerances.to_str().unwrap().into(), + input_file: input_file.to_str().unwrap().into(), + }); + + let app = App::from_commands(commands); + + let mut output = output::Memory::new(); + main_inner(&app, &mut output).unwrap(); + String::from_utf8(output.into_inner()).unwrap() + } + #[test] fn integration_test() { 
let input = if cfg!(target_arch = "x86_64") { @@ -74,4 +87,17 @@ mod tests { run_integration_test(&input_path, &output_path); } + + #[test] + fn check_verify() { + let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("examples") + .join("simd-scalar.json"); + let tolerance_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("examples") + .join("tolerance.json"); + + let stdout = run_check_test(&input_path, &tolerance_path); + println!("stdout = {}", stdout); + } } diff --git a/diskann-benchmark-simd/src/lib.rs b/diskann-benchmark-simd/src/lib.rs index 1336933e4..4fb921590 100644 --- a/diskann-benchmark-simd/src/lib.rs +++ b/diskann-benchmark-simd/src/lib.rs @@ -3,6 +3,8 @@ * Licensed under the MIT license. */ +//! SIMD distance kernel benchmarks with regression detection. + use std::{io::Write, num::NonZeroUsize}; use diskann_utils::views::{Matrix, MatrixView}; @@ -18,10 +20,12 @@ use serde::{Deserialize, Serialize}; use thiserror::Error; use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, describeln, dispatcher::{Description, DispatchRule, FailureScore, MatchScore}, utils::{ datatype::{self, DataType}, + num::{relative_change, NonNegativeFinite}, percentiles, MicroSeconds, }, Any, Benchmark, CheckDeserialization, Checker, Input, @@ -74,7 +78,7 @@ impl std::fmt::Display for SimilarityMeasure { #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] -pub(crate) enum Arch { +enum Arch { #[serde(rename = "x86-64-v4")] #[allow(non_camel_case_types)] X86_64_V4, @@ -99,21 +103,21 @@ impl std::fmt::Display for Arch { } } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub(crate) struct Run { - pub(crate) distance: SimilarityMeasure, - pub(crate) dim: NonZeroUsize, - pub(crate) num_points: NonZeroUsize, - pub(crate) loops_per_measurement: NonZeroUsize, - pub(crate) num_measurements: NonZeroUsize, +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +struct Run { + distance: 
SimilarityMeasure, + dim: NonZeroUsize, + num_points: NonZeroUsize, + loops_per_measurement: NonZeroUsize, + num_measurements: NonZeroUsize, } #[derive(Debug, Serialize, Deserialize)] pub struct SimdOp { - pub(crate) query_type: DataType, - pub(crate) data_type: DataType, - pub(crate) arch: Arch, - pub(crate) runs: Vec, + query_type: DataType, + data_type: DataType, + arch: Arch, + runs: Vec, } impl CheckDeserialization for SimdOp { @@ -198,6 +202,103 @@ impl Input for SimdOp { } } +////////////////////// +// Regression Check // +////////////////////// + +/// Tolerance thresholds for SIMD benchmark regression detection. +/// +/// Each field specifies the maximum allowed relative increase in the corresponding metric. +/// For example, a value of `0.10` means a 10% increase is tolerated. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +struct SimdTolerance { + min_time_regression: NonNegativeFinite, +} + +impl CheckDeserialization for SimdTolerance { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +impl Input for SimdTolerance { + fn tag() -> &'static str { + "simd-tolerance" + } + + fn try_deserialize( + serialized: &serde_json::Value, + checker: &mut Checker, + ) -> anyhow::Result { + checker.any(Self::deserialize(serialized)?) + } + + fn example() -> anyhow::Result { + const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.10) { + Ok(v) => v, + Err(_) => panic!("use a non-negative finite please"), + }; + + Ok(serde_json::to_value(SimdTolerance { + min_time_regression: EXAMPLE, + })?) + } +} + +/// Per-run comparison result showing before/after percentile differences. +#[derive(Debug, Serialize)] +struct Comparison { + run: Run, + tolerance: SimdTolerance, + before_min: f64, + after_min: f64, +} + +/// Aggregated result of the regression check across all runs. 
+#[derive(Debug, Serialize)] +struct CheckResult { + checks: Vec, +} + +impl std::fmt::Display for CheckResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let header = [ + "Distance", + "Dim", + "Min Before (ns)", + "Min After (ns)", + "Change (%)", + "Remark", + ]; + + let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len()); + + for (i, c) in self.checks.iter().enumerate() { + let mut row = table.row(i); + let change = relative_change(c.before_min, c.after_min); + + row.insert(c.run.distance, 0); + row.insert(c.run.dim, 1); + row.insert(format!("{:.3}", c.before_min), 2); + row.insert(format!("{:.3}", c.after_min), 3); + match change { + Ok(change) => { + row.insert(format!("{:.3} %", change * 100.0), 4); + if change > c.tolerance.min_time_regression.get() { + row.insert("FAIL", 5); + } + } + Err(err) => { + row.insert("invalid", 4); + row.insert(err, 5); + } + } + } + + table.fmt(f) + } +} + //////////////////////////// // Benchmark Registration // //////////////////////////// @@ -205,10 +306,10 @@ impl Input for SimdOp { macro_rules! 
register { ($arch:literal, $dispatcher:ident, $name:literal, $($kernel:tt)*) => { #[cfg(target_arch = $arch)] - $dispatcher.register::<$($kernel)*>($name) + $dispatcher.register_regression::<$($kernel)*>($name) }; ($dispatcher:ident, $name:literal, $($kernel:tt)*) => { - $dispatcher.register::<$($kernel)*>($name) + $dispatcher.register_regression::<$($kernel)*>($name) }; } @@ -553,6 +654,72 @@ where } } +impl Regression for Kernel +where + datatype::Type: DispatchRule, + datatype::Type: DispatchRule, + Identity: DispatchRule, + Kernel: RunBenchmark, +{ + type Tolerances = SimdTolerance; + type Pass = CheckResult; + type Fail = CheckResult; + + fn check( + tolerance: &SimdTolerance, + _input: &SimdOp, + before: &Vec, + after: &Vec, + ) -> anyhow::Result> { + anyhow::ensure!( + before.len() == after.len(), + "before has {} runs but after has {}", + before.len(), + after.len(), + ); + + let mut passed = true; + let checks: Vec = std::iter::zip(before.iter(), after.iter()) + .enumerate() + .map(|(i, (b, a))| { + anyhow::ensure!(b.run == a.run, "run {i} mismatched"); + + let computations_per_latency = b.computations_per_latency() as f64; + + let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + + let comparison = Comparison { + run: b.run.clone(), + tolerance: *tolerance, + before_min, + after_min, + }; + + // Determine whether or not we pass. 
+ match relative_change(before_min, after_min) { + Ok(change) => { + if change > tolerance.min_time_regression.get() { + passed = false; + } + } + Err(_) => passed = false, + }; + + Ok(comparison) + }) + .collect::>>()?; + + let check = CheckResult { checks }; + + if passed { + Ok(PassFail::Pass(check)) + } else { + Ok(PassFail::Fail(check)) + } + } +} + /////////////// // Benchmark // /////////////// @@ -561,9 +728,9 @@ trait RunBenchmark { fn run(self, input: &SimdOp) -> Result, anyhow::Error>; } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Deserialize)] struct RunResult { - /// The setup + /// The configuration for this run. run: Run, /// The latencies of individual runs. latencies: Vec, @@ -571,6 +738,12 @@ struct RunResult { percentiles: percentiles::Percentiles, } +impl RunResult { + fn computations_per_latency(&self) -> usize { + self.run.num_points.get() * self.run.loops_per_measurement.get() + } +} + impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if self.is_empty() { @@ -600,8 +773,7 @@ impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { .unwrap_or(MicroSeconds::new(u64::MAX)); let mean_latency = r.percentiles.mean; - let computations_per_latency: f64 = - (r.run.num_points.get() * r.run.loops_per_measurement.get()) as f64; + let computations_per_latency = r.computations_per_latency() as f64; // Convert time from micro-seconds to nano-seconds. 
let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; @@ -1012,3 +1184,154 @@ mod reference { (r.xy / (r.xnorm.sqrt() * r.ynorm.sqrt())).clamp(-1.0, 1.0) } } + +/////////// +// Tests // +/////////// + +#[cfg(test)] +mod tests { + use super::*; + + use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + utils::percentiles::compute_percentiles, + }; + + fn tiny_run(distance: SimilarityMeasure) -> Run { + Run { + distance, + dim: NonZeroUsize::new(8).unwrap(), + num_points: NonZeroUsize::new(1).unwrap(), + loops_per_measurement: NonZeroUsize::new(1).unwrap(), + num_measurements: NonZeroUsize::new(1).unwrap(), + } + } + + fn tiny_op() -> SimdOp { + SimdOp { + query_type: DataType::Float32, + data_type: DataType::Float32, + arch: Arch::Scalar, + runs: vec![tiny_run(SimilarityMeasure::SquaredL2)], + } + } + + fn tiny_result(distance: SimilarityMeasure, minimum: u64) -> RunResult { + let run = tiny_run(distance); + let minimum = MicroSeconds::new(minimum); + let mut latencies = vec![minimum]; + let percentiles = compute_percentiles(&mut latencies).unwrap(); + RunResult { + run, + latencies, + percentiles, + } + } + + fn tolerance(limit: f64) -> SimdTolerance { + SimdTolerance { + min_time_regression: NonNegativeFinite::new(limit).unwrap(), + } + } + + #[test] + fn check_rejects_mismatched_runs() { + type Bench = Kernel; + + let err = Bench::check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(SimilarityMeasure::SquaredL2, 100)], + &vec![tiny_result(SimilarityMeasure::Cosine, 100)], + ) + .unwrap_err(); + + assert_eq!(err.to_string(), "run 0 mismatched"); + } + + #[test] + fn check_allows_negative_relative_change() { + type Bench = Kernel; + + let result = Bench::check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(SimilarityMeasure::SquaredL2, 100)], + &vec![tiny_result(SimilarityMeasure::SquaredL2, 95)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn 
check_passes_on_tolerance_boundary() { + type Bench = Kernel; + + let result = Bench::check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(SimilarityMeasure::SquaredL2, 100)], + &vec![tiny_result(SimilarityMeasure::SquaredL2, 105)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_fails_above_tolerance_boundary() { + type Bench = Kernel; + + let result = Bench::check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(SimilarityMeasure::SquaredL2, 100)], + &vec![tiny_result(SimilarityMeasure::SquaredL2, 106)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + + #[test] + fn check_result_display_includes_failure_details() { + let check = CheckResult { + checks: vec![Comparison { + run: tiny_run(SimilarityMeasure::SquaredL2), + tolerance: tolerance(0.05), + before_min: 100.0, + after_min: 106.0, + }], + }; + + let rendered = check.to_string(); + assert!(rendered.contains("Distance"), "rendered = {rendered}"); + assert!(rendered.contains("squared_l2"), "rendered = {rendered}"); + assert!(rendered.contains("100.000"), "rendered = {rendered}"); + assert!(rendered.contains("106.000"), "rendered = {rendered}"); + assert!(rendered.contains("6.000 %"), "rendered = {rendered}"); + assert!(rendered.contains("FAIL"), "rendered = {rendered}"); + } + + // If a "before" value is 0, we should fail with an error because this means the + // measurement was too fast for us to obtain a reliable signal, so we *could* be letting + // a regression through. + // + // We require at least a non-zero value. + #[test] + fn zero_values_rejected() { + type Bench = Kernel; + + let result = Bench::check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(SimilarityMeasure::SquaredL2, 0)], + &vec![tiny_result(SimilarityMeasure::SquaredL2, 0)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } +}