diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 6c2f0de..79b7340 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -26,3 +26,7 @@ harness = false [[bench]] name = "eq" harness = false + +[[bench]] +name = "collection" +harness = false diff --git a/bench/benches/collection.rs b/bench/benches/collection.rs new file mode 100644 index 0000000..e461d68 --- /dev/null +++ b/bench/benches/collection.rs @@ -0,0 +1,69 @@ +use ahash::AHashSet; +use bench::*; +use criterion::{ + black_box, criterion_group, criterion_main, measurement::WallTime, BenchmarkGroup, Criterion, +}; +use std::hash::Hash; +use std::str::FromStr; + +const LENGTHS: &[usize] = &[64]; + +fn bench_hashset_inner( + g: &mut BenchmarkGroup<'_, WallTime>, + name: &'static str, + min: usize, + max: usize, + string_array: &[String], + indices: &[usize], +) { + let string_vec: Vec<_> = string_array + .iter() + .map(|s| T::from_str(s).map_err(|_| ()).unwrap()) + .collect(); + let strings: AHashSet<_> = string_array + .iter() + .map(|s| T::from_str(s).map_err(|_| ()).unwrap()) + .collect(); + + let strings = black_box(strings); + let label = format!("{}-len={}-{}", name, min, max); + + g.bench_function(&label, |b| { + b.iter(|| { + for &i in indices.iter() { + let s = &string_vec[i]; + let _ = black_box(strings.contains(s)); + } + }) + }); +} + +#[rustfmt::skip] +fn bench_hashset(c: &mut Criterion) { + let mut group = c.benchmark_group("hashset"); + let count = 1_000_000; + + let mut indices: Vec = (0..count).collect(); + fastrand::shuffle(&mut indices); + let indices_subset = &indices[..1000]; + + for len in LENGTHS { + for min in [0, *len] { + let mut strings = Vec::with_capacity(count); + for _ in 0..count { + strings.push(random_string(min, *len)); + } + bench_hashset_inner::(&mut group, "std", min, *len, &strings, indices_subset); + bench_hashset_inner::(&mut group, "smol_str", min, *len, &strings, indices_subset); + bench_hashset_inner::(&mut group, "compact_str", min, *len, &strings, 
indices_subset); + bench_hashset_inner::(&mut group, "smartstring", min, *len, &strings, indices_subset); + bench_hashset_inner::>(&mut group, "smallstr", min, *len, &strings, indices_subset); + bench_hashset_inner::(&mut group, "compact_string", min, *len, &strings, indices_subset); + bench_hashset_inner::(&mut group, "cold-string", min, *len, &strings, indices_subset); + } + } + group.finish(); +} + +criterion_group!(benches, bench_hashset); +criterion_main!(benches); diff --git a/bench/memory.py b/bench/memory.py index 1504823..d0e3b82 100644 --- a/bench/memory.py +++ b/bench/memory.py @@ -32,7 +32,7 @@ def main(): for file in sorted(csv_files): xs, ys = read_csv(file) label = os.path.splitext(os.path.basename(file))[0] - plt.plot(xs, ys, label=label, linewidth=3.5, alpha = 0.75) + plt.plot(xs, ys, label=label, linewidth=3.5, alpha = 1.0) plt.xlabel("String Length") plt.ylabel("Memory Usage (bytes)") diff --git a/bench/tests/memory.rs b/bench/tests/memory.rs index 7962075..f3a94ab 100644 --- a/bench/tests/memory.rs +++ b/bench/tests/memory.rs @@ -3,7 +3,7 @@ use bench::*; -use ahash::{HashMap, HashMapExt}; +use ahash::{HashSet, HashSetExt}; use std::alloc::{GlobalAlloc, Layout, System}; use std::cmp::Ord; use std::collections::BTreeMap; @@ -78,10 +78,10 @@ fn test_allocator_memory() { allocator_memory::("cold-string"); } -fn hash_map_workload(min: usize, max: usize) { - let mut strings: HashMap = HashMap::with_capacity(TRIALS); +fn hash_set_workload(min: usize, max: usize) { + let mut strings: HashSet = HashSet::with_capacity(TRIALS); for _ in 0..TRIALS { - strings.insert(random_string(min, max), random_string(min, max)); + strings.insert(random_string(min, max)); } let strings = std::hint::black_box(strings); std::mem::forget(strings); @@ -148,17 +148,8 @@ fn system_memory(name: &str, workload: impl Fn(usize, usize)) { print!("\n"); } -/// Not run automatically. 
-/// Run with `cargo test test_system_memory --release -- --no-capture --include-ignored` -/// Or specify min,max: -/// ``` -/// cargo test test_system_memory --release -- --no-capture --include-ignored -/// ``` -#[test] -#[rustfmt::skip] -#[ignore] -fn test_system_memory() { - print!("{:CELL_WIDTH$}", format!("{}..={}", 0, size)); } @@ -169,12 +160,49 @@ fn test_system_memory() { print!(" {: ^CELL_WIDTH$} |", ":---:"); } println!(); +} - system_memory("cold-string", hash_map_workload::); - system_memory("compact_str", hash_map_workload::); - system_memory("compact_string", hash_map_workload::); - system_memory("smallstr", hash_map_workload::>); - system_memory("smartstring", hash_map_workload::); - system_memory("smol_str", hash_map_workload::); - system_memory("std", hash_map_workload::); +/// Ignored by default; run with `cargo test test_system_memory_vec --release -- --nocapture --include-ignored` +#[test] +#[rustfmt::skip] +#[ignore] +fn test_system_memory_vec() { + print_table_header("Vec"); + system_memory("cold-string", vec_workload::); + system_memory("compact_str", vec_workload::); + system_memory("compact_string", vec_workload::); + system_memory("smallstr", vec_workload::>); + system_memory("smartstring", vec_workload::); + system_memory("smol_str", vec_workload::); + system_memory("std", vec_workload::); +} + +/// Ignored by default; run with `cargo test test_system_memory_hashset --release -- --nocapture --include-ignored` +#[test] +#[rustfmt::skip] +#[ignore] +fn test_system_memory_hashset() { + print_table_header("HashSet"); + system_memory("cold-string", hash_set_workload::); + system_memory("compact_str", hash_set_workload::); + system_memory("compact_string", hash_set_workload::); + system_memory("smallstr", hash_set_workload::>); + system_memory("smartstring", hash_set_workload::); + system_memory("smol_str", hash_set_workload::); + system_memory("std", hash_set_workload::); +} + +/// Ignored by default; run with `cargo test test_system_memory_btreeset --release -- --nocapture --include-ignored` +#[test] +#[rustfmt::skip] 
+#[ignore] +fn test_system_memory_btreeset() { + print_table_header("BTreeSet"); + system_memory("cold-string", btree_workload::); + system_memory("compact_str", btree_workload::); + system_memory("compact_string", btree_workload::); + system_memory("smallstr", btree_workload::>); + system_memory("smartstring", btree_workload::); + system_memory("smol_str", btree_workload::); + system_memory("std", btree_workload::); } diff --git a/cold-string/README.md b/cold-string/README.md index 4badad7..16969fb 100644 --- a/cold-string/README.md +++ b/cold-string/README.md @@ -3,37 +3,27 @@ [![Crates.io](https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust)](https://crates.io/crates/cold-string) [![docs.rs](https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs)](https://docs.rs/cold-string) ![MSRV](https://img.shields.io/crates/msrv/cold-string?style=for-the-badge) -![Downloads](https://img.shields.io/crates/d/cold-string?style=for-the-badge) A 1-word (8-byte) sized representation of immutable UTF-8 strings that in-lines up to 8 bytes. Optimized for memory usage and struct packing. -# Overview +## Overview -`ColdString` is optimized for memory efficiency for **large** and **short** strings: -- 0..=8 bytes: always 8 bytes total (fully inlined). -- 9..=128 bytes: 8-byte pointer + 1-byte length encoding -- 129..=16384 bytes: 8-byte pointer + 2-byte length encoding -- Continues logarithmically up to 18 bytes overhead for sizes up to `isize::MAX`. +`ColdString` minimizes per-string overhead for both **short and large** strings. +- Strings ≤ 8 bytes: **8 bytes total** +- Larger strings: **~9–10 bytes overhead** (other string libraries have 24 bytes per value) -Compared to `String`, which stores capacity and length inline (3 machine words), `ColdString` avoids storing length inline for heap strings and compresses metadata into tagged pointer space. 
This leads to substantial memory savings in benchmarks (see [Memory Comparison (System RSS)](#memory-comparison-system-rss)): -- **36% – 68%** smaller than `String` in `HashMap` -- **28% – 65%** smaller than other short-string crates in `HashMap` +This leads to substantial memory savings over both `String` and other short-string crates (see [Memory Comparison (System RSS)](#memory-comparison-system-rss)): +- **35% – 67%** smaller than `String` in `HashSet` +- **35% – 64%** smaller than other short-string crates in `HashSet` - **30% – 75%** smaller than `String` in `BTreeSet` - **13% – 63%** smaller than other short-string crates in `BTreeSet` -`ColdString`'s MSRV is 1.60, is `no_std` compatible, and is a drop in replacement for immutable Strings. - -### Safety -`ColdString` is written using [Rust's strict provenance API](https://doc.rust-lang.org/beta/std/ptr/index.html#strict-provenance), carefully handles unaligned access internally, and is validated with property testing and MIRI. +--- -### Why "Cold"? - -The heap representation stores the length on the heap, not inline in the struct. This saves memory in the struct itself but *slightly* increases the cost of `len()` since it requires a heap read. In practice, the `len()` cost is only marginally slower than inline storage and is typically negligible compared to: -- Memory savings -- Cache density improvements -- Faster collection operations due to reduced footprint +### Portability +`ColdString` has an MSRV of 1.60, is `no_std` compatible, and is a drop-in replacement for immutable `String`s. 
-# Usage +## Usage Use it like a `String`: ```rust @@ -45,57 +35,54 @@ assert_eq!(s.as_str(), "qwerty"); Packs well with other types: ```rust -use std::mem; use cold_string::ColdString; +use std::mem::{align_of, size_of}; -assert_eq!(mem::size_of::(), mem::size_of::()); -assert_eq!(mem::align_of::(), 1); +assert_eq!(size_of::(), size_of::()); +assert_eq!(align_of::(), 1); -assert_eq!(mem::size_of::<(ColdString, u8)>(), mem::size_of::() + 1); -assert_eq!(mem::align_of::<(ColdString, u8)>(), 1); +assert_eq!(size_of::<(ColdString, u8)>(), size_of::() + 1); +assert_eq!(size_of::>(), size_of::() + 1); ``` -# How It Works +## How It Works -ColdString is 8-byte tagged pointer (4 bytes on 32-bit machines): +ColdString is an 8-byte tagged pointer (4 bytes on 32-bit machines): ```rust #[repr(packed)] pub struct ColdString { - /// The first byte of `encoded` is the "tag" and it determines the type: - /// - 10xxxxxx: an encoded address for the heap. To decode, 10 is set to 00 and swapped - /// with the LSB bits of the tag byte. The address is always a multiple of 4 (`HEAP_ALIGN`). - /// - 11111xxx: xxx is the length in range 0..=7, followed by length UTF-8 bytes. - /// - xxxxxxxx (valid UTF-8): 8 UTF-8 bytes. encoded: *mut u8, } ``` -`encoded` acts as either a pointer to the heap for strings longer than 8 bytes or is the inlined data itself. The first/"tag" byte indicates one of 3 encodings: +The 8 bytes encode one of three representations indicated by the 1st byte: +- `10xxxxxx`: `encoded` contains a tagged heap pointer. To decode the address, clear the tag bits (`10 → 00`) and rotate so the `00` bits become the least-significant bits. The heap allocation uses [4-byte alignment](https://doc.rust-lang.org/beta/std/alloc/struct.Layout.html#method.from_size_align), guaranteeing the +least-significant 2 bits of the address are `00`. On the heap, the UTF-8 characters are preceded by the variable-length encoding of the size. 
The size uses 1 byte for 0 - 127, 2 bytes for 128 - 16383, etc. +- `11111xxx`: xxx is the length and the remaining 0-7 bytes are UTF-8 characters. +- `xxxxxxxx`: All 8 bytes are UTF-8. -### Inline Mode (0 to 7 Bytes) -The tag byte has bits 11111xxx, where xxx is the length. `self.0[1]` to `self.0[7]` store the bytes of string. +`10xxxxxx` and `11111xxx` are chosen because they cannot be valid first bytes of UTF-8. -### Inline Mode (8 Bytes) -The tag byte is any valid UTF-8 byte. `self.0` stores the bytes of string. Since the string is UTF-8, the tag byte is guaranteed to not be 10xxxxx or 11111xxx. +### Why "Cold"? -### Heap Mode -`self.0` encodes the pointer to heap, where tag byte is 10xxxxxx. 10xxxxxx is chosen because it's a UTF-8 continuation byte and therefore an impossible tag byte for inline mode. Since a heap-alignment of 4 is chosen, the pointer's least significant 2 bits are guaranteed to be 0 ([See more](https://doc.rust-lang.org/beta/std/alloc/struct.Layout.html#method.from_size_align)). These bits are swapped with the 10 "tag" bits when de/coding between `self.0` and the address value. +The heap representation stores the length on the heap, not inline in the struct. This saves memory in the struct itself but *slightly* increases the cost of `len()` since it requires a heap read. In practice, the `len()` cost is only marginally slower than inline storage and is typically negligible compared to memory savings, cache density improvements, and 3x faster operations on inlined strings. -On the heap, the data starts with a variable length integer encoding of the length, followed by the bytes. -```text,ignore -ptr --> -``` +### Safety -# Memory Comparisons (Allocator) +`ColdString` uses `unsafe` to implement its packed representation and pointer tagging. Usage of `unsafe` is narrowly scoped to where layout control is required, and each instance is documented with `// SAFETY: `. 
To further ensure soundness, `ColdString` is written using [Rust's strict provenance API](https://doc.rust-lang.org/beta/std/ptr/index.html#strict-provenance), handles unaligned access internally, maintains explicit heap alignment guarantees, and is validated with property testing and MIRI. + +## Benchmarks + +### Memory Comparisons (Allocator) Memory usage per string, measured by tracking the memory requested by the allocator: ![string_memory](https://github.com/user-attachments/assets/adf09756-9910-4618-a97f-b5ab91a2515a) -## Memory Comparison (System RSS) +### Memory Comparison (System RSS) -RSS per insertion of various collections containing strings of random lengths 0..=N: +Resident set size in bytes per insertion of various collections. Insertions are strings with random length 0..=N: -Vec | 0..=4 | 0..=8 | 0..=16 | 0..=32 | 0..=64 +Vec | 0..=4 | 0..=8 | 0..=16 | 0..=32 | 0..=64 :--- | :---: | :---: | :---: | :---: | :---: | cold-string | 8.0 | 8.0 | 23.2 | 33.7 | 53.4 compact_str | 24.0 | 24.0 | 24.0 | 34.6 | 60.6 @@ -105,17 +92,17 @@ smartstring | 24.0 | 24.0 | 24.0 | 40.4 | 65.4 smol_str | 24.0 | 24.0 | 24.0 | 39.9 | 71.2 std | 35.8 | 37.4 | 45.8 | 54.2 | 70.5 -HashMap | 0..=4 | 0..=8 | 0..=16 | 0..=32 | 0..=64 +HashSet | 0..=4 | 0..=8 | 0..=16 | 0..=32 | 0..=64 :--- | :---: | :---: | :---: | :---: | :---: | -cold-string | 35.7 | 35.7 | 63.3 | 88.2 | 125.1 -compact_str | 102.8 | 102.8 | 102.8 | 123.7 | 175.5 -compact_string | 45.4 | 59.6 | 78.2 | 97.1 | 130.1 -smallstr | 102.8 | 102.8 | 129.7 | 155.0 | 191.6 -smartstring | 102.8 | 102.8 | 102.8 | 135.9 | 185.8 -smol_str | 102.8 | 102.8 | 102.8 | 134.8 | 196.6 -std | 112.8 | 123.9 | 143.2 | 161.8 | 195.3 - -B-Tree Set | 0..=4 | 0..=8 | 0..=16 | 0..=32 | 0..=64 +cold-string | 18.9 | 18.9 | 34.5 | 45.5 | 64.0 +compact_str | 52.4 | 52.4 | 52.4 | 62.2 | 88.9 +compact_string | 23.2 | 30.0 | 39.6 | 49.1 | 65.9 +smallstr | 52.4 | 52.4 | 66.5 | 78.6 | 96.9 +smartstring | 52.4 | 52.4 | 52.4 | 68.2 | 94.0 +smol_str | 
52.4 | 52.4 | 52.4 | 68.3 | 99.4 +std | 56.8 | 61.9 | 72.2 | 81.7 | 98.5 + +BTreeSet | 0..=4 | 0..=8 | 0..=16 | 0..=32 | 0..=64 :--- | :---: | :---: | :---: | :---: | :---: | cold-string | 10.1 | 18.9 | 49.3 | 79.1 | 117.2 compact_str | 24.8 | 48.4 | 61.5 | 90.5 | 145.7 @@ -125,10 +112,8 @@ smartstring | 24.5 | 48.6 | 61.1 | 102.3 | 155.8 smol_str | 25.0 | 48.3 | 61.6 | 100.7 | 166.7 std | 35.8 | 70.4 | 102.9 | 128.9 | 165.5 -**Note:** Columns represent string length (bytes/chars). Values represent average Resident Set Size (RSS) in bytes per string instance. Measurements taken with 10M iterations. - -## Speed -### Construction: Variable Length (0..=N) [ns/op] +### Speed +#### Construction: Variable Length (0..=N) [ns/op] Crate | 0..=4 | 0..=8 | 0..=16 | 0..=32 | 0..=64 :--- | :---: | :---: | :---: | :---: | :---: cold-string | 10.0 | 9.2 | 25.3 | 30.0 | 37.2 @@ -139,7 +124,7 @@ smartstring | 14.8 | 15.1 | 15.0 | 26.9 | 4 smol_str | 19.2 | 19.8 | 20.1 | 23.4 | 33.7 std | 28.6 | 31.4 | 34.9 | 32.0 | 33.1 -### Construction: Fixed Length (N..=N) [ns/op] +#### Construction: Fixed Length (N..=N) [ns/op] Crate | 4..=4 | 8..=8 | 16..=16 | 32..=32 | 64..=64 :--- | :---: | :---: | :---: | :---: | :---: cold-string | 6.5 | 4.2 | 34.2 | 34.3 | 36.2