Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
[dev-dependencies]
quickcheck = "0.7"
criterion = "0.5"
proptest = "1.7.0"

[[bench]]
name = "chars"
Expand All @@ -36,3 +37,8 @@ harness = false
[[bench]]
name = "word_bounds"
harness = false

[[bench]]
name = "unicode_word_indices"
harness = false

4 changes: 2 additions & 2 deletions benches/chars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}

for file in FILES {
group.bench_with_input(
BenchmarkId::new("scalar", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| scalar(content)),
);
}
Expand Down
1 change: 1 addition & 0 deletions benches/texts/log.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later
37 changes: 37 additions & 0 deletions benches/unicode_word_indices.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

// Input corpora for the benchmark; each entry needs a matching
// `benches/texts/<name>.txt` fixture to exist, or `read_to_string` in
// `bench_all` will panic at startup.
// NOTE(review): the commented-out entries presumably have no fixture file in
// this checkout (only `log.txt` is added by this change) — confirm before
// re-enabling any of them.
const FILES: &[&str] = &[
    "log", //"arabic",
    "english",
    //"hindi",
    "japanese",
    //"korean",
    //"mandarin",
    //"russian",
    //"source_code",
];

/// Drives the iterator under test to completion, sinking every yielded
/// `(usize, &str)` item through `black_box` so the optimizer cannot
/// elide the segmentation work being measured.
//
// Renamed from `grapheme`: that name was a copy-paste leftover from
// `benches/chars.rs` and misdescribed what this helper iterates.
#[inline(always)]
fn iterate_word_indices(text: &str) {
    for w in text.unicode_word_indices() {
        black_box(w);
    }
}

/// Benchmarks `unicode_word_indices` over each corpus listed in `FILES`.
///
/// # Panics
/// Panics if a `benches/texts/<file>.txt` fixture is missing or unreadable.
fn bench_all(c: &mut Criterion) {
    let mut group = c.benchmark_group("unicode_word_indices");

    for file in FILES {
        let input = fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap();
        // Declaring byte throughput makes criterion report bytes/sec,
        // which is comparable across corpora of different sizes.
        group.throughput(criterion::Throughput::Bytes(input.len() as u64));
        group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| {
            b.iter(|| iterate_word_indices(content))
        });
    }
}

criterion_group!(benches, bench_all);
criterion_main!(benches);
2 changes: 1 addition & 1 deletion benches/word_bounds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt",)).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}
Expand Down
4 changes: 2 additions & 2 deletions benches/words.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}

for file in FILES {
group.bench_with_input(
BenchmarkId::new("scalar", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| scalar(content)),
);
}
Expand Down
21 changes: 13 additions & 8 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,16 @@
)]
#![no_std]

#[cfg(test)]
extern crate std;

pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use grapheme::{GraphemeIndices, Graphemes};
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
pub use word::{UWordBoundIndices, UWordBounds};
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this change made?

Copy link
Copy Markdown
Contributor

@PSeitz PSeitz Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, that shouldn't be there. That's left over from a reverted refactoring.


use crate::word::{UnicodeWordIndices, UnicodeWords};

mod grapheme;
mod sentence;
Expand Down Expand Up @@ -248,7 +253,7 @@ pub trait UnicodeSegmentation {

impl UnicodeSegmentation for str {
#[inline]
fn graphemes(&self, is_extended: bool) -> Graphemes {
fn graphemes(&self, is_extended: bool) -> Graphemes<'_> {
grapheme::new_graphemes(self, is_extended)
}

Expand All @@ -258,32 +263,32 @@ impl UnicodeSegmentation for str {
}

#[inline]
fn unicode_words(&self) -> UnicodeWords {
fn unicode_words(&self) -> UnicodeWords<'_> {
word::new_unicode_words(self)
}

#[inline]
fn unicode_word_indices(&self) -> UnicodeWordIndices {
fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> {
word::new_unicode_word_indices(self)
}

#[inline]
fn split_word_bounds(&self) -> UWordBounds {
fn split_word_bounds(&self) -> UWordBounds<'_> {
word::new_word_bounds(self)
}

#[inline]
fn split_word_bound_indices(&self) -> UWordBoundIndices {
fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> {
word::new_word_bound_indices(self)
}

#[inline]
fn unicode_sentences(&self) -> UnicodeSentences {
fn unicode_sentences(&self) -> UnicodeSentences<'_> {
sentence::new_unicode_sentences(self)
}

#[inline]
fn split_sentence_bounds(&self) -> USentenceBounds {
fn split_sentence_bounds(&self) -> USentenceBounds<'_> {
sentence::new_sentence_bounds(self)
}

Expand Down
Loading