-
-
Notifications
You must be signed in to change notification settings - Fork 1.8k
sort: add locale-aware month sorting (-M) #11445
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,6 +8,7 @@ | |
| // https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html | ||
|
|
||
| // spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef GETFD localeconv foldhash | ||
| // spell-checker:ignore (misc) uppercased qsort getmonth juin juil | ||
|
|
||
| mod buffer_hint; | ||
| mod check; | ||
|
|
@@ -42,13 +43,15 @@ use std::os::unix::ffi::OsStrExt; | |
| use std::path::Path; | ||
| use std::path::PathBuf; | ||
| use std::str::Utf8Error; | ||
| use std::sync::OnceLock; | ||
| use thiserror::Error; | ||
| use uucore::display::Quotable; | ||
| use uucore::error::{FromIo, strip_errno}; | ||
| use uucore::error::{UError, UResult, USimpleError, UUsageError}; | ||
| use uucore::extendedbigdecimal::ExtendedBigDecimal; | ||
| #[cfg(feature = "i18n-collator")] | ||
| use uucore::i18n::collator::{compute_sort_key_utf8, locale_cmp}; | ||
| use uucore::i18n::datetime::get_locale_months; | ||
| use uucore::i18n::decimal::locale_decimal_separator; | ||
| use uucore::line_ending::LineEnding; | ||
| use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError}; | ||
|
|
@@ -787,26 +790,20 @@ impl<'a> Line<'a> { | |
| } | ||
| SortMode::Month => { | ||
| let initial_selection = &self.line[selection.clone()]; | ||
|
|
||
| let mut month_chars = initial_selection | ||
| let trimmed = initial_selection | ||
| .iter() | ||
| .enumerate() | ||
| .skip_while(|(_, c)| c.is_ascii_whitespace()); | ||
| .position(|c| !c.is_ascii_whitespace()) | ||
| .unwrap_or(initial_selection.len()); | ||
|
|
||
| let (parsed, match_len) = month_parse(initial_selection); | ||
|
|
||
| let month = if month_parse(initial_selection) == Month::Unknown { | ||
| let month = if parsed == Month::Unknown { | ||
| // We failed to parse a month, which is equivalent to matching nothing. | ||
| // Add the "no match for key" marker to the first non-whitespace character. | ||
| let first_non_whitespace = month_chars.next(); | ||
| first_non_whitespace.map_or( | ||
| initial_selection.len()..initial_selection.len(), | ||
| |(idx, _)| idx..idx, | ||
| ) | ||
| trimmed..trimmed | ||
| } else { | ||
| // We parsed a month. Match the first three non-whitespace characters, which must be the month we parsed. | ||
| month_chars.next().unwrap().0 | ||
| ..month_chars | ||
| .nth(2) | ||
| .map_or(initial_selection.len(), |(idx, _)| idx) | ||
| // We parsed a month. Use the actual match byte length. | ||
| trimmed..(trimmed + match_len) | ||
|
Comment on lines
+803
to
+806
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might make sense to set |
||
| }; | ||
|
|
||
| // Shorten selection to month. | ||
|
|
@@ -3017,30 +3014,122 @@ enum Month { | |
| December, | ||
| } | ||
|
|
||
| /// Cached locale month lookup table. | ||
| /// Each entry is (uppercased_blank_stripped_name, month_value). | ||
| type MonthTable = Vec<(Vec<u8>, Month)>; | ||
|
|
||
| fn get_locale_month_table() -> Option<&'static MonthTable> { | ||
| static TABLE: OnceLock<Option<MonthTable>> = OnceLock::new(); | ||
|
|
||
| TABLE | ||
| .get_or_init(|| { | ||
| let months = get_locale_months()?; | ||
| let all_months = [ | ||
| Month::January, | ||
| Month::February, | ||
| Month::March, | ||
| Month::April, | ||
| Month::May, | ||
| Month::June, | ||
| Month::July, | ||
| Month::August, | ||
| Month::September, | ||
| Month::October, | ||
| Month::November, | ||
| Month::December, | ||
| ]; | ||
| let table: Vec<(Vec<u8>, Month)> = months | ||
| .iter() | ||
| .zip(all_months.iter()) | ||
| .map(|(name, &month)| (name.clone(), month)) | ||
| .collect(); | ||
| Some(table) | ||
| }) | ||
| .as_ref() | ||
| } | ||
|
|
||
| /// Parse the beginning string into a Month, returning [`Month::Unknown`] on errors. | ||
| fn month_parse(line: &[u8]) -> Month { | ||
| /// Also returns the byte length consumed from the input (after leading blanks). | ||
| fn month_parse(line: &[u8]) -> (Month, usize) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this function is not entirely correct. Here is an example where I get a different output compared to GNU |
||
| let line = line.trim_ascii_start(); | ||
|
|
||
| // Try locale-specific month names by scanning all entries. | ||
| // We track the longest match to handle cases where one month name is a | ||
| // prefix of another (e.g., "juin" vs "juil." in French). | ||
| if let Some(table) = get_locale_month_table() { | ||
| let mut best_month = Month::Unknown; | ||
| let mut best_consumed: usize = 0; | ||
|
|
||
| for &(ref name, month) in table { | ||
| let mut m_iter = line.iter(); | ||
| let mut n_iter = name.iter(); | ||
| let mut consumed = 0; | ||
| let mut matched = true; | ||
|
|
||
| loop { | ||
| match n_iter.next() { | ||
| None => { | ||
| // Matched the entire month name | ||
| break; | ||
| } | ||
| Some(&n_byte) => { | ||
| // Skip blanks in input (matching GNU behavior) | ||
| let m_byte = loop { | ||
| match m_iter.next() { | ||
| Some(&b) if b.is_ascii_whitespace() => { | ||
| consumed += 1; | ||
| } | ||
| Some(&b) => { | ||
| consumed += 1; | ||
| break b.to_ascii_uppercase(); | ||
| } | ||
| None => { | ||
| // Input exhausted before month name ended. | ||
| break 0; | ||
| } | ||
| } | ||
| }; | ||
|
|
||
| if m_byte != n_byte { | ||
| matched = false; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if matched && consumed > best_consumed { | ||
| best_month = month; | ||
| best_consumed = consumed; | ||
| } | ||
| } | ||
|
|
||
| if best_month != Month::Unknown { | ||
| return (best_month, best_consumed); | ||
| } | ||
| } | ||
|
|
||
| // Fall back to English 3-letter abbreviations | ||
| match line.get(..3).map(<[u8]>::to_ascii_uppercase).as_deref() { | ||
| Some(b"JAN") => Month::January, | ||
| Some(b"FEB") => Month::February, | ||
| Some(b"MAR") => Month::March, | ||
| Some(b"APR") => Month::April, | ||
| Some(b"MAY") => Month::May, | ||
| Some(b"JUN") => Month::June, | ||
| Some(b"JUL") => Month::July, | ||
| Some(b"AUG") => Month::August, | ||
| Some(b"SEP") => Month::September, | ||
| Some(b"OCT") => Month::October, | ||
| Some(b"NOV") => Month::November, | ||
| Some(b"DEC") => Month::December, | ||
| _ => Month::Unknown, | ||
| Some(b"JAN") => (Month::January, 3), | ||
| Some(b"FEB") => (Month::February, 3), | ||
| Some(b"MAR") => (Month::March, 3), | ||
| Some(b"APR") => (Month::April, 3), | ||
| Some(b"MAY") => (Month::May, 3), | ||
| Some(b"JUN") => (Month::June, 3), | ||
| Some(b"JUL") => (Month::July, 3), | ||
| Some(b"AUG") => (Month::August, 3), | ||
| Some(b"SEP") => (Month::September, 3), | ||
| Some(b"OCT") => (Month::October, 3), | ||
| Some(b"NOV") => (Month::November, 3), | ||
| Some(b"DEC") => (Month::December, 3), | ||
| _ => (Month::Unknown, 0), | ||
| } | ||
| } | ||
|
|
||
| fn month_compare(a: &[u8], b: &[u8]) -> Ordering { | ||
| let ma = month_parse(a); | ||
| let mb = month_parse(b); | ||
| let ma = month_parse(a).0; | ||
| let mb = month_parse(b).0; | ||
|
|
||
| ma.cmp(&mb) | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The name
trimmedis a bit misleading as it contains a position (and not a trimmed string).