diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index c648763764f..d9bc86a687e 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -42,6 +42,7 @@ uucore = { workspace = true, features = [ "version-cmp", "i18n-decimal", "i18n-collator", + "i18n-datetime", ] } fluent = { workspace = true } foldhash = { workspace = true } @@ -61,6 +62,7 @@ uucore = { workspace = true, features = [ "parser-size", "version-cmp", "i18n-collator", + "i18n-datetime", ] } [[bin]] diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index c87025a509d..c19c01126f9 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -8,6 +8,7 @@ // https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html // spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef GETFD localeconv foldhash +// spell-checker:ignore (misc) uppercased qsort getmonth juin juil mod buffer_hint; mod check; @@ -42,6 +43,7 @@ use std::os::unix::ffi::OsStrExt; use std::path::Path; use std::path::PathBuf; use std::str::Utf8Error; +use std::sync::OnceLock; use thiserror::Error; use uucore::display::Quotable; use uucore::error::{FromIo, strip_errno}; @@ -49,6 +51,7 @@ use uucore::error::{UError, UResult, USimpleError, UUsageError}; use uucore::extendedbigdecimal::ExtendedBigDecimal; #[cfg(feature = "i18n-collator")] use uucore::i18n::collator::{compute_sort_key_utf8, locale_cmp}; +use uucore::i18n::datetime::get_locale_months; use uucore::i18n::decimal::locale_decimal_separator; use uucore::line_ending::LineEnding; use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError}; @@ -787,26 +790,20 @@ impl<'a> Line<'a> { } SortMode::Month => { let initial_selection = &self.line[selection.clone()]; - - let mut month_chars = initial_selection + let trimmed = initial_selection .iter() - .enumerate() - .skip_while(|(_, c)| c.is_ascii_whitespace()); + .position(|c| !c.is_ascii_whitespace()) + .unwrap_or(initial_selection.len()); + + let (parsed, match_len) = month_parse(initial_selection); - let month = if month_parse(initial_selection) == Month::Unknown { + let month = if parsed == Month::Unknown { // We failed to parse a month, which is equivalent to matching nothing. // Add the "no match for key" marker to the first non-whitespace character. - let first_non_whitespace = month_chars.next(); - first_non_whitespace.map_or( - initial_selection.len()..initial_selection.len(), - |(idx, _)| idx..idx, - ) + trimmed..trimmed } else { - // We parsed a month. Match the first three non-whitespace characters, which must be the month we parsed. - month_chars.next().unwrap().0 - ..month_chars - .nth(2) - .map_or(initial_selection.len(), |(idx, _)| idx) + // We parsed a month. Use the actual match byte length. + trimmed..(trimmed + match_len) }; // Shorten selection to month. @@ -3017,30 +3014,122 @@ enum Month { December, } +/// Cached locale month lookup table. +/// Each entry is (uppercased_blank_stripped_name, month_value). +type MonthTable = Vec<(Vec, Month)>; + +fn get_locale_month_table() -> Option<&'static MonthTable> { + static TABLE: OnceLock> = OnceLock::new(); + + TABLE + .get_or_init(|| { + let months = get_locale_months()?; + let all_months = [ + Month::January, + Month::February, + Month::March, + Month::April, + Month::May, + Month::June, + Month::July, + Month::August, + Month::September, + Month::October, + Month::November, + Month::December, + ]; + let table: Vec<(Vec, Month)> = months + .iter() + .zip(all_months.iter()) + .map(|(name, &month)| (name.clone(), month)) + .collect(); + Some(table) + }) + .as_ref() +} + /// Parse the beginning string into a Month, returning [`Month::Unknown`] on errors. -fn month_parse(line: &[u8]) -> Month { +/// Also returns the byte length consumed from the input (after leading blanks). +fn month_parse(line: &[u8]) -> (Month, usize) { let line = line.trim_ascii_start(); + // Try locale-specific month names by scanning all entries. + // We track the longest match to handle cases where one month name is a + // prefix of another (e.g., "juin" vs "juil." in French). + if let Some(table) = get_locale_month_table() { + let mut best_month = Month::Unknown; + let mut best_consumed: usize = 0; + + for &(ref name, month) in table { + let mut m_iter = line.iter(); + let mut n_iter = name.iter(); + let mut consumed = 0; + let mut matched = true; + + loop { + match n_iter.next() { + None => { + // Matched the entire month name + break; + } + Some(&n_byte) => { + // Skip blanks in input (matching GNU behavior) + let m_byte = loop { + match m_iter.next() { + Some(&b) if b.is_ascii_whitespace() => { + consumed += 1; + } + Some(&b) => { + consumed += 1; + break b.to_ascii_uppercase(); + } + None => { + // Input exhausted before month name ended. + break 0; + } + } + }; + + if m_byte != n_byte { + matched = false; + break; + } + } + } + } + + if matched && consumed > best_consumed { + best_month = month; + best_consumed = consumed; + } + } + + if best_month != Month::Unknown { + return (best_month, best_consumed); + } + } + + // Fall back to English 3-letter abbreviations match line.get(..3).map(<[u8]>::to_ascii_uppercase).as_deref() { - Some(b"JAN") => Month::January, - Some(b"FEB") => Month::February, - Some(b"MAR") => Month::March, - Some(b"APR") => Month::April, - Some(b"MAY") => Month::May, - Some(b"JUN") => Month::June, - Some(b"JUL") => Month::July, - Some(b"AUG") => Month::August, - Some(b"SEP") => Month::September, - Some(b"OCT") => Month::October, - Some(b"NOV") => Month::November, - Some(b"DEC") => Month::December, - _ => Month::Unknown, + Some(b"JAN") => (Month::January, 3), + Some(b"FEB") => (Month::February, 3), + Some(b"MAR") => (Month::March, 3), + Some(b"APR") => (Month::April, 3), + Some(b"MAY") => (Month::May, 3), + Some(b"JUN") => (Month::June, 3), + Some(b"JUL") => (Month::July, 3), + Some(b"AUG") => (Month::August, 3), + Some(b"SEP") => (Month::September, 3), + Some(b"OCT") => (Month::October, 3), + Some(b"NOV") => (Month::November, 3), + Some(b"DEC") => (Month::December, 3), + _ => (Month::Unknown, 0), } } fn month_compare(a: &[u8], b: &[u8]) -> Ordering { - let ma = month_parse(a); - let mb = month_parse(b); + let ma = month_parse(a).0; + let mb = month_parse(b).0; ma.cmp(&mb) } diff --git a/src/uucore/src/lib/features/i18n/datetime.rs b/src/uucore/src/lib/features/i18n/datetime.rs index 88816d9daed..c79b4426fa9 100644 --- a/src/uucore/src/lib/features/i18n/datetime.rs +++ b/src/uucore/src/lib/features/i18n/datetime.rs @@ -3,7 +3,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore fieldsets prefs febr +// spell-checker:ignore fieldsets prefs febr abmon langinfo uppercased //! Locale-aware datetime formatting utilities using ICU and jiff-icu @@ -137,10 +137,165 @@ pub fn localize_format_string(format: &str, date: JiffDate) -> String { fmt.replace(PERCENT_PLACEHOLDER, "%%") } +/// Abbreviated month names for the current LC_TIME locale as raw bytes, +/// with blanks stripped and uppercased using ASCII case folding. +/// +/// Each entry corresponds to months January (index 0) through December (index 11). +/// Returns `None` for C/POSIX locale (caller should use English defaults). +/// This matches the GNU coreutils approach of storing uppercased, blank-stripped names. +pub fn get_locale_months() -> Option<&'static [Vec; 12]> { + static LOCALE_MONTHS: OnceLock; 12]>> = OnceLock::new(); + + LOCALE_MONTHS + .get_or_init(|| { + if !should_use_icu_locale() { + return None; + } + get_locale_months_inner() + }) + .as_ref() +} + +/// Unix implementation using nl_langinfo for exact match with `locale abmon` output. +#[cfg(any( + target_os = "linux", + target_vendor = "apple", + target_os = "freebsd", + target_os = "netbsd", + target_os = "openbsd", + target_os = "dragonfly" +))] +fn get_locale_months_inner() -> Option<[Vec; 12]> { + use nix::libc; + use std::ffi::CStr; + + let abmon_items: [libc::nl_item; 12] = [ + libc::ABMON_1, + libc::ABMON_2, + libc::ABMON_3, + libc::ABMON_4, + libc::ABMON_5, + libc::ABMON_6, + libc::ABMON_7, + libc::ABMON_8, + libc::ABMON_9, + libc::ABMON_10, + libc::ABMON_11, + libc::ABMON_12, + ]; + + // SAFETY: setlocale and nl_langinfo are standard POSIX functions. + // We call setlocale(LC_TIME, "") to initialize from environment variables, + // then read the abbreviated month names. This is called once (via OnceLock) + // and cached, so the race window with other setlocale callers is minimal. + // The nl_langinfo return pointer is immediately copied below. + unsafe { + libc::setlocale(libc::LC_TIME, c"".as_ptr()); + } + + let mut months: [Vec; 12] = Default::default(); + for (i, &item) in abmon_items.iter().enumerate() { + // SAFETY: nl_langinfo returns a valid C string pointer for valid nl_item values. + let ptr = unsafe { libc::nl_langinfo(item) }; + if ptr.is_null() { + return None; + } + let name = unsafe { CStr::from_ptr(ptr) }.to_bytes(); + if name.is_empty() { + return None; + } + // Strip blanks and uppercase using ASCII case folding, matching GNU behavior + months[i] = name + .iter() + .filter(|&&b| !b.is_ascii_whitespace()) + .map(|&b| b.to_ascii_uppercase()) + .collect(); + } + + Some(months) +} + +/// Non-Unix fallback using ICU DateTimeFormatter. +#[cfg(not(any( + target_os = "linux", + target_vendor = "apple", + target_os = "freebsd", + target_os = "netbsd", + target_os = "openbsd", + target_os = "dragonfly" +)))] +fn get_locale_months_inner() -> Option<[Vec; 12]> { + let (locale, _) = get_time_locale(); + let locale_prefs = locale.clone().into(); + // M::medium() produces abbreviated month names (e.g. "Jan", "Feb") matching + // nl_langinfo(ABMON_*) on Unix. M::short() produces numeric ("1", "2") and + // M::long() produces full names ("January", "February"). + let formatter = DateTimeFormatter::try_new(locale_prefs, fieldsets::M::medium()).ok()?; + + let mut months: [Vec; 12] = Default::default(); + for i in 0..12u8 { + let iso_date = Date::::try_new_iso(2000, i + 1, 1).ok()?; + let formatted = formatter.format(&iso_date).to_string(); + // Strip blanks and uppercase using ASCII case folding + months[i as usize] = formatted + .bytes() + .filter(|b| !b.is_ascii_whitespace()) + .map(|b| b.to_ascii_uppercase()) + .collect(); + } + + Some(months) +} + #[cfg(test)] mod tests { use super::*; + /// Verify that ICU `M::medium()` produces abbreviated month names matching + /// what `nl_langinfo(ABMON_*)` returns on Unix. This is the format used by + /// the non-Unix fallback in `get_locale_months_inner`. + #[test] + fn test_icu_medium_month_produces_abbreviated_names() { + use icu_locale::locale; + + let locale: Locale = locale!("en-US"); + let formatter = DateTimeFormatter::try_new(locale.into(), fieldsets::M::medium()).unwrap(); + + let expected = [ + "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", + ]; + + for (i, exp) in expected.iter().enumerate() { + let iso_date = Date::::try_new_iso(2000, (i + 1) as u8, 1).unwrap(); + let formatted = formatter.format(&iso_date).to_string(); + assert_eq!( + &formatted, + exp, + "M::medium() for month {} should produce abbreviated name", + i + 1 + ); + } + } + + /// Confirm that M::short() gives numeric months and M::long() gives full names, + /// so M::medium() is the only correct choice for abbreviated month names. + #[test] + fn test_icu_short_and_long_month_formats_differ() { + use icu_locale::locale; + + let locale: Locale = locale!("en-US"); + let iso_jan = Date::::try_new_iso(2000, 1, 1).unwrap(); + + let short_fmt = + DateTimeFormatter::try_new(locale.clone().into(), fieldsets::M::short()).unwrap(); + let long_fmt = DateTimeFormatter::try_new(locale.into(), fieldsets::M::long()).unwrap(); + + // M::short() produces numeric ("1"), not "Jan" + assert_eq!(short_fmt.format(&iso_jan).to_string(), "1"); + // M::long() produces full name ("January"), not "Jan" + assert_eq!(long_fmt.format(&iso_jan).to_string(), "January"); + } + #[test] fn test_calendar_type_detection() { use icu_locale::locale; diff --git a/tests/by-util/test_date.rs b/tests/by-util/test_date.rs index db4637e6273..a6139848c08 100644 --- a/tests/by-util/test_date.rs +++ b/tests/by-util/test_date.rs @@ -13,6 +13,8 @@ use regex::Regex; #[cfg(all(unix, not(target_os = "macos")))] use uucore::process::geteuid; use uutests::util::TestScenario; +#[cfg(unix)] +use uutests::util::is_locale_available; use uutests::{at_and_ucmd, new_ucmd, util_name}; #[test] @@ -1840,20 +1842,7 @@ fn test_date_parenthesis_vs_other_special_chars() { fn test_date_iranian_locale_solar_hijri_calendar() { // Test Iranian locale uses Solar Hijri calendar // Verify the Solar Hijri calendar is used in the Iranian locale - use std::process::Command; - - // Check if Iranian locale is available - let locale_check = Command::new("locale") - .env("LC_ALL", "fa_IR.UTF-8") - .arg("charmap") - .output(); - - let locale_available = match locale_check { - Ok(output) => String::from_utf8_lossy(&output.stdout).trim() == "UTF-8", - Err(_) => false, - }; - - if !locale_available { + if !is_locale_available("fa_IR.UTF-8") { println!("Skipping Iranian locale test - fa_IR.UTF-8 locale not available"); return; } @@ -1920,20 +1909,7 @@ fn test_date_iranian_locale_solar_hijri_calendar() { fn test_date_ethiopian_locale_calendar() { // Test Ethiopian locale uses Ethiopian calendar // Verify the Ethiopian calendar is used in the Ethiopian locale - use std::process::Command; - - // Check if Ethiopian locale is available - let locale_check = Command::new("locale") - .env("LC_ALL", "am_ET.UTF-8") - .arg("charmap") - .output(); - - let locale_available = match locale_check { - Ok(output) => String::from_utf8_lossy(&output.stdout).trim() == "UTF-8", - Err(_) => false, - }; - - if !locale_available { + if !is_locale_available("am_ET.UTF-8") { println!("Skipping Ethiopian locale test - am_ET.UTF-8 locale not available"); return; } @@ -2000,20 +1976,7 @@ fn test_date_ethiopian_locale_calendar() { fn test_date_thai_locale_solar_calendar() { // Test Thai locale uses Thai solar calendar // Verify the Thai solar calendar is used with the Thai locale - use std::process::Command; - - // Check if Thai locale is available - let locale_check = Command::new("locale") - .env("LC_ALL", "th_TH.UTF-8") - .arg("charmap") - .output(); - - let locale_available = match locale_check { - Ok(output) => String::from_utf8_lossy(&output.stdout).trim() == "UTF-8", - Err(_) => false, - }; - - if !locale_available { + if !is_locale_available("th_TH.UTF-8") { println!("Skipping Thai locale test - th_TH.UTF-8 locale not available"); return; } diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index d92f6da003b..25d4a143f62 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -3,7 +3,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore (words) ints (linux) NOFILE dfgi +// spell-checker:ignore (words) ints (linux) NOFILE dfgi abmon #![allow(clippy::cast_possible_wrap)] use std::env; @@ -15,6 +15,8 @@ use std::time::Duration; use uutests::at_and_ucmd; use uutests::new_ucmd; use uutests::util::TestScenario; +#[cfg(unix)] +use uutests::util::is_locale_available; fn test_helper(file_name: &str, possible_args: &[&str]) { for args in possible_args { @@ -603,6 +605,134 @@ fn test_month_default2() { } } +/// Query the system for abbreviated month names via `locale abmon`. +/// Returns a vector of 12 month abbreviations in order (Jan..Dec), +/// or None if the command fails or returns unexpected output. +#[cfg(any(target_vendor = "apple", target_os = "openbsd"))] +fn get_system_abmon(locale: &str) -> Option> { + let output = Command::new("locale") + .env("LC_ALL", locale) + .arg("abmon") + .output() + .ok()?; + if !output.status.success() { + return None; + } + let text = String::from_utf8(output.stdout).ok()?; + let months: Vec = text.trim().split(';').map(String::from).collect(); + if months.len() == 12 && months.iter().all(|m| !m.is_empty()) { + Some(months) + } else { + None + } +} + +/// Build shuffled input and sorted expected output from month names. +#[cfg(any(target_vendor = "apple", target_os = "openbsd"))] +fn month_sort_input_expected(months: &[String]) -> (String, String) { + use std::fmt::Write; + // Shuffled order: May, Dec, Jan, Jun, Feb, Mar, Apr, Jul, Aug, Sep, Oct, Nov + let shuffle_order = [4, 11, 0, 5, 1, 2, 3, 6, 7, 8, 9, 10]; + let input = shuffle_order.iter().fold(String::new(), |mut s, &i| { + writeln!(s, "{}", months[i]).unwrap(); + s + }); + let expected = months.iter().fold(String::new(), |mut s, m| { + writeln!(s, "{m}").unwrap(); + s + }); + (input, expected) +} + +#[test] +#[cfg(unix)] +fn test_month_sort_french_locale() { + let locale = "fr_FR.UTF-8"; + if !is_locale_available(locale) { + return; + } + // spell-checker:disable + // On macOS/OpenBSD, abbreviated month names vary across OS versions (different CLDR data), + // so we query the system dynamically. On other platforms, glibc values are stable. + #[cfg(any(target_vendor = "apple", target_os = "openbsd"))] + let (input, expected) = { + let Some(months) = get_system_abmon(locale) else { + return; + }; + month_sort_input_expected(&months) + }; + #[cfg(not(any(target_vendor = "apple", target_os = "openbsd")))] + let (input, expected) = ( + "mai\ndéc.\njanv.\njuin\nfévr.\nmars\navril\njuil.\naoût\nsept.\noct.\nnov.\n".to_string(), + "janv.\nfévr.\nmars\navril\nmai\njuin\njuil.\naoût\nsept.\noct.\nnov.\ndéc.\n".to_string(), + ); + // spell-checker:enable + new_ucmd!() + .env("LC_ALL", locale) + .arg("-M") + .pipe_in(input) + .succeeds() + .stdout_is(expected); +} + +#[test] +#[cfg(unix)] +fn test_month_sort_hungarian_locale() { + let locale = "hu_HU.UTF-8"; + if !is_locale_available(locale) { + return; + } + // spell-checker:disable + #[cfg(any(target_vendor = "apple", target_os = "openbsd"))] + let (input, expected) = { + let Some(months) = get_system_abmon(locale) else { + return; + }; + month_sort_input_expected(&months) + }; + #[cfg(not(any(target_vendor = "apple", target_os = "openbsd")))] + let (input, expected) = ( + "máj\ndec\njan\njún\nfebr\nmárc\nápr\njúl\naug\nszept\nokt\nnov\n".to_string(), + "jan\nfebr\nmárc\nápr\nmáj\njún\njúl\naug\nszept\nokt\nnov\ndec\n".to_string(), + ); + // spell-checker:enable + new_ucmd!() + .env("LC_ALL", locale) + .arg("-M") + .pipe_in(input) + .succeeds() + .stdout_is(expected); +} + +#[test] +#[cfg(unix)] +fn test_month_sort_japanese_locale() { + let locale = "ja_JP.UTF-8"; + if !is_locale_available(locale) { + return; + } + // On OpenBSD, abbreviated month names may differ, so query dynamically. + #[cfg(target_os = "openbsd")] + let (input, expected) = { + let Some(months) = get_system_abmon(locale) else { + return; + }; + month_sort_input_expected(&months) + }; + // Japanese abbreviated months are numeric (1月..12月) on glibc/macOS + #[cfg(not(target_os = "openbsd"))] + let (input, expected) = ( + "5月\n12月\n1月\n6月\n2月\n3月\n4月\n7月\n8月\n9月\n10月\n11月\n".to_string(), + "1月\n2月\n3月\n4月\n5月\n6月\n7月\n8月\n9月\n10月\n11月\n12月\n".to_string(), + ); + new_ucmd!() + .env("LC_ALL", locale) + .arg("-M") + .pipe_in(input) + .succeeds() + .stdout_is(expected); +} + #[test] fn test_default_unsorted_ints2() { let input = "9\n1909888\n000\n1\n2"; diff --git a/tests/uutests/src/lib/util.rs b/tests/uutests/src/lib/util.rs index 16258400710..6e5e9690d24 100644 --- a/tests/uutests/src/lib/util.rs +++ b/tests/uutests/src/lib/util.rs @@ -100,6 +100,19 @@ pub fn is_ci() -> bool { env::var("CI").is_ok_and(|s| s.eq_ignore_ascii_case("true")) } +/// Check if a locale is available on the system by verifying that +/// `locale charmap` returns `"UTF-8"` when `LC_ALL` is set to the given locale. +#[cfg(unix)] +pub fn is_locale_available(locale: &str) -> bool { + use std::process::Command; + Command::new("locale") + .env("LC_ALL", locale) + .arg("charmap") + .output() + .map(|o| String::from_utf8_lossy(&o.stdout).trim() == "UTF-8") + .unwrap_or(false) +} + /// Read a test scenario fixture, returning its bytes fn read_scenario_fixture>(tmpd: Option<&Rc>, file_rel_path: S) -> Vec { let tmpdir_path = tmpd.as_ref().unwrap().as_ref().path();