Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/uu/sort/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ uucore = { workspace = true, features = [
"version-cmp",
"i18n-decimal",
"i18n-collator",
"i18n-datetime",
] }
fluent = { workspace = true }
foldhash = { workspace = true }
Expand All @@ -61,6 +62,7 @@ uucore = { workspace = true, features = [
"parser-size",
"version-cmp",
"i18n-collator",
"i18n-datetime",
] }

[[bin]]
Expand Down
151 changes: 120 additions & 31 deletions src/uu/sort/src/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html

// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef GETFD localeconv foldhash
// spell-checker:ignore (misc) uppercased qsort getmonth juin juil

mod buffer_hint;
mod check;
Expand Down Expand Up @@ -42,13 +43,15 @@ use std::os::unix::ffi::OsStrExt;
use std::path::Path;
use std::path::PathBuf;
use std::str::Utf8Error;
use std::sync::OnceLock;
use thiserror::Error;
use uucore::display::Quotable;
use uucore::error::{FromIo, strip_errno};
use uucore::error::{UError, UResult, USimpleError, UUsageError};
use uucore::extendedbigdecimal::ExtendedBigDecimal;
#[cfg(feature = "i18n-collator")]
use uucore::i18n::collator::{compute_sort_key_utf8, locale_cmp};
use uucore::i18n::datetime::get_locale_months;
use uucore::i18n::decimal::locale_decimal_separator;
use uucore::line_ending::LineEnding;
use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
Expand Down Expand Up @@ -787,26 +790,20 @@ impl<'a> Line<'a> {
}
SortMode::Month => {
let initial_selection = &self.line[selection.clone()];

let mut month_chars = initial_selection
let trimmed = initial_selection
Copy link
Contributor

@cakebaker cakebaker Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name trimmed is a bit misleading as it contains a position (and not a trimmed string).

.iter()
.enumerate()
.skip_while(|(_, c)| c.is_ascii_whitespace());
.position(|c| !c.is_ascii_whitespace())
.unwrap_or(initial_selection.len());

let (parsed, match_len) = month_parse(initial_selection);

let month = if month_parse(initial_selection) == Month::Unknown {
let month = if parsed == Month::Unknown {
// We failed to parse a month, which is equivalent to matching nothing.
// Add the "no match for key" marker to the first non-whitespace character.
let first_non_whitespace = month_chars.next();
first_non_whitespace.map_or(
initial_selection.len()..initial_selection.len(),
|(idx, _)| idx..idx,
)
trimmed..trimmed
} else {
// We parsed a month. Match the first three non-whitespace characters, which must be the month we parsed.
month_chars.next().unwrap().0
..month_chars
.nth(2)
.map_or(initial_selection.len(), |(idx, _)| idx)
// We parsed a month. Use the actual match byte length.
trimmed..(trimmed + match_len)
Comment on lines +803 to +806
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might make sense to set selection.start and selection.end directly in the if/else blocks instead of returning a range.

};

// Shorten selection to month.
Expand Down Expand Up @@ -3017,30 +3014,122 @@ enum Month {
December,
}

/// Locale month lookup table, built once and then shared.
/// Every entry pairs an uppercased, blank-stripped month name with its [`Month`] value.
type MonthTable = Vec<(Vec<u8>, Month)>;

/// Returns the process-wide month lookup table for the current locale.
///
/// The table is built on first use from [`get_locale_months`], pairing the
/// twelve names (January first) with the matching [`Month`] variants, and is
/// cached for all later calls. Yields `None` when [`get_locale_months`] has no
/// names to offer.
fn get_locale_month_table() -> Option<&'static MonthTable> {
    /// Month variants in the same order as the names from `get_locale_months`.
    const CALENDAR_ORDER: [Month; 12] = [
        Month::January,
        Month::February,
        Month::March,
        Month::April,
        Month::May,
        Month::June,
        Month::July,
        Month::August,
        Month::September,
        Month::October,
        Month::November,
        Month::December,
    ];

    static TABLE: OnceLock<Option<MonthTable>> = OnceLock::new();

    let cached = TABLE.get_or_init(|| {
        let names = get_locale_months()?;
        let pairs = names
            .iter()
            .cloned()
            .zip(CALENDAR_ORDER.iter().copied())
            .collect();
        Some(pairs)
    });

    cached.as_ref()
}

/// Parse the beginning string into a Month, returning [`Month::Unknown`] on errors.
fn month_parse(line: &[u8]) -> Month {
/// Also returns the byte length consumed from the input (after leading blanks).
fn month_parse(line: &[u8]) -> (Month, usize) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this function is not entirely correct.

Here is an example where I get a different output compared to GNU sort:

$ printf "juin\nav   ril\nmars\nfevr." | LC_ALL=fr_FR.UTF-8 cargo run -q sort -M
fevr.
mars
av   ril
juin
$ printf "juin\nav   ril\nmars\nfevr." | LC_ALL=fr_FR.UTF-8 sort -M
av   ril
fevr.
mars
juin

let line = line.trim_ascii_start();

// Try locale-specific month names by scanning all entries.
// We track the longest match to handle cases where one month name is a
// prefix of another (e.g., "juin" vs "juil." in French).
if let Some(table) = get_locale_month_table() {
let mut best_month = Month::Unknown;
let mut best_consumed: usize = 0;

for &(ref name, month) in table {
let mut m_iter = line.iter();
let mut n_iter = name.iter();
let mut consumed = 0;
let mut matched = true;

loop {
match n_iter.next() {
None => {
// Matched the entire month name
break;
}
Some(&n_byte) => {
// Skip blanks in input (matching GNU behavior)
let m_byte = loop {
match m_iter.next() {
Some(&b) if b.is_ascii_whitespace() => {
consumed += 1;
}
Some(&b) => {
consumed += 1;
break b.to_ascii_uppercase();
}
None => {
// Input exhausted before month name ended.
break 0;
}
}
};

if m_byte != n_byte {
matched = false;
break;
}
}
}
}

if matched && consumed > best_consumed {
best_month = month;
best_consumed = consumed;
}
}

if best_month != Month::Unknown {
return (best_month, best_consumed);
}
}

// Fall back to English 3-letter abbreviations
match line.get(..3).map(<[u8]>::to_ascii_uppercase).as_deref() {
Some(b"JAN") => Month::January,
Some(b"FEB") => Month::February,
Some(b"MAR") => Month::March,
Some(b"APR") => Month::April,
Some(b"MAY") => Month::May,
Some(b"JUN") => Month::June,
Some(b"JUL") => Month::July,
Some(b"AUG") => Month::August,
Some(b"SEP") => Month::September,
Some(b"OCT") => Month::October,
Some(b"NOV") => Month::November,
Some(b"DEC") => Month::December,
_ => Month::Unknown,
Some(b"JAN") => (Month::January, 3),
Some(b"FEB") => (Month::February, 3),
Some(b"MAR") => (Month::March, 3),
Some(b"APR") => (Month::April, 3),
Some(b"MAY") => (Month::May, 3),
Some(b"JUN") => (Month::June, 3),
Some(b"JUL") => (Month::July, 3),
Some(b"AUG") => (Month::August, 3),
Some(b"SEP") => (Month::September, 3),
Some(b"OCT") => (Month::October, 3),
Some(b"NOV") => (Month::November, 3),
Some(b"DEC") => (Month::December, 3),
_ => (Month::Unknown, 0),
}
}

/// Compare two keys by the month each one starts with.
///
/// Both inputs are parsed with [`month_parse`]; only the resulting [`Month`]
/// values take part in the comparison, the matched byte lengths are ignored.
fn month_compare(a: &[u8], b: &[u8]) -> Ordering {
    // Parse each key exactly once and discard the consumed length.
    let (month_a, _) = month_parse(a);
    let (month_b, _) = month_parse(b);

    month_a.cmp(&month_b)
}
Expand Down
157 changes: 156 additions & 1 deletion src/uucore/src/lib/features/i18n/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

// spell-checker:ignore fieldsets prefs febr
// spell-checker:ignore fieldsets prefs febr abmon langinfo uppercased

//! Locale-aware datetime formatting utilities using ICU and jiff-icu

Expand Down Expand Up @@ -137,10 +137,165 @@ pub fn localize_format_string(format: &str, date: JiffDate) -> String {
fmt.replace(PERCENT_PLACEHOLDER, "%%")
}

/// Abbreviated month names for the current LC_TIME locale as raw bytes,
/// with blanks stripped and uppercased using ASCII case folding.
///
/// Each entry corresponds to months January (index 0) through December (index 11).
/// Returns `None` for C/POSIX locale (caller should use English defaults).
/// This matches the GNU coreutils approach of storing uppercased, blank-stripped names.
pub fn get_locale_months() -> Option<&'static [Vec<u8>; 12]> {
static LOCALE_MONTHS: OnceLock<Option<[Vec<u8>; 12]>> = OnceLock::new();

LOCALE_MONTHS
.get_or_init(|| {
if !should_use_icu_locale() {
return None;
}
get_locale_months_inner()
})
.as_ref()
}

/// Unix implementation using nl_langinfo for exact match with `locale abmon` output.
///
/// Returns `None` if any of the twelve abbreviated month names is missing
/// (null pointer) or empty.
#[cfg(any(
    target_os = "linux",
    target_vendor = "apple",
    target_os = "freebsd",
    target_os = "netbsd",
    target_os = "openbsd",
    target_os = "dragonfly"
))]
fn get_locale_months_inner() -> Option<[Vec<u8>; 12]> {
    use nix::libc;
    use std::ffi::CStr;

    // `nl_langinfo` keys for the abbreviated month names, January first.
    const ABMON_ITEMS: [libc::nl_item; 12] = [
        libc::ABMON_1,
        libc::ABMON_2,
        libc::ABMON_3,
        libc::ABMON_4,
        libc::ABMON_5,
        libc::ABMON_6,
        libc::ABMON_7,
        libc::ABMON_8,
        libc::ABMON_9,
        libc::ABMON_10,
        libc::ABMON_11,
        libc::ABMON_12,
    ];

    // SAFETY: setlocale is a standard POSIX function and is handed a valid,
    // NUL-terminated empty string, which makes it initialize LC_TIME from the
    // environment variables. The caller caches the result (via OnceLock), so
    // this runs once and the race window with other setlocale callers is minimal.
    unsafe {
        libc::setlocale(libc::LC_TIME, c"".as_ptr());
    }

    let mut folded: [Vec<u8>; 12] = Default::default();
    for (slot, item) in folded.iter_mut().zip(ABMON_ITEMS) {
        // SAFETY: nl_langinfo returns a valid C string pointer for valid nl_item values.
        let raw = unsafe { libc::nl_langinfo(item) };
        if raw.is_null() {
            return None;
        }

        // SAFETY: `raw` was checked to be non-null above, and the bytes are
        // copied out below before nl_langinfo is called again.
        let bytes = unsafe { CStr::from_ptr(raw) }.to_bytes();
        if bytes.is_empty() {
            return None;
        }

        // Strip blanks and uppercase using ASCII case folding, matching GNU behavior
        *slot = bytes
            .iter()
            .filter(|byte| !byte.is_ascii_whitespace())
            .map(u8::to_ascii_uppercase)
            .collect();
    }

    Some(folded)
}

/// Non-Unix fallback using ICU DateTimeFormatter.
///
/// Formats the first day of every month of the year 2000 and keeps the
/// blank-stripped, ASCII-uppercased bytes of each result. Returns `None` if
/// the formatter or any of the dates cannot be constructed.
#[cfg(not(any(
    target_os = "linux",
    target_vendor = "apple",
    target_os = "freebsd",
    target_os = "netbsd",
    target_os = "openbsd",
    target_os = "dragonfly"
)))]
fn get_locale_months_inner() -> Option<[Vec<u8>; 12]> {
    let (locale, _) = get_time_locale();

    // M::medium() produces abbreviated month names (e.g. "Jan", "Feb") matching
    // nl_langinfo(ABMON_*) on Unix. M::short() produces numeric ("1", "2") and
    // M::long() produces full names ("January", "February").
    let formatter =
        DateTimeFormatter::try_new(locale.clone().into(), fieldsets::M::medium()).ok()?;

    let mut names: [Vec<u8>; 12] = Default::default();
    for (month_number, slot) in (1u8..=12).zip(names.iter_mut()) {
        let first_of_month = Date::<Iso>::try_new_iso(2000, month_number, 1).ok()?;
        let rendered = formatter.format(&first_of_month).to_string();

        // Strip blanks and uppercase using ASCII case folding
        *slot = rendered
            .bytes()
            .filter(|byte| !byte.is_ascii_whitespace())
            .map(|byte| byte.to_ascii_uppercase())
            .collect();
    }

    Some(names)
}

#[cfg(test)]
mod tests {
use super::*;

/// Check that ICU's `M::medium()` field set renders abbreviated month names,
/// the same form `nl_langinfo(ABMON_*)` returns on Unix. The non-Unix
/// fallback in `get_locale_months_inner` relies on this format.
#[test]
fn test_icu_medium_month_produces_abbreviated_names() {
    use icu_locale::locale;

    const ABBREVIATED: [&str; 12] = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ];

    let en_us: Locale = locale!("en-US");
    let medium = DateTimeFormatter::try_new(en_us.into(), fieldsets::M::medium()).unwrap();

    for (month_number, want) in (1u8..=12).zip(ABBREVIATED) {
        let first_of_month = Date::<Iso>::try_new_iso(2000, month_number, 1).unwrap();
        let got = medium.format(&first_of_month).to_string();
        assert_eq!(
            got, want,
            "M::medium() for month {} should produce abbreviated name",
            month_number
        );
    }
}

/// Show that `M::short()` renders a numeric month and `M::long()` renders the
/// full name, which leaves `M::medium()` as the only field set that yields
/// abbreviated month names.
#[test]
fn test_icu_short_and_long_month_formats_differ() {
    use icu_locale::locale;

    let en_us: Locale = locale!("en-US");
    let january = Date::<Iso>::try_new_iso(2000, 1, 1).unwrap();

    let short_text = DateTimeFormatter::try_new(en_us.clone().into(), fieldsets::M::short())
        .unwrap()
        .format(&january)
        .to_string();
    let long_text = DateTimeFormatter::try_new(en_us.into(), fieldsets::M::long())
        .unwrap()
        .format(&january)
        .to_string();

    // M::short() produces numeric ("1"), not "Jan"
    assert_eq!(short_text, "1");
    // M::long() produces full name ("January"), not "Jan"
    assert_eq!(long_text, "January");
}

#[test]
fn test_calendar_type_detection() {
use icu_locale::locale;
Expand Down
Loading
Loading