diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt index c468ef0ea74..877d342db7e 100644 --- a/.vscode/cspell.dictionaries/jargon.wordlist.txt +++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt @@ -254,3 +254,5 @@ Hijri Nowruz charmap hijri + +CTYPE diff --git a/src/uucore/src/lib/features/i18n/collator.rs b/src/uucore/src/lib/features/i18n/collator.rs index 9a8ee1fc649..1c3551ad455 100644 --- a/src/uucore/src/lib/features/i18n/collator.rs +++ b/src/uucore/src/lib/features/i18n/collator.rs @@ -59,10 +59,12 @@ pub fn should_use_locale_collation() -> bool { /// } /// ``` pub fn init_locale_collation() -> bool { - use crate::i18n::{UEncoding, get_locale_encoding}; + use crate::i18n::UEncoding; - // Check if we need locale-aware collation - if get_locale_encoding() != UEncoding::Utf8 { + // Only a UTF-8 collating locale needs a collator. Sort order is governed by + // LC_COLLATE, not LC_CTYPE, so read the encoding off the collating locale + // directly instead of going through get_locale_encoding(). + if get_collating_locale().1 != UEncoding::Utf8 { // C/POSIX locale - no collator needed return false; } diff --git a/src/uucore/src/lib/features/i18n/mod.rs b/src/uucore/src/lib/features/i18n/mod.rs index 282baf2e768..6bec1c14758 100644 --- a/src/uucore/src/lib/features/i18n/mod.rs +++ b/src/uucore/src/lib/features/i18n/mod.rs @@ -34,11 +34,14 @@ const DEFAULT_LOCALE: Locale = locale!("und"); /// 2. `locale_name` /// 3. LANG /// +/// Per POSIX, an empty value counts as unset during locale category resolution, +/// so empty variables are skipped and the lookup falls through to the next one. +/// /// Or fallback on Posix locale, with ASCII encoding. 
pub fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) { let locale_var = ["LC_ALL", locale_name, "LANG"] .iter() - .find_map(|&key| std::env::var(key).ok()); + .find_map(|&key| std::env::var(key).ok().filter(|v| !v.is_empty())); if let Some(locale_var_str) = locale_var { let mut split = locale_var_str.split(&['.', '@']); @@ -81,6 +84,13 @@ pub fn get_collating_locale() -> &'static (Locale, UEncoding) { COLLATING_LOCALE.get_or_init(|| get_locale_from_env("LC_COLLATE")) } +/// Get the character-classification (`LC_CTYPE`) locale from the environment, computed once. +pub fn get_ctype_locale() -> &'static (Locale, UEncoding) { + static CTYPE_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new(); + + CTYPE_LOCALE.get_or_init(|| get_locale_from_env("LC_CTYPE")) +} + /// Get the numeric locale from the environment pub fn get_numeric_locale() -> &'static (Locale, UEncoding) { static NUMERIC_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new(); @@ -89,6 +99,9 @@ pub fn get_numeric_locale() -> &'static (Locale, UEncoding) { } /// Return the encoding deduced from the locale environment variable. +/// +/// The encoding comes from the `LC_CTYPE` locale: character classification (used to decide +/// whether bytes are printable) is governed by LC_CTYPE, not LC_COLLATE. pub fn get_locale_encoding() -> UEncoding { - get_collating_locale().1 + get_ctype_locale().1 } diff --git a/tests/by-util/test_ls.rs b/tests/by-util/test_ls.rs index 52e639715a4..94b058b4083 100644 --- a/tests/by-util/test_ls.rs +++ b/tests/by-util/test_ls.rs @@ -3,7 +3,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. 
// spell-checker:ignore (words) READMECAREFULLY birthtime doesntexist oneline somebackup lrwx somefile somegroup somehiddenbackup somehiddenfile tabsize aaaaaaaa bbbb cccc dddddddd ncccc neee naaaaa nbcdef nfffff dired subdired tmpfs mdir COLORTERM mexe bcdef mfoo timefile -// spell-checker:ignore (words) fakeroot setcap drwxr bcdlps mdangling mentry awith acolons NOFILE NOTCAPABLE +// spell-checker:ignore (words) fakeroot setcap drwxr bcdlps mdangling mentry awith acolons NOFILE NOTCAPABLE tést #![allow( clippy::similar_names, clippy::too_many_lines, @@ -3921,6 +3921,50 @@ fn test_ls_quoting_style_arg_overrides_env_var() { } } +// Regression test for https://github.com/uutils/coreutils/issues/12011: +// LC_COLLATE governs sort order, not the encoding used to classify filename +// bytes for display (that is LC_CTYPE). With a UTF-8 LC_CTYPE/LANG and +// LC_COLLATE=C, a UTF-8 filename must still be printed as-is. +#[test] +#[cfg(unix)] +fn test_ls_lc_collate_does_not_affect_display() { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + at.touch("tést"); + + // Empty LC_ALL overrides the test harness default of LC_ALL=C so the + // LC_CTYPE / LANG fallback is what governs character classification. + scene + .ucmd() + .env("LC_ALL", "") + .env("LANG", "en_US.UTF-8") + .env("LC_COLLATE", "C") + .arg("--quoting-style=shell-escape") + .succeeds() + .stdout_only("tést\n"); + + scene + .ucmd() + .env("LC_ALL", "") + .env("LANG", "en_US.UTF-8") + .env("LC_COLLATE", "C") + .arg("-b") + .succeeds() + .stdout_only("tést\n"); + + // Reproducer from https://github.com/uutils/coreutils/pull/9303#issuecomment-4322263665: + // an explicit UTF-8 LC_CTYPE with LC_COLLATE=C must still print UTF-8 names as-is. 
+ at.touch("あいうえお"); + scene + .ucmd() + .env("LC_ALL", "") + .env("LC_CTYPE", "en_US.UTF-8") + .env("LC_COLLATE", "C") + .arg("あいうえお") + .succeeds() + .stdout_only("あいうえお\n"); +} + #[test] fn test_ls_quoting_and_color() { let scene = TestScenario::new(util_name!()); diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index b0e7602fd7f..9eaadc1871d 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -790,6 +790,40 @@ fn test_month_sort_japanese_locale() { .stdout_is(expected); } +// Regression test: collation must follow LC_COLLATE, not LC_CTYPE. With a UTF-8 +// LC_COLLATE and LC_CTYPE=C, the ICU collator should still be initialized so accented +// characters sort near their base letters instead of by raw byte value. The test returns +// early without asserting anything when the fr_FR.UTF-8 locale is unavailable. +#[test] +#[cfg(unix)] +fn test_sort_lc_collate_independent_of_lc_ctype() { + let locale = "fr_FR.UTF-8"; + if !is_locale_available(locale) { + return; + } + new_ucmd!() + .env("LC_ALL", "") + .env("LANG", "C") + .env("LC_COLLATE", locale) + .env("LC_CTYPE", "C") + .pipe_in("z\né\n") + .succeeds() + .stdout_only("é\nz\n"); + + // Inverse direction, from + // https://github.com/uutils/coreutils/pull/9303#issuecomment-4322263665: + // a UTF-8 LC_CTYPE with LC_COLLATE=C must produce byte-wise ordering, not + // UTF-8 collation. Byte order: B (0x42) < a (0x61) < z (0x7a) < é (0xc3 0xa9). + new_ucmd!() + .env("LC_ALL", "") + .env("LANG", "C") + .env("LC_CTYPE", locale) + .env("LC_COLLATE", "C") + .pipe_in("z\né\na\nB\n") + .succeeds() + .stdout_only("B\na\nz\né\n"); +} + #[test] fn test_default_unsorted_ints2() { let input = "9\n1909888\n000\n1\n2";