From c8422bdb8bd8f535680d38f7dcef76a2026b83be Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sun, 26 Apr 2026 12:35:48 +0200 Subject: [PATCH 1/4] ls: add ignored regression test for LC_COLLATE display issue (#12011) --- tests/by-util/test_ls.rs | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/by-util/test_ls.rs b/tests/by-util/test_ls.rs index 52e639715a4..69f324914eb 100644 --- a/tests/by-util/test_ls.rs +++ b/tests/by-util/test_ls.rs @@ -3921,6 +3921,39 @@ fn test_ls_quoting_style_arg_overrides_env_var() { } } +// Regression test for https://github.com/uutils/coreutils/issues/12011: +// LC_COLLATE governs sort order, not the encoding used to classify filename +// bytes for display (that is LC_CTYPE). With a UTF-8 LC_CTYPE/LANG and +// LC_COLLATE=C, a UTF-8 filename must still be printed as-is. +#[test] +#[cfg(unix)] +#[ignore = "https://github.com/uutils/coreutils/issues/12011"] +fn test_ls_lc_collate_does_not_affect_display() { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + at.touch("tést"); + + // Empty LC_ALL overrides the test harness default of LC_ALL=C so the + // LC_CTYPE / LANG fallback is what governs character classification. + scene + .ucmd() + .env("LC_ALL", "") + .env("LANG", "en_US.UTF-8") + .env("LC_COLLATE", "C") + .arg("--quoting-style=shell-escape") + .succeeds() + .stdout_only("tést\n"); + + scene + .ucmd() + .env("LC_ALL", "") + .env("LANG", "en_US.UTF-8") + .env("LC_COLLATE", "C") + .arg("-b") + .succeeds() + .stdout_only("tést\n"); +} + #[test] fn test_ls_quoting_and_color() { let scene = TestScenario::new(util_name!()); From de0708176af96d21a3db37e4e867039ac73f91ec Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sun, 26 Apr 2026 12:37:22 +0200 Subject: [PATCH 2/4] ls: use LC_CTYPE (not LC_COLLATE) to determine display encoding LC_COLLATE governs sort order, not character classification. Using it to decide whether filename bytes are printable caused UTF-8 names to be escaped whenever LC_COLLATE=C, even with a UTF-8 LC_CTYPE/LANG. Fixes https://github.com/uutils/coreutils/issues/12011 --- src/uucore/src/lib/features/i18n/mod.rs | 17 +++++++++++++++-- tests/by-util/test_ls.rs | 1 - 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/uucore/src/lib/features/i18n/mod.rs b/src/uucore/src/lib/features/i18n/mod.rs index 282baf2e768..6bec1c14758 100644 --- a/src/uucore/src/lib/features/i18n/mod.rs +++ b/src/uucore/src/lib/features/i18n/mod.rs @@ -34,11 +34,14 @@ const DEFAULT_LOCALE: Locale = locale!("und"); /// 2. `locale_name` /// 3. LANG /// +/// Per POSIX, an empty value means "unset" for locale category resolution, +/// so we skip empty values and fall through to the next variable. +/// /// Or fallback on Posix locale, with ASCII encoding. pub fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) { let locale_var = ["LC_ALL", locale_name, "LANG"] .iter() - .find_map(|&key| std::env::var(key).ok()); + .find_map(|&key| std::env::var(key).ok().filter(|v| !v.is_empty())); if let Some(locale_var_str) = locale_var { let mut split = locale_var_str.split(&['.', '@']); @@ -81,6 +84,13 @@ pub fn get_collating_locale() -> &'static (Locale, UEncoding) { COLLATING_LOCALE.get_or_init(|| get_locale_from_env("LC_COLLATE")) } +/// Get the character-classification locale from the environment. +pub fn get_ctype_locale() -> &'static (Locale, UEncoding) { + static CTYPE_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new(); + + CTYPE_LOCALE.get_or_init(|| get_locale_from_env("LC_CTYPE")) +} + /// Get the numeric locale from the environment pub fn get_numeric_locale() -> &'static (Locale, UEncoding) { static NUMERIC_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new(); @@ -89,6 +99,9 @@ pub fn get_numeric_locale() -> &'static (Locale, UEncoding) { } /// Return the encoding deduced from the locale environment variable. +/// +/// Character classification (used to decide whether bytes are printable) is +/// governed by LC_CTYPE, not LC_COLLATE. pub fn get_locale_encoding() -> UEncoding { - get_collating_locale().1 + get_ctype_locale().1 } diff --git a/tests/by-util/test_ls.rs b/tests/by-util/test_ls.rs index 69f324914eb..aee3477bc7a 100644 --- a/tests/by-util/test_ls.rs +++ b/tests/by-util/test_ls.rs @@ -3927,7 +3927,6 @@ fn test_ls_quoting_style_arg_overrides_env_var() { // LC_COLLATE=C, a UTF-8 filename must still be printed as-is. #[test] #[cfg(unix)] -#[ignore = "https://github.com/uutils/coreutils/issues/12011"] fn test_ls_lc_collate_does_not_affect_display() { let scene = TestScenario::new(util_name!()); let at = &scene.fixtures; From b2292dc1ec8b3c931fc6d728d2e7ff8d9e5db528 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sun, 26 Apr 2026 13:36:38 +0200 Subject: [PATCH 3/4] sort: use LC_COLLATE encoding (not LC_CTYPE) to gate collator init init_locale_collation() went through get_locale_encoding(), which now reads LC_CTYPE. Collation must follow LC_COLLATE, so read the encoding off get_collating_locale() directly. Otherwise LC_COLLATE=fr_FR.UTF-8 combined with LC_CTYPE=C would skip the ICU collator and fall back to byte comparison. --- .../cspell.dictionaries/jargon.wordlist.txt | 2 ++ src/uucore/src/lib/features/i18n/collator.rs | 8 ++++--- tests/by-util/test_ls.rs | 2 +- tests/by-util/test_sort.rs | 21 +++++++++++++++++++ 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt index c468ef0ea74..877d342db7e 100644 --- a/.vscode/cspell.dictionaries/jargon.wordlist.txt +++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt @@ -254,3 +254,5 @@ Hijri Nowruz charmap hijri + +CTYPE diff --git a/src/uucore/src/lib/features/i18n/collator.rs b/src/uucore/src/lib/features/i18n/collator.rs index 9a8ee1fc649..1c3551ad455 100644 --- a/src/uucore/src/lib/features/i18n/collator.rs +++ b/src/uucore/src/lib/features/i18n/collator.rs @@ -59,10 +59,12 @@ pub fn should_use_locale_collation() -> bool { /// } /// ``` pub fn init_locale_collation() -> bool { - use crate::i18n::{UEncoding, get_locale_encoding}; + use crate::i18n::UEncoding; - // Check if we need locale-aware collation - if get_locale_encoding() != UEncoding::Utf8 { + // Check if we need locale-aware collation. Collation is governed by + // LC_COLLATE, not LC_CTYPE, so read the encoding off the collating locale + // directly instead of going through get_locale_encoding(). + if get_collating_locale().1 != UEncoding::Utf8 { // C/POSIX locale - no collator needed return false; } diff --git a/tests/by-util/test_ls.rs b/tests/by-util/test_ls.rs index aee3477bc7a..325454e1f19 100644 --- a/tests/by-util/test_ls.rs +++ b/tests/by-util/test_ls.rs @@ -3,7 +3,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. // spell-checker:ignore (words) READMECAREFULLY birthtime doesntexist oneline somebackup lrwx somefile somegroup somehiddenbackup somehiddenfile tabsize aaaaaaaa bbbb cccc dddddddd ncccc neee naaaaa nbcdef nfffff dired subdired tmpfs mdir COLORTERM mexe bcdef mfoo timefile -// spell-checker:ignore (words) fakeroot setcap drwxr bcdlps mdangling mentry awith acolons NOFILE NOTCAPABLE +// spell-checker:ignore (words) fakeroot setcap drwxr bcdlps mdangling mentry awith acolons NOFILE NOTCAPABLE tést #![allow( clippy::similar_names, clippy::too_many_lines, diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index b0e7602fd7f..7bb1ee963f7 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -790,6 +790,27 @@ fn test_month_sort_japanese_locale() { .stdout_is(expected); } +// Regression test: collation must follow LC_COLLATE, not LC_CTYPE. With a +// UTF-8 LC_COLLATE and LC_CTYPE=C, the ICU collator should still be +// initialized so accented characters sort near their base letters instead of +// by raw byte value. +#[test] +#[cfg(unix)] +fn test_sort_lc_collate_independent_of_lc_ctype() { + let locale = "fr_FR.UTF-8"; + if !is_locale_available(locale) { + return; + } + new_ucmd!() + .env("LC_ALL", "") + .env("LANG", "C") + .env("LC_COLLATE", locale) + .env("LC_CTYPE", "C") + .pipe_in("z\né\n") + .succeeds() + .stdout_only("é\nz\n"); +} + #[test] fn test_default_unsorted_ints2() { let input = "9\n1909888\n000\n1\n2"; From 44f7c3fd49d15b1d65793022e27aac38b386312e Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sun, 26 Apr 2026 21:06:58 +0200 Subject: [PATCH 4/4] ls,sort: cover both LC_CTYPE/LC_COLLATE asymmetry directions Add the explicit LC_CTYPE=UTF-8 LC_COLLATE=C reproducer from the PR review (Japanese filename for ls; byte-order sort) so both the display fix and the collator-init fix have direct regression coverage. --- tests/by-util/test_ls.rs | 12 ++++++++++++ tests/by-util/test_sort.rs | 13 +++++++++++++ 2 files changed, 25 insertions(+) diff --git a/tests/by-util/test_ls.rs b/tests/by-util/test_ls.rs index 325454e1f19..94b058b4083 100644 --- a/tests/by-util/test_ls.rs +++ b/tests/by-util/test_ls.rs @@ -3951,6 +3951,18 @@ fn test_ls_lc_collate_does_not_affect_display() { .arg("-b") .succeeds() .stdout_only("tést\n"); + + // Reproducer from https://github.com/uutils/coreutils/pull/9303#issuecomment-4322263665: + // explicit LC_CTYPE=UTF-8 with LC_COLLATE=C must still print UTF-8 names as-is. + at.touch("あいうえお"); + scene + .ucmd() + .env("LC_ALL", "") + .env("LC_CTYPE", "en_US.UTF-8") + .env("LC_COLLATE", "C") + .arg("あいうえお") + .succeeds() + .stdout_only("あいうえお\n"); } #[test] diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index 7bb1ee963f7..9eaadc1871d 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -809,6 +809,19 @@ fn test_sort_lc_collate_independent_of_lc_ctype() { .pipe_in("z\né\n") .succeeds() .stdout_only("é\nz\n"); + + // Inverse direction, from + // https://github.com/uutils/coreutils/pull/9303#issuecomment-4322263665: + // LC_CTYPE=UTF-8 with LC_COLLATE=C must produce byte-wise ordering, not + // UTF-8 collation. Byte order: B (0x42) < a (0x61) < z (0x7a) < é (0xc3a9). + new_ucmd!() + .env("LC_ALL", "") + .env("LANG", "C") + .env("LC_CTYPE", locale) + .env("LC_COLLATE", "C") + .pipe_in("z\né\na\nB\n") + .succeeds() + .stdout_only("B\na\nz\né\n"); } #[test]