uutils · sylvestre · Apr 26, 2026 · Apr 26, 2026 · Apr 26, 2026 · Apr 26, 2026
diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt
@@ -254,3 +254,5 @@ Hijri
 Nowruz
 charmap
 hijri
+
+CTYPE
diff --git a/src/uucore/src/lib/features/i18n/collator.rs b/src/uucore/src/lib/features/i18n/collator.rs
@@ -59,10 +59,12 @@ pub fn should_use_locale_collation() -> bool {
 /// }
 /// ```
 pub fn init_locale_collation() -> bool {
-    use crate::i18n::{UEncoding, get_locale_encoding};
+    use crate::i18n::UEncoding;
 
-    // Check if we need locale-aware collation
-    if get_locale_encoding() != UEncoding::Utf8 {
+    // Check if we need locale-aware collation. Collation is governed by
+    // LC_COLLATE, not LC_CTYPE, so read the encoding off the collating locale
+    // directly instead of going through get_locale_encoding().
+    if get_collating_locale().1 != UEncoding::Utf8 {
         // C/POSIX locale - no collator needed
         return false;
     }

diff --git a/src/uucore/src/lib/features/i18n/mod.rs b/src/uucore/src/lib/features/i18n/mod.rs
@@ -34,11 +34,14 @@ const DEFAULT_LOCALE: Locale = locale!("und");
 /// 2. `locale_name`
 /// 3. LANG
 ///
+/// Per POSIX, an empty value means "unset" for locale category resolution,
+/// so we skip empty values and fall through to the next variable.
+///
 /// Or fallback on Posix locale, with ASCII encoding.
 pub fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) {
     let locale_var = ["LC_ALL", locale_name, "LANG"]
         .iter()
-        .find_map(|&key| std::env::var(key).ok());
+        .find_map(|&key| std::env::var(key).ok().filter(|v| !v.is_empty()));
 
     if let Some(locale_var_str) = locale_var {
         let mut split = locale_var_str.split(&['.', '@']);
@@ -81,6 +84,13 @@ pub fn get_collating_locale() -> &'static (Locale, UEncoding) {
     COLLATING_LOCALE.get_or_init(|| get_locale_from_env("LC_COLLATE"))
 }
 
+/// Get the character-classification (`LC_CTYPE`) locale from the environment, computed once.
+pub fn get_ctype_locale() -> &'static (Locale, UEncoding) {
+    static CTYPE_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new();
+
+    CTYPE_LOCALE.get_or_init(|| get_locale_from_env("LC_CTYPE"))
+}
+
 /// Get the numeric locale from the environment
 pub fn get_numeric_locale() -> &'static (Locale, UEncoding) {
     static NUMERIC_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new();
@@ -89,6 +99,9 @@ pub fn get_numeric_locale() -> &'static (Locale, UEncoding) {
 }
 
 /// Return the encoding deduced from the locale environment variable.
+///
+/// The encoding comes from the `LC_CTYPE` locale (see [`get_ctype_locale`]): character
+/// classification (is a byte printable?) is governed by LC_CTYPE, not LC_COLLATE.
 pub fn get_locale_encoding() -> UEncoding {
-    get_collating_locale().1
+    get_ctype_locale().1
 }
diff --git a/tests/by-util/test_ls.rs b/tests/by-util/test_ls.rs
@@ -3,7 +3,7 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 // spell-checker:ignore (words) READMECAREFULLY birthtime doesntexist oneline somebackup lrwx somefile somegroup somehiddenbackup somehiddenfile tabsize aaaaaaaa bbbb cccc dddddddd ncccc neee naaaaa nbcdef nfffff dired subdired tmpfs mdir COLORTERM mexe bcdef mfoo timefile
-// spell-checker:ignore (words) fakeroot setcap drwxr bcdlps mdangling mentry awith acolons NOFILE NOTCAPABLE
+// spell-checker:ignore (words) fakeroot setcap drwxr bcdlps mdangling mentry awith acolons NOFILE NOTCAPABLE tést
 #![allow(
     clippy::similar_names,
     clippy::too_many_lines,
@@ -3921,6 +3921,50 @@ fn test_ls_quoting_style_arg_overrides_env_var() {
     }
 }
 
+// Regression test for https://github.com/uutils/coreutils/issues/12011:
+// LC_COLLATE governs sort order, not the encoding used to classify filename
+// bytes for display (that is LC_CTYPE). With a UTF-8 LC_CTYPE/LANG and
+// LC_COLLATE=C, a UTF-8 filename must still be printed as-is.
+#[test]
+#[cfg(unix)]
+fn test_ls_lc_collate_does_not_affect_display() {
+    let scene = TestScenario::new(util_name!());
+    let at = &scene.fixtures;
+    at.touch("tést");
+
+    // An empty LC_ALL is treated as unset, which cancels the test harness default of
+    // LC_ALL=C and lets the LC_CTYPE / LANG fallback govern character classification.
+    scene
+        .ucmd()
+        .env("LC_ALL", "")
+        .env("LANG", "en_US.UTF-8")
+        .env("LC_COLLATE", "C")
+        .arg("--quoting-style=shell-escape")
+        .succeeds()
+        .stdout_only("tést\n");
+
+    scene
+        .ucmd()
+        .env("LC_ALL", "")
+        .env("LANG", "en_US.UTF-8")
+        .env("LC_COLLATE", "C")
+        .arg("-b")
+        .succeeds()
+        .stdout_only("tést\n");
+
+    // Reproducer from https://github.com/uutils/coreutils/pull/9303#issuecomment-4322263665:
+    // an explicit LC_CTYPE=en_US.UTF-8 with LC_COLLATE=C must still print UTF-8 names as-is.
+    at.touch("あいうえお");
+    scene
+        .ucmd()
+        .env("LC_ALL", "")
+        .env("LC_CTYPE", "en_US.UTF-8")
+        .env("LC_COLLATE", "C")
+        .arg("あいうえお")
+        .succeeds()
+        .stdout_only("あいうえお\n");
+}
+
 #[test]
 fn test_ls_quoting_and_color() {
     let scene = TestScenario::new(util_name!());

diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs
@@ -790,6 +790,40 @@ fn test_month_sort_japanese_locale() {
         .stdout_is(expected);
 }
 
+// Regression test: collation must follow LC_COLLATE, not LC_CTYPE. With a UTF-8
+// LC_COLLATE and LC_CTYPE=C, the ICU collator should still be initialized so accented
+// characters sort near their base letters instead of by raw byte value. The whole test
+// is a silent no-op when the fr_FR.UTF-8 locale is not available.
+#[test]
+#[cfg(unix)]
+fn test_sort_lc_collate_independent_of_lc_ctype() {
+    let locale = "fr_FR.UTF-8";
+    if !is_locale_available(locale) {
+        return;
+    }
+    new_ucmd!()
+        .env("LC_ALL", "")
+        .env("LANG", "C")
+        .env("LC_COLLATE", locale)
+        .env("LC_CTYPE", "C")
+        .pipe_in("z\né\n")
+        .succeeds()
+        .stdout_only("é\nz\n");
+
+    // Inverse direction, from
+    // https://github.com/uutils/coreutils/pull/9303#issuecomment-4322263665:
+    // LC_CTYPE=UTF-8 with LC_COLLATE=C must produce byte-wise ordering, not UTF-8
+    // collation. Byte order: B (0x42) < a (0x61) < z (0x7a) < é (0xc3 0xa9).
+    new_ucmd!()
+        .env("LC_ALL", "")
+        .env("LANG", "C")
+        .env("LC_CTYPE", locale)
+        .env("LC_COLLATE", "C")
+        .pipe_in("z\né\na\nB\n")
+        .succeeds()
+        .stdout_only("B\na\nz\né\n");
+}
+
 #[test]
 fn test_default_unsorted_ints2() {
     let input = "9\n1909888\n000\n1\n2";
-Original file line number
+Diff line change
@@ Expand Up / @@ -254,3 +254,5 @@ Hijri @@
     Nowruz
     charmap
     hijri
+    CTYPE