Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .vscode/cspell.dictionaries/jargon.wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -254,3 +254,5 @@ Hijri
Nowruz
charmap
hijri

CTYPE
8 changes: 5 additions & 3 deletions src/uucore/src/lib/features/i18n/collator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,12 @@ pub fn should_use_locale_collation() -> bool {
/// }
/// ```
pub fn init_locale_collation() -> bool {
use crate::i18n::{UEncoding, get_locale_encoding};
use crate::i18n::UEncoding;

// Check if we need locale-aware collation
if get_locale_encoding() != UEncoding::Utf8 {
// Check if we need locale-aware collation. Collation is governed by
// LC_COLLATE, not LC_CTYPE, so read the encoding off the collating locale
// directly instead of going through get_locale_encoding().
if get_collating_locale().1 != UEncoding::Utf8 {
// C/POSIX locale - no collator needed
return false;
}
Expand Down
17 changes: 15 additions & 2 deletions src/uucore/src/lib/features/i18n/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,14 @@ const DEFAULT_LOCALE: Locale = locale!("und");
/// 2. `locale_name`
/// 3. LANG
///
/// Per POSIX, an empty value means "unset" for locale category resolution,
/// so we skip empty values and fall through to the next variable.
///
/// Otherwise, fall back to the POSIX locale with ASCII encoding.
pub fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) {
let locale_var = ["LC_ALL", locale_name, "LANG"]
.iter()
.find_map(|&key| std::env::var(key).ok());
.find_map(|&key| std::env::var(key).ok().filter(|v| !v.is_empty()));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it would be nice if we could use std::env::var_os


if let Some(locale_var_str) = locale_var {
let mut split = locale_var_str.split(&['.', '@']);
Expand Down Expand Up @@ -81,6 +84,13 @@ pub fn get_collating_locale() -> &'static (Locale, UEncoding) {
COLLATING_LOCALE.get_or_init(|| get_locale_from_env("LC_COLLATE"))
}

/// Resolve the locale that governs character classification (`LC_CTYPE`).
///
/// The lookup goes through `get_locale_from_env("LC_CTYPE")` and happens only
/// on the first call; the resulting `(Locale, UEncoding)` pair is stored in a
/// function-local `OnceLock` and the same reference is returned afterwards.
pub fn get_ctype_locale() -> &'static (Locale, UEncoding) {
    static CACHED: OnceLock<(Locale, UEncoding)> = OnceLock::new();

    let resolve = || get_locale_from_env("LC_CTYPE");
    CACHED.get_or_init(resolve)
}

/// Get the numeric locale from the environment
pub fn get_numeric_locale() -> &'static (Locale, UEncoding) {
static NUMERIC_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new();
Expand All @@ -89,6 +99,9 @@ pub fn get_numeric_locale() -> &'static (Locale, UEncoding) {
}

/// Return the encoding deduced from the locale environment variable.
///
/// Character classification (used to decide whether bytes are printable) is
/// governed by LC_CTYPE, not LC_COLLATE.
pub fn get_locale_encoding() -> UEncoding {
get_collating_locale().1
get_ctype_locale().1
}
46 changes: 45 additions & 1 deletion tests/by-util/test_ls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
// spell-checker:ignore (words) READMECAREFULLY birthtime doesntexist oneline somebackup lrwx somefile somegroup somehiddenbackup somehiddenfile tabsize aaaaaaaa bbbb cccc dddddddd ncccc neee naaaaa nbcdef nfffff dired subdired tmpfs mdir COLORTERM mexe bcdef mfoo timefile
// spell-checker:ignore (words) fakeroot setcap drwxr bcdlps mdangling mentry awith acolons NOFILE NOTCAPABLE
// spell-checker:ignore (words) fakeroot setcap drwxr bcdlps mdangling mentry awith acolons NOFILE NOTCAPABLE tést
#![allow(
clippy::similar_names,
clippy::too_many_lines,
Expand Down Expand Up @@ -3921,6 +3921,50 @@ fn test_ls_quoting_style_arg_overrides_env_var() {
}
}

// Regression test for https://github.com/uutils/coreutils/issues/12011.
// Sort order comes from LC_COLLATE, whereas the encoding used to classify
// filename bytes for display comes from LC_CTYPE. So when LC_CTYPE/LANG is
// UTF-8 and LC_COLLATE=C, a UTF-8 filename has to be printed unchanged.
#[test]
#[cfg(unix)]
fn test_ls_lc_collate_does_not_affect_display() {
    let scene = TestScenario::new(util_name!());
    let at = &scene.fixtures;

    at.touch("tést");

    // Every invocation sets LC_ALL to the empty string to override the test
    // harness default of LC_ALL=C, which leaves the LC_CTYPE / LANG fallback
    // in charge of character classification. Both quoting flags must yield
    // the raw name.
    for style_flag in ["--quoting-style=shell-escape", "-b"] {
        scene
            .ucmd()
            .env("LC_ALL", "")
            .env("LANG", "en_US.UTF-8")
            .env("LC_COLLATE", "C")
            .arg(style_flag)
            .succeeds()
            .stdout_only("tést\n");
    }

    // Reproducer from https://github.com/uutils/coreutils/pull/9303#issuecomment-4322263665:
    // an explicit UTF-8 LC_CTYPE combined with LC_COLLATE=C must still print
    // UTF-8 names as-is. The file is created only now so the listings above
    // see a single entry.
    let kana_name = "あいうえお";
    at.touch(kana_name);
    scene
        .ucmd()
        .env("LC_ALL", "")
        .env("LC_CTYPE", "en_US.UTF-8")
        .env("LC_COLLATE", "C")
        .arg(kana_name)
        .succeeds()
        .stdout_only("あいうえお\n");
}

#[test]
fn test_ls_quoting_and_color() {
let scene = TestScenario::new(util_name!());
Expand Down
34 changes: 34 additions & 0 deletions tests/by-util/test_sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -790,6 +790,40 @@ fn test_month_sort_japanese_locale() {
.stdout_is(expected);
}

// Regression test: sort order is driven by LC_COLLATE and must not depend on
// LC_CTYPE. Both directions are checked:
//
// * UTF-8 LC_COLLATE with LC_CTYPE=C: the ICU collator should still be
//   initialized, so accented characters sort near their base letters rather
//   than by raw byte value.
// * UTF-8 LC_CTYPE with LC_COLLATE=C (from
//   https://github.com/uutils/coreutils/pull/9303#issuecomment-4322263665):
//   ordering must be byte-wise, not UTF-8 collation.
//   Byte order: B (0x42) < a (0x61) < z (0x7a) < é (0xc3a9).
#[test]
#[cfg(unix)]
fn test_sort_lc_collate_independent_of_lc_ctype() {
    let utf8_locale = "fr_FR.UTF-8";

    // Nothing to check on systems where the locale is not installed.
    if is_locale_available(utf8_locale) {
        // (LC_COLLATE, LC_CTYPE, stdin, expected stdout)
        let scenarios = [
            (utf8_locale, "C", "z\né\n", "é\nz\n"),
            ("C", utf8_locale, "z\né\na\nB\n", "B\na\nz\né\n"),
        ];

        for (collate, ctype, input, expected) in scenarios {
            new_ucmd!()
                .env("LC_ALL", "")
                .env("LANG", "C")
                .env("LC_COLLATE", collate)
                .env("LC_CTYPE", ctype)
                .pipe_in(input)
                .succeeds()
                .stdout_only(expected);
        }
    }
}

#[test]
fn test_default_unsorted_ints2() {
let input = "9\n1909888\n000\n1\n2";
Expand Down
Loading