diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 071163c5aee..d9adb38b042 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -7,7 +7,7 @@ // https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html // https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html -// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef +// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef localeconv mod buffer_hint; mod check; @@ -57,6 +57,11 @@ use uucore::version_cmp::version_cmp; use crate::buffer_hint::automatic_buffer_size; use crate::tmp_dir::TmpDirWrapper; +#[cfg(unix)] +use nix::libc; +#[cfg(unix)] +use std::ffi::CStr; + mod options { pub mod modes { pub const SORT: &str = "sort"; @@ -294,9 +299,35 @@ pub struct GlobalSettings { buffer_size_is_explicit: bool, compress_prog: Option, merge_batch_size: usize, + numeric_locale: NumericLocaleSettings, precomputed: Precomputed, } +#[derive(Clone, Copy, Debug)] +struct NumericLocaleSettings { + thousands_sep: Option, + decimal_pt: Option, +} + +impl Default for NumericLocaleSettings { + fn default() -> Self { + Self { + thousands_sep: None, + decimal_pt: Some(DECIMAL_PT), + } + } +} + +impl NumericLocaleSettings { + fn num_info_settings(&self, accept_si_units: bool) -> NumInfoParseSettings { + NumInfoParseSettings { + accept_si_units, + thousands_separator: self.thousands_sep, + decimal_pt: self.decimal_pt, + } + } +} + /// Data needed for sorting. Should be computed once before starting to sort /// by calling `GlobalSettings::init_precomputed`. #[derive(Clone, Debug, Default)] @@ -307,6 +338,8 @@ struct Precomputed { selections_per_line: usize, fast_lexicographic: bool, fast_ascii_insensitive: bool, + tokenize_blank_thousands_sep: bool, + tokenize_allow_unit_after_blank: bool, } impl GlobalSettings { @@ -348,6 +381,20 @@ impl GlobalSettings { .filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric)) .count(); + let uses_numeric = self + .selectors + .iter() + .any(|s| matches!(s.settings.mode, SortMode::Numeric | SortMode::HumanNumeric)); + let uses_human_numeric = self + .selectors + .iter() + .any(|s| matches!(s.settings.mode, SortMode::HumanNumeric)); + self.precomputed.tokenize_blank_thousands_sep = self.separator.is_none() + && uses_numeric + && self.numeric_locale.thousands_sep == Some(b' '); + self.precomputed.tokenize_allow_unit_after_blank = + self.precomputed.tokenize_blank_thousands_sep && uses_human_numeric; + self.precomputed.fast_lexicographic = self.can_use_fast_lexicographic(); self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive(); } @@ -415,6 +462,7 @@ impl Default for GlobalSettings { buffer_size_is_explicit: false, compress_prog: None, merge_batch_size: default_merge_batch_size(), + numeric_locale: NumericLocaleSettings::default(), precomputed: Precomputed::default(), } } @@ -524,7 +572,12 @@ impl<'a> Line<'a> { ) -> Self { token_buffer.clear(); if settings.precomputed.needs_tokens { - tokenize(line, settings.separator, token_buffer); + tokenize( + line, + settings.separator, + token_buffer, + &settings.precomputed, + ); } if settings.mode == SortMode::Numeric { // exclude inf, nan, scientific notation @@ -534,11 +587,12 @@ impl<'a> Line<'a> { .and_then(|s| s.parse::().ok()); line_data.line_num_floats.push(line_num_float); } - for (selector, selection) in settings - .selectors - .iter() - .map(|selector| (selector, selector.get_selection(line, token_buffer))) - { + for (selector, selection) in settings.selectors.iter().map(|selector| { + ( + selector, + selector.get_selection(line, token_buffer, &settings.numeric_locale), + ) + }) { match selection { Selection::AsBigDecimal(parsed_float) => line_data.parsed_floats.push(parsed_float), Selection::WithNumInfo(str, num_info) => { @@ -587,7 +641,12 @@ impl<'a> Line<'a> { writeln!(writer)?; let mut fields = vec![]; - tokenize(self.line, settings.separator, &mut fields); + tokenize( + self.line, + settings.separator, + &mut fields, + &settings.precomputed, + ); for selector in &settings.selectors { let mut selection = selector.get_range(self.line, Some(&fields)); match selector.settings.mode { @@ -595,10 +654,9 @@ impl<'a> Line<'a> { // find out which range is used for numeric comparisons let (_, num_range) = NumInfo::parse( &self.line[selection.clone()], - &NumInfoParseSettings { - accept_si_units: selector.settings.mode == SortMode::HumanNumeric, - ..Default::default() - }, + &settings + .numeric_locale + .num_info_settings(selector.settings.mode == SortMode::HumanNumeric), ); let initial_selection = selection.clone(); @@ -716,24 +774,50 @@ impl<'a> Line<'a> { } /// Tokenize a line into fields. The result is stored into `token_buffer`. -fn tokenize(line: &[u8], separator: Option, token_buffer: &mut Vec) { +fn tokenize( + line: &[u8], + separator: Option, + token_buffer: &mut Vec, + precomputed: &Precomputed, +) { assert!(token_buffer.is_empty()); if let Some(separator) = separator { tokenize_with_separator(line, separator, token_buffer); } else { - tokenize_default(line, token_buffer); + tokenize_default( + line, + token_buffer, + precomputed.tokenize_blank_thousands_sep, + precomputed.tokenize_allow_unit_after_blank, + ); } } /// By default fields are separated by the first whitespace after non-whitespace. /// Whitespace is included in fields at the start. /// The result is stored into `token_buffer`. -fn tokenize_default(line: &[u8], token_buffer: &mut Vec) { +fn tokenize_default( + line: &[u8], + token_buffer: &mut Vec, + blank_thousands_sep: bool, + allow_unit_after_blank: bool, +) { token_buffer.push(0..0); // pretend that there was whitespace in front of the line let mut previous_was_whitespace = true; for (idx, char) in line.iter().enumerate() { - if char.is_ascii_whitespace() { + let is_whitespace = char.is_ascii_whitespace(); + let treat_as_separator = if is_whitespace { + if blank_thousands_sep && *char == b' ' { + !is_blank_thousands_sep(line, idx, allow_unit_after_blank) + } else { + true + } + } else { + false + }; + + if treat_as_separator { if !previous_was_whitespace { token_buffer.last_mut().unwrap().end = idx; token_buffer.push(idx..0); @@ -746,6 +830,31 @@ fn tokenize_default(line: &[u8], token_buffer: &mut Vec) { token_buffer.last_mut().unwrap().end = line.len(); } +fn is_blank_thousands_sep(line: &[u8], idx: usize, allow_unit_after_blank: bool) -> bool { + if line.get(idx) != Some(&b' ') { + return false; + } + + let prev_is_digit = idx + .checked_sub(1) + .and_then(|prev_idx| line.get(prev_idx)) + .is_some_and(u8::is_ascii_digit); + if !prev_is_digit { + return false; + } + + let next = line.get(idx + 1).copied(); + match next { + Some(c) if c.is_ascii_digit() => true, + Some(b'K' | b'k' | b'M' | b'G' | b'T' | b'P' | b'E' | b'Z' | b'Y' | b'R' | b'Q') + if allow_unit_after_blank => + { + true + } + _ => false, + } +} + /// Split between separators. These separators are not included in fields. /// The result is stored into `token_buffer`. fn tokenize_with_separator(line: &[u8], separator: u8, token_buffer: &mut Vec) { @@ -943,7 +1052,12 @@ impl FieldSelector { /// Get the selection that corresponds to this selector for the line. /// If `needs_fields` returned false, tokens may be empty. - fn get_selection<'a>(&self, line: &'a [u8], tokens: &[Field]) -> Selection<'a> { + fn get_selection<'a>( + &self, + line: &'a [u8], + tokens: &[Field], + numeric_locale: &NumericLocaleSettings, + ) -> Selection<'a> { // `get_range` expects `None` when we don't need tokens and would get confused by an empty vector. let tokens = if self.needs_tokens { Some(tokens) @@ -955,10 +1069,7 @@ impl FieldSelector { // Parse NumInfo for this number. let (info, num_range) = NumInfo::parse( range_str, - &NumInfoParseSettings { - accept_si_units: self.settings.mode == SortMode::HumanNumeric, - ..Default::default() - }, + &numeric_locale.num_info_settings(self.settings.mode == SortMode::HumanNumeric), ); // Shorten the range to what we need to pass to numeric_str_cmp later. range_str = &range_str[num_range]; @@ -1067,6 +1178,41 @@ impl FieldSelector { } } +#[cfg(unix)] +fn detect_numeric_locale() -> NumericLocaleSettings { + unsafe { + libc::setlocale(libc::LC_NUMERIC, c"".as_ptr()); + let mut settings = NumericLocaleSettings::default(); + let conv = libc::localeconv(); + if conv.is_null() { + return settings; + } + + let decimal_ptr = (*conv).decimal_point; + if !decimal_ptr.is_null() { + let decimal_point = CStr::from_ptr(decimal_ptr).to_bytes(); + if decimal_point.len() == 1 { + settings.decimal_pt = Some(decimal_point[0]); + } + } + + let thousands_ptr = (*conv).thousands_sep; + if !thousands_ptr.is_null() { + let thousands_sep = CStr::from_ptr(thousands_ptr).to_bytes(); + if thousands_sep.len() == 1 { + settings.thousands_sep = Some(thousands_sep[0]); + } + } + + settings + } +} + +#[cfg(not(unix))] +fn detect_numeric_locale() -> NumericLocaleSettings { + NumericLocaleSettings::default() +} + /// Creates an `Arg` that conflicts with all other sort modes. fn make_sort_mode_arg(mode: &'static str, short: char, help: String) -> Arg { Arg::new(mode) @@ -1274,7 +1420,10 @@ fn default_merge_batch_size() -> usize { #[uucore::main] #[allow(clippy::cognitive_complexity)] pub fn uumain(args: impl uucore::Args) -> UResult<()> { - let mut settings = GlobalSettings::default(); + let mut settings = GlobalSettings { + numeric_locale: detect_numeric_locale(), + ..Default::default() + }; let matches = uucore::clap_localization::handle_clap_result_with_exit_code( uu_app(), @@ -2278,7 +2427,8 @@ mod tests { fn tokenize_helper(line: &[u8], separator: Option) -> Vec { let mut buffer = vec![]; - tokenize(line, separator, &mut buffer); + let precomputed = Precomputed::default(); + tokenize(line, separator, &mut buffer, &precomputed); buffer }