cut: add whitespace option for separating fields by TechHara · Pull Request #4232 · uutils/coreutils · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

cut: add whitespace option for separating fields #4232

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Jan 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/src/extensions.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,7 @@ We provide a simple implementation of `more`, which is not part of GNU
coreutils. We do not aim for full compatibility with the `more` utility from
`util-linux`. Features from more modern pagers (like `less` and `bat`) are
therefore welcomed.

## `cut`

`cut` can separate fields by whitespace (Space and Tab) with the `-w` flag. This feature is adopted from [FreeBSD](https://www.freebsd.org/cgi/man.cgi?cut).
215 changes: 171 additions & 44 deletions src/uu/cut/src/cut.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,16 @@ use uucore::display::Quotable;
use uucore::error::{FromIo, UResult, USimpleError};

use self::searcher::Searcher;
use self::whitespace_searcher::WhitespaceSearcher;
use uucore::ranges::Range;
use uucore::{format_usage, show, show_error, show_if_err};

mod searcher;
mod whitespace_searcher;

static NAME: &str = "cut";
static USAGE: &str =
"{} [-d] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
"{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
static ABOUT: &str =
"Prints specified byte or field columns from each line of stdin or the input files";
static LONG_HELP: &str = "
Expand Down Expand Up @@ -85,6 +87,11 @@ static LONG_HELP: &str = "
--delimiter (-d) option. Setting the delimiter is optional.
If not set, a default delimiter of Tab will be used.

If the -w option is provided, fields will be separated by any number
of whitespace characters (Space and Tab). The output delimiter will
be a Tab unless explicitly specified. Only one of -d or -w option can be specified.
This is an extension adopted from FreeBSD.

Optionally Filter based on delimiter
If the --only-delimited (-s) flag is provided, only lines which
contain the delimiter will be printed
Expand All @@ -111,8 +118,13 @@ struct Options {
zero_terminated: bool,
}

/// How input fields are separated when cutting by field.
enum Delimiter {
/// Fields are separated by any number of whitespace characters (Space and Tab);
/// selected with the `-w` flag.
Whitespace,
/// Fields are separated by the given delimiter string; a Tab is used when no
/// delimiter option is supplied.
String(String), // FIXME: use char?
}

struct FieldOptions {
delimiter: String, // one char long, String because of UTF8 representation
delimiter: Delimiter,
out_delimiter: Option<String>,
only_delimited: bool,
zero_terminated: bool,
Expand Down Expand Up @@ -256,32 +268,24 @@ fn cut_fields_delimiter<R: Read>(
Ok(())
}

#[allow(clippy::cognitive_complexity)]
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
if let Some(ref o_delim) = opts.out_delimiter {
return cut_fields_delimiter(
reader,
ranges,
&opts.delimiter,
opts.only_delimited,
newline_char,
o_delim,
);
}

fn cut_fields_whitespace<R: Read>(
reader: R,
ranges: &[Range],
only_delimited: bool,
newline_char: u8,
out_delim: &str,
) -> UResult<()> {
let mut buf_in = BufReader::new(reader);
let mut out = stdout_writer();
let delim_len = opts.delimiter.len();

let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = Searcher::new(line, opts.delimiter.as_bytes()).peekable();
let mut delim_search = WhitespaceSearcher::new(line).peekable();
let mut print_delim = false;

if delim_search.peek().is_none() {
if !opts.only_delimited {
if !only_delimited {
out.write_all(line)?;
if line[line.len() - 1] != newline_char {
out.write_all(&[newline_char])?;
Expand All @@ -290,42 +294,54 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes

return Ok(true);
}

// The logic is identical to `cut_fields_delimiter` function above, which uses
// `Searcher` that iterates over and returns the first position of the delimiter character.
// The main difference is that `WhitespaceSearcher` returns a pair of the first and last
// delimiter character positions, since each delimiter sequence length can vary.
for &Range { low, high } in ranges {
if low - fields_pos > 0 {
if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) {
low_idx = if print_delim {
delim_pos
} else {
delim_pos + delim_len
}
// current field is not in the range, so jump to the field corresponding to the
// beginning of the range if any
low_idx = match delim_search.nth(low - fields_pos - 1) {
Some((_, last)) => last,
None => break,
};
}

// at this point, current field is the first in the range
for _ in 0..=high - low {
// skip printing delimiter if this is the first matching field for this line
if print_delim {
out.write_all(out_delim.as_bytes())?;
} else {
break;
print_delim = true;
}
}

match delim_search.nth(high - low) {
Some(high_idx) => {
let segment = &line[low_idx..high_idx];
match delim_search.next() {
// print the current field up to the next whitespace
Some((first, last)) => {
let segment = &line[low_idx..first];

out.write_all(segment)?;
out.write_all(segment)?;

print_delim = true;
low_idx = high_idx;
fields_pos = high + 1;
}
None => {
let segment = &line[low_idx..line.len()];
low_idx = last;
fields_pos = high + 1;
}
None => {
// this is the last field in the line, so print the rest
let segment = &line[low_idx..];

out.write_all(segment)?;
out.write_all(segment)?;

if line[line.len() - 1] == newline_char {
return Ok(true);
if line[line.len() - 1] == newline_char {
return Ok(true);
}
break;
}
break;
}
}
}

out.write_all(&[newline_char])?;
Ok(true)
});
Expand All @@ -337,6 +353,97 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes
Ok(())
}

/// Cuts the selected fields out of every record of `reader` and writes them to stdout.
///
/// Records end with NUL when `opts.zero_terminated` is set and with a newline otherwise.
/// The work is dispatched on `opts.delimiter`:
///
/// * `Delimiter::Whitespace` is handed to `cut_fields_whitespace`, with Tab as the output
///   delimiter unless `opts.out_delimiter` is set.
/// * `Delimiter::String` with an explicit output delimiter is handed to
///   `cut_fields_delimiter`.
/// * `Delimiter::String` without one is processed here: the selected spans are copied
///   straight from the input record, input delimiters included.
///
/// A failure of the record loop in that last case is returned as a `USimpleError` with
/// exit code 1.
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
    let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };

    // Whitespace splitting has its own implementation; everything below deals with a
    // literal delimiter string.
    let delimiter = match &opts.delimiter {
        Delimiter::Whitespace => {
            return cut_fields_whitespace(
                reader,
                ranges,
                opts.only_delimited,
                newline_char,
                opts.out_delimiter.as_deref().unwrap_or("\t"),
            );
        }
        Delimiter::String(delimiter) => delimiter,
    };

    // An explicit output delimiter means the input delimiters must be replaced, which is
    // what `cut_fields_delimiter` does.
    if let Some(o_delim) = &opts.out_delimiter {
        return cut_fields_delimiter(
            reader,
            ranges,
            delimiter,
            opts.only_delimited,
            newline_char,
            o_delim,
        );
    }

    let delim_len = delimiter.len();
    let mut input = BufReader::new(reader);
    let mut writer = stdout_writer();

    let outcome = input.for_byte_record_with_terminator(newline_char, |line| {
        let mut delims = Searcher::new(line, delimiter.as_bytes()).peekable();
        // 1-based number of the field the searcher is currently positioned at.
        let mut fields_pos = 1;
        // Byte offset in `line` where the next printed span starts.
        let mut low_idx = 0;
        // Set once a span has been printed; later spans then start *at* the preceding
        // delimiter so that it is emitted as the separator.
        let mut print_delim = false;

        // Record without any delimiter: echo it as-is unless -s was requested.
        if delims.peek().is_none() {
            if !opts.only_delimited {
                writer.write_all(line)?;
                if line[line.len() - 1] != newline_char {
                    writer.write_all(&[newline_char])?;
                }
            }

            return Ok(true);
        }

        for &Range { low, high } in ranges {
            // Skip forward to the delimiter that precedes field `low`, if we are not
            // already there.
            if low - fields_pos > 0 {
                match delims.nth(low - fields_pos - 1) {
                    Some(delim_pos) => {
                        low_idx = if print_delim {
                            delim_pos
                        } else {
                            delim_pos + delim_len
                        };
                    }
                    // The record has fewer fields than this range asks for.
                    None => break,
                }
            }

            match delims.nth(high - low) {
                // The range ends inside the record: print up to the closing delimiter.
                Some(high_idx) => {
                    writer.write_all(&line[low_idx..high_idx])?;

                    print_delim = true;
                    low_idx = high_idx;
                    fields_pos = high + 1;
                }
                // The range runs past the last field: print the remainder of the record.
                None => {
                    writer.write_all(&line[low_idx..])?;

                    // The remainder already carried its terminator; nothing to append.
                    if line[line.len() - 1] == newline_char {
                        return Ok(true);
                    }
                    break;
                }
            }
        }

        writer.write_all(&[newline_char])?;
        Ok(true)
    });

    outcome.map_err(|e| USimpleError::new(1, e.to_string()))
}

fn cut_files(mut filenames: Vec<String>, mode: &Mode) {
let mut stdin_read = false;

Expand Down Expand Up @@ -387,6 +494,7 @@ mod options {
pub const ZERO_TERMINATED: &str = "zero-terminated";
pub const ONLY_DELIMITED: &str = "only-delimited";
pub const OUTPUT_DELIMITER: &str = "output-delimiter";
pub const WHITESPACE_DELIMITED: &str = "whitespace-delimited";
pub const COMPLEMENT: &str = "complement";
pub const FILE: &str = "file";
}
Expand Down Expand Up @@ -449,9 +557,13 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
};

let only_delimited = matches.get_flag(options::ONLY_DELIMITED);
let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED);
let zero_terminated = matches.get_flag(options::ZERO_TERMINATED);

match matches.get_one::<String>(options::DELIMITER).map(|s| s.as_str()) {
Some(_) if whitespace_delimited => {
Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into())
}
Some(mut delim) => {
// GNU's `cut` supports `-d=` to set the delimiter to `=`.
// Clap parsing is limited in this situation, see:
Expand All @@ -474,7 +586,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
Ok(Mode::Fields(
ranges,
FieldOptions {
delimiter: delim,
delimiter: Delimiter::String(delim),
out_delimiter: out_delim,
only_delimited,
zero_terminated,
Expand All @@ -485,7 +597,10 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
None => Ok(Mode::Fields(
ranges,
FieldOptions {
delimiter: "\t".to_owned(),
delimiter: match whitespace_delimited {
true => Delimiter::Whitespace,
false => Delimiter::String("\t".to_owned()),
},
out_delimiter: out_delim,
only_delimited,
zero_terminated,
Expand All @@ -508,6 +623,11 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
{
Err("invalid input: The '--delimiter' ('-d') option only usable if printing a sequence of fields".into())
}
Mode::Bytes(_, _) | Mode::Characters(_, _)
if matches.get_flag(options::WHITESPACE_DELIMITED) =>
{
Err("invalid input: The '-w' option only usable if printing a sequence of fields".into())
}
Mode::Bytes(_, _) | Mode::Characters(_, _)
if matches.get_flag(options::ONLY_DELIMITED) =>
{
Expand Down Expand Up @@ -563,6 +683,13 @@ pub fn uu_app() -> Command {
.help("specify the delimiter character that separates fields in the input source. Defaults to Tab.")
.value_name("DELIM"),
)
.arg(
Arg::new(options::WHITESPACE_DELIMITED)
.short('w')
.help("Use any number of whitespace (Space, Tab) to separate fields in the input source (FreeBSD extension).")
.value_name("WHITESPACE")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new(options::FIELDS)
.short('f')
Expand Down
Loading
0