From ce260e94910322c900931564f1472b49a3597a5b Mon Sep 17 00:00:00 2001 From: rzmk <30333942+rzmk@users.noreply.github.com> Date: Thu, 20 Jun 2024 01:03:54 -0400 Subject: [PATCH] refactor: use czv in czv-python - Add `file_path` for czv-python count operations - Refactor tests based on file path instead of data - Use thiserror in czv - Update examples to reflect changes --- Cargo.lock | 2 ++ LICENSE | 0 README.md | 2 +- czv-python/Cargo.toml | 1 + czv-python/README.md | 18 ++++++------ czv-python/czv.pyi | 14 ++++----- czv-python/examples/simple_row_count.py | 4 +-- czv-python/src/count.rs | 38 +++++++++---------------- czv-python/src/lib.rs | 7 +++++ czv-python/tests/test_count.py | 4 +-- czv-wasm/README.md | 5 +++- czv/Cargo.toml | 3 +- czv/README.md | 30 +++++++++++++++---- czv/src/lib.rs | 6 ++-- 14 files changed, 79 insertions(+), 55 deletions(-) create mode 100644 LICENSE diff --git a/Cargo.lock b/Cargo.lock index 44a927b..99653e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -223,6 +223,7 @@ dependencies = [ "anyhow", "criterion", "csv", + "thiserror", "typed-builder", ] @@ -232,6 +233,7 @@ version = "0.0.0" dependencies = [ "anyhow", "csv", + "czv", "pyo3", "thiserror", ] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index 932a884..508674a 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ apple,2.50 banana,3.00 strawberry,1.50""" -output = czv.row_count(data, False) +output = czv.row_count(file_data=data) print(output) ``` diff --git a/czv-python/Cargo.toml b/czv-python/Cargo.toml index 680aa27..1e30709 100644 --- a/czv-python/Cargo.toml +++ b/czv-python/Cargo.toml @@ -12,5 +12,6 @@ crate-type = ["cdylib", "rlib"] [dependencies] anyhow = "1.0.86" csv = "1.3.0" +czv = { path = "../czv" } pyo3 = { version = "0.21.2", features = ["extension-module"] } thiserror = "1.0.61" diff --git a/czv-python/README.md b/czv-python/README.md index b38f2cc..772ff4f 100644 --- a/czv-python/README.md +++ 
b/czv-python/README.md @@ -1,24 +1,26 @@ # czv-python -Python library for [czv](https://github.com/rzmk/czv). czv is a library of utility functions for CSV-related data engineering and analysis tasks. +Python library for [czv](https://github.com/rzmk/czv). czv is a library of CSV-related operations for data engineering and analysis tasks. + +- For a Rust library see [czv](https://github.com/rzmk/czv/tree/main/czv). +- For a WebAssembly (JavaScript, TypeScript) library see [czv-wasm](https://github.com/rzmk/czv/tree/main/czv-wasm). ## Installation and example +To install `czv`, run: + ```bash pip install czv ``` +Let's say we want to print the total number of rows in a 4x3 CSV file `fruits.csv` including the header row: + ```python import czv -data = """fruits,price -apple,2.50 -banana,3.00 -strawberry,1.50""" +output = czv.row_count(file_path="fruits.csv", include_header_row=True) -output = czv.row_count(data, False) - -print(output) +print(output) # 4 ``` ## Development diff --git a/czv-python/czv.pyi b/czv-python/czv.pyi index bdfbcb4..6e55dda 100644 --- a/czv-python/czv.pyi +++ b/czv-python/czv.pyi @@ -14,12 +14,7 @@ pip install czv ```python from czv import row_count -data = \"""fruits,price -apple,2.50 -banana,3.00 -strawberry,1.50\""" - -output = row_count(data, False) +output = row_count(file_path="fruits.csv") print(output) ``` @@ -27,20 +22,23 @@ print(output) """ from typing import Optional +from pathlib import Path -def row_count(file_data: str, include_header_row: Optional[bool]) -> int: +def row_count(file_path: Optional[Path], file_data: Optional[str], include_header_row: Optional[bool]) -> int: """Returns a count of the total number of rows. ## Arguments + * `file_path` - CSV file path. * `file_data` - CSV file data. * `include_header_row` - Specify whether to include the header row (first row) in the row count. Default is false. 
""" -def column_count(file_data: str) -> int: +def column_count(file_path: Optional[Path], file_data: Optional[str]) -> int: """Returns a count of the total number of columns (fields). ## Arguments + * `file_path` - CSV file path. * `file_data` - CSV file data. """ diff --git a/czv-python/examples/simple_row_count.py b/czv-python/examples/simple_row_count.py index 2729da4..623edde 100644 --- a/czv-python/examples/simple_row_count.py +++ b/czv-python/examples/simple_row_count.py @@ -5,6 +5,6 @@ apple,2.50 banana,3.00 strawberry,1.50""" -output = czv.row_count(data, False) +output = czv.row_count(file_data=data, include_header_row=True) -print(output) +print(output) # 4 diff --git a/czv-python/src/count.rs b/czv-python/src/count.rs index ab32ebd..674f165 100644 --- a/czv-python/src/count.rs +++ b/czv-python/src/count.rs @@ -1,33 +1,21 @@ use crate::Result; -use csv::ReaderBuilder; use pyo3::pyfunction; +use std::path::PathBuf; -/// Returns a count of the total number of rows. -/// -/// ## Arguments -/// -/// * `file_data` - CSV file data. -/// * `include_header_row` - Specify whether to include the header row (first row) in the row count. #[pyfunction] -pub fn row_count(file_data: String, include_header_row: Option) -> Result { - let mut rdr = ReaderBuilder::new(); - - rdr.has_headers(!include_header_row.unwrap_or(false)); - return Ok(rdr.from_reader(file_data.as_bytes()).records().count()); +pub fn row_count( + file_path: Option, + file_data: Option, + include_header_row: Option, +) -> Result { + Ok(czv::count::row_count( + file_path, + file_data, + include_header_row.unwrap_or(false), + )?) } -/// Returns a count of the total number of columns (fields). -/// -/// ## Arguments -/// -/// * `file_data` - CSV file data. 
#[pyfunction] -pub fn column_count(file_data: Option) -> Result { - let rdr = ReaderBuilder::new(); - - if let Some(file_data) = file_data { - return Ok(rdr.from_reader(file_data.as_bytes()).headers()?.len()); - } else { - bail!("Could not determine a file path or file data for column_count_builder."); - } +pub fn column_count(file_path: Option, file_data: Option) -> Result { + Ok(czv::count::column_count(file_path, file_data)?) } diff --git a/czv-python/src/lib.rs b/czv-python/src/lib.rs index e593c2a..4c401de 100644 --- a/czv-python/src/lib.rs +++ b/czv-python/src/lib.rs @@ -1,3 +1,4 @@ +use ::czv::CzvError as OGError; use pyo3::prelude::*; // Error-handling helpers @@ -5,6 +6,12 @@ use pyo3::prelude::*; #[error("{0}")] pub struct CzvError(anyhow::Error); +impl From for CzvError { + fn from(value: OGError) -> Self { + CzvError(value.0) + } +} + impl From for CzvError { fn from(value: pyo3::PyErr) -> Self { value.into() diff --git a/czv-python/tests/test_count.py b/czv-python/tests/test_count.py index 7f8592e..c796835 100644 --- a/czv-python/tests/test_count.py +++ b/czv-python/tests/test_count.py @@ -10,7 +10,7 @@ class TestCountFunc: def test_count(self, file_name, expected): """Count the total number of non-header rows.""" - result = czv.row_count(test_data[file_name].read_text()) + result = czv.row_count(file_path=test_data[file_name]) assert result == expected @pytest.mark.parametrize( @@ -20,5 +20,5 @@ class TestCountFunc: def test_include_header_row(self, file_name, expected): """Count the total number of rows including the header row.""" - result = czv.row_count(test_data[file_name].read_text(), include_header_row=True) + result = czv.row_count(file_path=test_data[file_name], include_header_row=True) assert result == expected diff --git a/czv-wasm/README.md b/czv-wasm/README.md index 8bce1a8..d98c45a 100644 --- a/czv-wasm/README.md +++ b/czv-wasm/README.md @@ -1,6 +1,9 @@ # czv-wasm -WebAssembly (JavaScript and TypeScript) library for 
[czv](https://github.com/rzmk/czv). czv is a library of utility functions for CSV-related data engineering and analysis tasks. +WebAssembly (JavaScript and TypeScript) library for [czv](https://github.com/rzmk/czv). czv is a library of CSV-related operations for data engineering and analysis tasks. + +- For a Rust library see [czv](https://github.com/rzmk/czv/tree/main/czv). +- For a Python library see [czv-python](https://github.com/rzmk/czv/tree/main/czv-python). ## Installation and example diff --git a/czv/Cargo.toml b/czv/Cargo.toml index 330c776..d265888 100644 --- a/czv/Cargo.toml +++ b/czv/Cargo.toml @@ -6,12 +6,13 @@ description = "Rust library for performing CSV-related operations for data engin repository = "https://github.com/rzmk/czv" edition = "2021" license = "MIT OR Apache-2.0" -keywords = ["csv", "library"] +keywords = ["csv", "library", "data"] categories = ["text-processing"] [dependencies] anyhow = "1.0.86" csv = "1.3.0" +thiserror = "1.0.61" typed-builder = "0.18.2" [dev-dependencies] diff --git a/czv/README.md b/czv/README.md index 33f72e2..ade2591 100644 --- a/czv/README.md +++ b/czv/README.md @@ -1,11 +1,12 @@ # czv -Rust library for [czv](https://github.com/rzmk/czv). czv is a library of utility functions for CSV-related data engineering and analysis tasks. +Rust library for [czv](https://github.com/rzmk/czv). czv is a library of CSV-related operations for data engineering and analysis tasks. + +- For a WebAssembly (JavaScript, TypeScript) library see [czv-wasm](https://github.com/rzmk/czv/tree/main/czv-wasm). +- For a Python library see [czv-python](https://github.com/rzmk/czv/tree/main/czv-python). ## Usage -You must have [Rust](https://www.rust-lang.org/tools/install) and Cargo installed (Cargo may be additionally installed when you install Rust with `rustup`). 
- To install `czv`, run: ```bash @@ -18,10 +19,12 @@ Let's say we want to print the total number of rows in a 4x3 CSV file `fruits.cs use czv::{count::RowCount, Result}; fn main() -> Result<()> { - let data = r#"fruits,price + let data = "\ +fruits,price apple,2.50 banana,3.00 -strawberry,1.50"#; +strawberry,1.50 +"; let output = RowCount::new() .file_data(data) .include_header_row(true) @@ -54,3 +57,20 @@ cargo bench ``` For benchmarks we use [criterion.rs](https://github.com/bheisler/criterion.rs). + +## License + +Licensed under either of + +- Apache License, Version 2.0 + ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +- MIT license + ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) + +at your option. + +## Contribution + +Unless you explicitly state otherwise, any contribution intentionally submitted +for inclusion in the work by you, as defined in the Apache-2.0 license, shall be +dual licensed as above, without any additional terms or conditions. diff --git a/czv/src/lib.rs b/czv/src/lib.rs index 3c9e094..60d00bf 100644 --- a/czv/src/lib.rs +++ b/czv/src/lib.rs @@ -32,8 +32,10 @@ pub mod count; pub mod slice; #[allow(dead_code)] -#[derive(Debug)] -pub struct CzvError(anyhow::Error); +// Error-handling helpers +#[derive(thiserror::Error, Debug)] +#[error("{0}")] +pub struct CzvError(pub anyhow::Error); impl From for CzvError { fn from(value: anyhow::Error) -> Self {