feat: add czv, czv-wasm, and czv-python (init release)

This commit is contained in:
rzmk 2024-06-19 22:37:33 -04:00
commit 9799ab694b
No known key found for this signature in database
40 changed files with 70383 additions and 0 deletions

72
czv-python/.gitignore vendored Normal file
View file

@ -0,0 +1,72 @@
/target
# Byte-compiled / optimized / DLL files
__pycache__/
.pytest_cache/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
.venv/
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
include/
man/
venv/
*.egg-info/
.installed.cfg
*.egg
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
pip-selfcheck.json
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
# Translations
*.mo
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# Rope
.ropeproject
# Django stuff:
*.log
*.pot
.DS_Store
# Sphinx documentation
docs/_build/
# PyCharm
.idea/
# VSCode
.vscode/
# Pyenv
.python-version

16
czv-python/Cargo.toml Normal file
View file

@ -0,0 +1,16 @@
[package]
name = "czv-python"
version = "0.0.0"
authors = ["Mueez Khan"]
description = "Python library for performing CSV-related functions for data engineering and analysis."
repository = "https://github.com/rzmk/czv"
edition = "2021"
[lib]
crate-type = ["cdylib", "rlib"]
[dependencies]
anyhow = "1.0.86"
csv = "1.3.0"
pyo3 = { version = "0.21.2", features = ["extension-module"] }
thiserror = "1.0.61"

50
czv-python/README.md Normal file
View file

@ -0,0 +1,50 @@
# czv-python
Python library for [czv](https://github.com/rzmk/czv). czv is a library of utility functions for CSV-related data engineering and analysis tasks.
## Installation and example
```bash
pip install czv
```
```python
import czv
data = """fruits,price
apple,2.50
banana,3.00
strawberry,1.50"""
output = czv.row_count(data, False)
print(output)
```
## Development
You'll need to have [maturin](https://github.com/PyO3/maturin/) and [uv](https://github.com/astral-sh/uv) installed. Set up a local virtual environment in the `czv-python` folder by running:
```bash
uv venv
```
Make sure to activate the virtual environment (instructions should be provided in your terminal after running the previous command).
Once you've activated the virtual environment, install dependencies by running:
```bash
uv pip install -r requirements.txt
```
### Build package in local environment
```bash
maturin develop --uv --release
```
### Run tests
```bash
pytest
```

46
czv-python/czv.pyi Normal file
View file

@ -0,0 +1,46 @@
"""
# czv
Python library for [czv](https://github.com/rzmk/czv). CSV content manipulation and analysis.
## Install
```bash
pip install czv
```
## Example
```python
from czv import row_count
data = \"""fruits,price
apple,2.50
banana,3.00
strawberry,1.50\"""
output = row_count(data, False)
print(output)
```
"""
from typing import Optional
def row_count(file_data: str, include_header_row: Optional[bool] = None) -> int:
    """Returns a count of the total number of rows.

    ## Arguments

    * `file_data` - CSV file data.
    * `include_header_row` - Specify whether to include the header row (first row) in the row count.
      May be omitted; omitting it or passing `None` behaves like `False`.
    """
def column_count(file_data: str) -> int:
    """Count the columns (fields) of the given CSV data.

    ## Arguments

    * `file_data` - CSV file data.
    """
    ...

View file

@ -0,0 +1,10 @@
import czv
# Sample CSV content: one header row followed by three data rows.
data = """fruits,price
apple,2.50
banana,3.00
strawberry,1.50"""
# Passing False for include_header_row leaves the header row out of the count.
output = czv.row_count(data, False)
print(output)

15
czv-python/pyproject.toml Normal file
View file

@ -0,0 +1,15 @@
[build-system]
requires = ["maturin>=1.6,<2.0"]
build-backend = "maturin"
[project]
name = "czv"
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dynamic = ["version"]
[tool.maturin]
features = ["pyo3/extension-module"]

View file

@ -0,0 +1,2 @@
maturin
pytest

33
czv-python/src/count.rs Normal file
View file

@ -0,0 +1,33 @@
use crate::Result;
use csv::ReaderBuilder;
use pyo3::pyfunction;
/// Returns a count of the total number of rows.
///
/// ## Arguments
///
/// * `file_data` - CSV file data.
/// * `include_header_row` - Whether the first row is counted too. `None` is treated as `false`,
///   in which case the first row is read as a header and left out of the count.
#[pyfunction]
pub fn row_count(file_data: String, include_header_row: Option<bool>) -> Result<usize> {
    let count_header = include_header_row.unwrap_or(false);
    // When the header should be counted, tell the reader there is no header row
    // so the first line is yielded as an ordinary record.
    let total = ReaderBuilder::new()
        .has_headers(!count_header)
        .from_reader(file_data.as_bytes())
        .records()
        .count();
    Ok(total)
}
/// Returns a count of the total number of columns (fields).
///
/// ## Arguments
///
/// * `file_data` - CSV file data.
#[pyfunction]
pub fn column_count(file_data: Option<String>) -> Result<usize> {
let rdr = ReaderBuilder::new();
if let Some(file_data) = file_data {
return Ok(rdr.from_reader(file_data.as_bytes()).headers()?.len());
} else {
bail!("Could not determine a file path or file data for column_count_builder.");
}
}

43
czv-python/src/lib.rs Normal file
View file

@ -0,0 +1,43 @@
use pyo3::prelude::*;
// Error-handling helpers
/// Crate-wide error type: a newtype around [`anyhow::Error`].
///
/// Its `Display` output is the wrapped error's `Display` output (see the
/// `#[error("{0}")]` attribute below).
#[derive(thiserror::Error, Debug)]
#[error("{0}")]
pub struct CzvError(anyhow::Error);
impl From<pyo3::PyErr> for CzvError {
fn from(value: pyo3::PyErr) -> Self {
value.into()
}
}
impl From<csv::Error> for CzvError {
fn from(value: csv::Error) -> Self {
value.into()
}
}
/// Converts a [`CzvError`] into a Python exception so it can be raised from `#[pyfunction]`s.
///
/// If the wrapped error is itself a Python exception, that exception is returned unchanged.
/// Any other error becomes a `RuntimeError` carrying the error's message.
impl From<CzvError> for pyo3::PyErr {
    fn from(value: CzvError) -> Self {
        // A bare `value.into()` here resolves to this very impl and recurses
        // until the stack overflows, so the conversion is spelled out.
        match value.0.downcast::<pyo3::PyErr>() {
            Ok(py_err) => py_err,
            Err(other) => pyo3::exceptions::PyRuntimeError::new_err(other.to_string()),
        }
    }
}
/// Result alias used throughout the crate, with [`CzvError`] as the error type.
pub type Result<T> = anyhow::Result<T, CzvError>;
/// Returns early from the enclosing function with an `Err(CzvError)` built from
/// the given message or error expression (an optional trailing comma is accepted).
#[allow(unused_macros)]
macro_rules! bail {
    ($err:expr $(,)?) => {
        return ::core::result::Result::Err($crate::CzvError(anyhow::anyhow!($err)))
    };
}
// Command imports
pub mod count;
// Python module entry point: registers the functions exposed by the `czv` module.
#[pymodule]
fn czv(m: &Bound<'_, PyModule>) -> PyResult<()> {
// Each `#[pyfunction]` from the `count` module is wrapped and added to the module.
m.add_function(wrap_pyfunction!(count::row_count, m)?)?;
m.add_function(wrap_pyfunction!(count::column_count, m)?)?;
Ok(())
}

View file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,4 @@
fruit,price
apple,2.50
banana,3.00
strawberry,1.50
1 fruit price
2 apple 2.50
3 banana 3.00
4 strawberry 1.50

View file

@ -0,0 +1,24 @@
import czv
import pytest
from .test_data import test_data
class TestCountFunc:
    """Tests for `czv.row_count` against the CSV fixtures listed in `test_data`."""

    @pytest.mark.parametrize(
        "file_name,expected",
        [("fruits.csv", 3), ("constituents_altnames.csv", 33971)],
    )
    def test_count(self, file_name, expected):
        """Count the total number of non-header rows."""
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        csv_text = test_data[file_name].read_text(encoding="utf-8")
        result = czv.row_count(csv_text)
        assert result == expected

    @pytest.mark.parametrize(
        "file_name,expected",
        [("fruits.csv", 4), ("constituents_altnames.csv", 33972)],
    )
    def test_include_header_row(self, file_name, expected):
        """Count the total number of rows including the header row."""
        csv_text = test_data[file_name].read_text(encoding="utf-8")
        result = czv.row_count(csv_text, include_header_row=True)
        assert result == expected

View file

@ -0,0 +1,7 @@
import pathlib
# Absolute path of the "data" directory that sits next to this module.
data_path = pathlib.Path(__file__).parent.resolve() / "data"

# Maps each fixture file name to its path inside the data directory.
test_data = dict(
    (name, data_path / name)
    for name in ("fruits.csv", "constituents_altnames.csv")
)