15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file.

## [0.2.0] - 2024-11-14

### 🚀 Features

- Can now combine `Q` with `None` to yield `Q` again.
- Add `contains` operator to `C`, `Q` objects. This calls pandas' `str.contains` method.

### 🧪 Testing

- Fix wrong name in doctests

### Change

- [**breaking**] Add, rename, delete several methods:
@@ -14,6 +23,12 @@ All notable changes to this project will be documented in this file.
- added `get_content_file()` method to fetch and store remote content
- `load()` was renamed to `get_dataframe()`
- the `repo` argument was changed to `repo_name`
- *(utils)* [**breaking**] Rename `enhance` func to `infer_and_combine_levels`.

### Remove

- [**breaking**] Two unused funcs for markdown processing were removed
- *(load)* [**breaking**] Drop `join_datasets`, since it's not needed. All it did was run `pd.concat(...)`.

## [0.1.2] - 2024-10-31

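The two new features in action. A minimal sketch using the `Q`/`C` API from `lydata/accessor.py` below; the DataFrame is made up for illustration:

import pandas as pd
from lydata import C, Q

df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["foo", "bar", "baz"]})

# Combining a query with None now yields an equivalent query: the None
# is replaced by a match-everything NoneQ before the logical AND.
mask = (Q("col1", ">", 1) & None).execute(df)  # -> [False, True, True]

# The new "contains" operator dispatches to pandas' str.contains():
ba_mask = C("col2").contains("ba").execute(df)  # -> [False, True, True]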
6 changes: 2 additions & 4 deletions lydata/__init__.py
@@ -6,10 +6,9 @@
from lydata.accessor import C, Q
from lydata.loader import (
available_datasets,
join_datasets,
load_datasets,
)
from lydata.utils import enhance
from lydata.utils import infer_and_combine_levels
from lydata.validator import validate_datasets

__author__ = "Roman Ludwig"
@@ -22,10 +21,9 @@
"Q",
"C",
"available_datasets",
"join_datasets",
"load_datasets",
"validate_datasets",
"enhance",
"infer_and_combine_levels",
]

logger = logging.getLogger(__name__)
64 changes: 52 additions & 12 deletions lydata/accessor.py
@@ -57,18 +57,29 @@ def _get_all_true(df: pd.DataFrame) -> pd.Series:
class CombineQMixin:
"""Mixin class for combining queries."""

def __and__(self, other: QTypes) -> AndQ:
def __and__(self, other: QTypes | None) -> AndQ:
"""Combine two queries with a logical AND."""
other = other or NoneQ()
return AndQ(self, other)

def __or__(self, other: QTypes) -> OrQ:
def __or__(self, other: QTypes | None) -> OrQ:
"""Combine two queries with a logical OR."""
other = other or NoneQ()
return OrQ(self, other)

def __invert__(self) -> NotQ:
"""Negate the query."""
return NotQ(self)

def __eq__(self, value):
"""Check if two queries are equal."""
return (
isinstance(value, self.__class__)
and self.colname == value.colname
and self.operator == value.operator
and self.value == value.value
)


class Q(CombineQMixin):
"""Combinable query object for filtering a DataFrame.
@@ -92,12 +103,13 @@ class Q(CombineQMixin):
">=": lambda series, value: series >= value,
"!=": lambda series, value: series != value, # same as ~Q("col", "==", value)
"in": lambda series, value: series.isin(value), # value is a list
"contains": lambda series, value: series.str.contains(value), # value is a str
}

def __init__(
self,
column: str,
operator: Literal["==", "<", "<=", ">", ">=", "!=", "in"],
operator: Literal["==", "<", "<=", ">", ">=", "!=", "in", "contains"],
value: Any,
) -> None:
"""Create query object that can compare a ``column`` with a ``value``."""
@@ -111,7 +123,20 @@ def __repr__(self) -> str:
return f"Q({self.colname!r}, {self.operator!r}, {self.value!r})"

def execute(self, df: pd.DataFrame) -> pd.Series:
"""Return a boolean mask where the query is satisfied for ``df``."""
"""Return a boolean mask where the query is satisfied for ``df``.

>>> df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['foo', 'bar', 'baz']})
>>> Q('col1', '<=', 2).execute(df)
0 True
1 True
2 False
Name: col1, dtype: bool
>>> Q('col2', 'contains', 'ba').execute(df)
0 False
1 True
2 True
Name: col2, dtype: bool
"""
try:
colname = self._column_map.from_short[self.colname].long
except KeyError:
@@ -128,19 +153,19 @@ def execute(self, df: pd.DataFrame) -> pd.Series:
class AndQ(CombineQMixin):
"""Query object for combining two queries with a logical AND.

>>> df = pd.DataFrame({'col1': [1, 2, 3]})
>>> q1 = Q('col1', '>', 1)
>>> q2 = Q('col1', '<', 3)
>>> df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['foo', 'bar', 'baz']})
>>> q1 = Q('col1', '!=', 3)
>>> q2 = Q('col2', 'contains', 'ba')
>>> and_q = q1 & q2
>>> print(and_q)
Q('col1', '>', 1) & Q('col1', '<', 3)
Q('col1', '!=', 3) & Q('col2', 'contains', 'ba')
>>> isinstance(and_q, AndQ)
True
>>> and_q.execute(df)
0 False
1 True
2 False
Name: col1, dtype: bool
dtype: bool
"""

def __init__(self, q1: QTypes, q2: QTypes) -> None:
@@ -243,9 +268,16 @@ class C:
whether the column name is valid. This is only done when the query is executed.
"""

def __init__(self, column: str) -> None:
"""Create a column object for comparison."""
self.column = column
def __init__(self, *column: str) -> None:
"""Create a column object for comparison.

For querying multi-level columns, both the syntax ``C('col1', 'col2')`` and
``C(('col1', 'col2'))`` are valid.

>>> (C('col1', 'col2') == 1) == (C(('col1', 'col2')) == 1)
True
"""
self.column = column[0] if len(column) == 1 else column

def __eq__(self, value: Any) -> Q:
"""Create a query object for comparing equality.
Expand Down Expand Up @@ -303,6 +335,14 @@ def isin(self, value: list[Any]) -> Q:
"""
return Q(self.column, "in", value)

def contains(self, value: str) -> Q:
"""Create a query object for checking if the column values contain a string.

>>> C('foo').contains('bar')
Q('foo', 'contains', 'bar')
"""
return Q(self.column, "contains", value)


@dataclass
class QueryPortion:
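To round off the accessor changes, a hedged sketch of the new variadic `C` constructor for multi-level columns; the column tuple here is a made-up example, not necessarily one that exists in the lyDATA tables:

from lydata import C

# Both spellings build the same query against a multi-level column:
q_varargs = C("max_llh", "ipsi", "II") == True  # noqa: E712
q_tuple = C(("max_llh", "ipsi", "II")) == True  # noqa: E712
assert q_varargs == q_tuple  # Q.__eq__ compares colname, operator, and value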
100 changes: 10 additions & 90 deletions lydata/loader.py
@@ -24,19 +24,14 @@
import logging
import os
import warnings
from collections.abc import Generator, Iterable
from collections.abc import Generator
from datetime import datetime
from io import TextIOWrapper
from pathlib import Path

import mistletoe
import numpy as np # noqa: F401
import pandas as pd
from github import Auth, Github, Repository
from github.ContentFile import ContentFile
from mistletoe.block_token import Heading
from mistletoe.markdown_renderer import MarkdownRenderer
from mistletoe.token import Token
from pydantic import BaseModel, Field, PrivateAttr, constr

logger = logging.getLogger(__name__)
@@ -177,62 +172,21 @@ def get_dataframe(
kwargs = {"header": [0, 1, 2]}
kwargs.update(load_kwargs)

try:
if use_github:
logger.info(f"Skipping loading from {self.path_on_disk}.")
raise SkipDiskError
df = pd.read_csv(self.path_on_disk, **kwargs)

except (FileNotFoundError, pd.errors.ParserError, SkipDiskError) as err:
if isinstance(err, FileNotFoundError | pd.errors.ParserError):
logger.info(
f"Could not load from {self.path_on_disk}. Trying GitHub..."
)

download_url = self.get_content_file(
if use_github:
msg = f"Trying to load dataset {self.name} from GitHub."
from_location = self.get_content_file(
token=token, user=user, password=password
).download_url
df = pd.read_csv(download_url, **kwargs)
else:
msg = f"Trying to load dataset {self.name} from disk."
from_location = self.path_on_disk

logger.info(msg)
df = pd.read_csv(from_location, **kwargs)
df.attrs.update(self.model_dump())
return df


def remove_subheadings(tokens: Iterable[Token], min_level: int = 1) -> list[Token]:
"""Remove anything under ``min_level`` headings.

With this, one can truncate markdown content to e.g. to the top-level heading and
the text that follows immediately after. Any subheadings after that will be removed.
"""
for i, token in enumerate(tokens):
if isinstance(token, Heading) and token.level > min_level:
return tokens[:i]

return list(tokens)


def format_description(
readme: TextIOWrapper | str,
short: bool = False,
max_line_length: int = 60,
) -> str:
"""Get a markdown description from a file.

Truncate the description before the first second-level heading if ``short``
is set to ``True``.
"""
with MarkdownRenderer(
max_line_length=max_line_length,
normalize_whitespace=True,
) as renderer:
doc = mistletoe.Document(readme)

if short:
doc.children = remove_subheadings(doc.children, min_level=1)

return renderer.render(doc)


def _available_datasets_on_disk(
year: int | str = "*",
institution: str = "*",
@@ -245,7 +199,7 @@ def _available_datasets_on_disk(
for search_path in search_paths:
for match in search_path.glob(pattern):
if match.is_dir() and (match / "data.csv").exists():
year, institution, subsite = match.name.split("-")
year, institution, subsite = match.name.split("-", maxsplit=2)
yield LyDataset(
year=year,
institution=institution,
@@ -401,40 +355,6 @@ def load_datasets(
yield dset_conf.get_dataframe(use_github=use_github, **kwargs)


def join_datasets(
year: int | str = "*",
institution: str = "*",
subsite: str = "*",
search_paths: list[Path] | None = None,
use_github: bool = False,
repo_name: str = _default_repo_name,
ref: str = "main",
**kwargs,
) -> pd.DataFrame:
"""Join matching datasets from the disk.

This uses the :py:func:`.load_datasets` function to load the datasets and then
concatenates them along the index axis. All arguments are also directly passed to
the :py:func:`.load_datasets` function.

>>> join_datasets(year="2023").shape
(705, 219)
>>> join_datasets(year="2023", use_github=True).shape
(705, 219)
"""
gen = load_datasets(
year=year,
institution=institution,
subsite=subsite,
search_paths=search_paths,
use_github=use_github,
repo_name=repo_name,
ref=ref,
**kwargs,
)
return pd.concat(list(gen), axis="index", ignore_index=True)


def _run_doctests() -> None:
"""Run the doctests."""
import doctest
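For the loader changes, a short sketch assuming the doctest signatures above still hold: the deleted `join_datasets` reduces to a `pd.concat` over the `load_datasets` generator, and `maxsplit=2` keeps hyphenated subsites intact. The folder name with a hyphenated subsite is hypothetical:

import pandas as pd
from lydata import load_datasets

# What the removed join_datasets(year="2023") boiled down to:
joined = pd.concat(load_datasets(year="2023"), axis="index", ignore_index=True)

# The reworked get_dataframe() picks its source up front from use_github:
remote_df = next(load_datasets(year="2023", use_github=True))

# Why maxsplit=2 matters when parsing dataset folder names:
year, institution, subsite = "2023-isb-oral-cavity".split("-", maxsplit=2)
# -> ("2023", "isb", "oral-cavity"); without maxsplit, unpacking into
#    three names would raise "too many values to unpack".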
2 changes: 1 addition & 1 deletion lydata/utils.py
@@ -142,7 +142,7 @@ def infer_all_levels(
return result.join(result.ly.infer_superlevels(**infer_superlevels_kwargs))


def enhance(
def infer_and_combine_levels(
dataset: pd.DataFrame,
infer_sublevels_kwargs: dict[str, Any] | None = None,
infer_superlevels_kwargs: dict[str, Any] | None = None,
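Finally, the renamed utility in use, sketched under the assumption that the default keyword arguments suffice:

from lydata import infer_and_combine_levels, load_datasets

dataset = next(load_datasets(year="2023"))
enhanced = infer_and_combine_levels(dataset)  # formerly lydata.utils.enhance()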