15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file.

## [0.2.0] - 2024-11-14

### 🚀 Features

- Can now combine `Q` with `None` to yield `Q` again.
- Add `contains` operator to `C`, `Q` objects. This calls pandas' `str.contains` method.

### 🧪 Testing

- Fix wrong name in doctests

### Change

- [**breaking**] Add, rename, delete several methods:
@@ -14,6 +23,12 @@ All notable changes to this project will be documented in this file.
- added `get_content_file()` method to fetch and store remote content
- `load()` was renamed to `get_dataframe()`
- the `repo` argument was changed to `repo_name`
- *(utils)* [**breaking**] Rename `enhance` func to `infer_and_combine_levels`.

### Remove

- [**breaking**] Two unused funcs for markdown processing were removed
- *(load)* [**breaking**] Drop `join_datasets`, since it's not needed. All it did was run `pd.concat(...)`.

## [0.1.2] - 2024-10-31

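The two new features in action. A minimal sketch using the `Q`/`C` API from `lydata/accessor.py` below; the DataFrame is made up for illustration:

import pandas as pd
from lydata import C, Q

df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["foo", "bar", "baz"]})

# Combining a query with None now yields an equivalent query: the None
# is replaced by a match-everything NoneQ before the logical AND.
mask = (Q("col1", ">", 1) & None).execute(df)  # -> [False, True, True]

# The new "contains" operator dispatches to pandas' str.contains():
ba_mask = C("col2").contains("ba").execute(df)  # -> [False, True, True]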
6 changes: 2 additions & 4 deletions lydata/__init__.py
@@ -6,10 +6,9 @@
from lydata.accessor import C, Q
from lydata.loader import (
available_datasets,
join_datasets,
load_datasets,
)
from lydata.utils import enhance
from lydata.utils import infer_and_combine_levels
from lydata.validator import validate_datasets

__author__ = "Roman Ludwig"
@@ -22,10 +21,9 @@
"Q",
"C",
"available_datasets",
"join_datasets",
"load_datasets",
"validate_datasets",
"enhance",
"infer_and_combine_levels",
]

logger = logging.getLogger(__name__)
64 changes: 52 additions & 12 deletions lydata/accessor.py
@@ -57,18 +57,29 @@ def _get_all_true(df: pd.DataFrame) -> pd.Series:
class CombineQMixin:
"""Mixin class for combining queries."""

def __and__(self, other: QTypes) -> AndQ:
def __and__(self, other: QTypes | None) -> AndQ:
"""Combine two queries with a logical AND."""
other = other or NoneQ()
return AndQ(self, other)

def __or__(self, other: QTypes) -> OrQ:
def __or__(self, other: QTypes | None) -> OrQ:
"""Combine two queries with a logical OR."""
other = other or NoneQ()
return OrQ(self, other)

def __invert__(self) -> NotQ:
"""Negate the query."""
return NotQ(self)

def __eq__(self, value):
"""Check if two queries are equal."""
return (
isinstance(value, self.__class__)
and self.colname == value.colname
and self.operator == value.operator
and self.value == value.value
)


class Q(CombineQMixin):
"""Combinable query object for filtering a DataFrame.
@@ -92,12 +103,13 @@ class Q(CombineQMixin):
">=": lambda series, value: series >= value,
"!=": lambda series, value: series != value, # same as ~Q("col", "==", value)
"in": lambda series, value: series.isin(value), # value is a list
"contains": lambda series, value: series.str.contains(value), # value is a str
}

def __init__(
self,
column: str,
operator: Literal["==", "<", "<=", ">", ">=", "!=", "in"],
operator: Literal["==", "<", "<=", ">", ">=", "!=", "in", "contains"],
value: Any,
) -> None:
"""Create query object that can compare a ``column`` with a ``value``."""
@@ -111,7 +123,20 @@ def __repr__(self) -> str:
return f"Q({self.colname!r}, {self.operator!r}, {self.value!r})"

def execute(self, df: pd.DataFrame) -> pd.Series:
"""Return a boolean mask where the query is satisfied for ``df``."""
"""Return a boolean mask where the query is satisfied for ``df``.

>>> df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['foo', 'bar', 'baz']})
>>> Q('col1', '<=', 2).execute(df)
0 True
1 True
2 False
Name: col1, dtype: bool
>>> Q('col2', 'contains', 'ba').execute(df)
0 False
1 True
2 True
Name: col2, dtype: bool
"""
try:
colname = self._column_map.from_short[self.colname].long
except KeyError:
@@ -128,19 +153,19 @@ def execute(self, df: pd.DataFrame) -> pd.Series:
class AndQ(CombineQMixin):
"""Query object for combining two queries with a logical AND.

>>> df = pd.DataFrame({'col1': [1, 2, 3]})
>>> q1 = Q('col1', '>', 1)
>>> q2 = Q('col1', '<', 3)
>>> df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['foo', 'bar', 'baz']})
>>> q1 = Q('col1', '!=', 3)
>>> q2 = Q('col2', 'contains', 'ba')
>>> and_q = q1 & q2
>>> print(and_q)
Q('col1', '>', 1) & Q('col1', '<', 3)
Q('col1', '!=', 3) & Q('col2', 'contains', 'ba')
>>> isinstance(and_q, AndQ)
True
>>> and_q.execute(df)
0 False
1 True
2 False
Name: col1, dtype: bool
dtype: bool
"""

def __init__(self, q1: QTypes, q2: QTypes) -> None:
@@ -243,9 +268,16 @@ class C:
whether the column name is valid. This is only done when the query is executed.
"""

def __init__(self, column: str) -> None:
"""Create a column object for comparison."""
self.column = column
def __init__(self, *column: str) -> None:
"""Create a column object for comparison.

For querying multi-level columns, both the syntax ``C('col1', 'col2')`` and
``C(('col1', 'col2'))`` are valid.

>>> (C('col1', 'col2') == 1) == (C(('col1', 'col2')) == 1)
True
"""
self.column = column[0] if len(column) == 1 else column

def __eq__(self, value: Any) -> Q:
"""Create a query object for comparing equality.
Expand Down Expand Up @@ -303,6 +335,14 @@ def isin(self, value: list[Any]) -> Q:
"""
return Q(self.column, "in", value)

def contains(self, value: str) -> Q:
"""Create a query object for checking if the column values contain a string.

>>> C('foo').contains('bar')
Q('foo', 'contains', 'bar')
"""
return Q(self.column, "contains", value)


@dataclass
class QueryPortion:
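To round off the accessor changes, a hedged sketch of the new variadic `C` constructor for multi-level columns; the column tuple here is a made-up example, not necessarily one that exists in the lyDATA tables:

from lydata import C

# Both spellings build the same query against a multi-level column:
q_varargs = C("max_llh", "ipsi", "II") == True  # noqa: E712
q_tuple = C(("max_llh", "ipsi", "II")) == True  # noqa: E712
assert q_varargs == q_tuple  # Q.__eq__ compares colname, operator, and value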
100 changes: 10 additions & 90 deletions lydata/loader.py
@@ -24,19 +24,14 @@
import logging
import os
import warnings
from collections.abc import Generator, Iterable
from collections.abc import Generator
from datetime import datetime
from io import TextIOWrapper
from pathlib import Path

import mistletoe
import numpy as np # noqa: F401
import pandas as pd
from github import Auth, Github, Repository
from github.ContentFile import ContentFile
from mistletoe.block_token import Heading
from mistletoe.markdown_renderer import MarkdownRenderer
from mistletoe.token import Token
from pydantic import BaseModel, Field, PrivateAttr, constr

logger = logging.getLogger(__name__)
@@ -177,62 +172,21 @@ def get_dataframe(
kwargs = {"header": [0, 1, 2]}
kwargs.update(load_kwargs)

try:
if use_github:
logger.info(f"Skipping loading from {self.path_on_disk}.")
raise SkipDiskError
df = pd.read_csv(self.path_on_disk, **kwargs)

except (FileNotFoundError, pd.errors.ParserError, SkipDiskError) as err:
if isinstance(err, FileNotFoundError | pd.errors.ParserError):
logger.info(
f"Could not load from {self.path_on_disk}. Trying GitHub..."
)

download_url = self.get_content_file(
if use_github:
msg = f"Trying to load dataset {self.name} from GitHub."
from_location = self.get_content_file(
token=token, user=user, password=password
).download_url
df = pd.read_csv(download_url, **kwargs)
else:
msg = f"Trying to load dataset {self.name} from disk."
from_location = self.path_on_disk

logger.info(msg)
df = pd.read_csv(from_location, **kwargs)
df.attrs.update(self.model_dump())
return df


def remove_subheadings(tokens: Iterable[Token], min_level: int = 1) -> list[Token]:
"""Remove anything under ``min_level`` headings.

With this, one can truncate markdown content to e.g. to the top-level heading and
the text that follows immediately after. Any subheadings after that will be removed.
"""
for i, token in enumerate(tokens):
if isinstance(token, Heading) and token.level > min_level:
return tokens[:i]

return list(tokens)


def format_description(
readme: TextIOWrapper | str,
short: bool = False,
max_line_length: int = 60,
) -> str:
"""Get a markdown description from a file.

Truncate the description before the first second-level heading if ``short``
is set to ``True``.
"""
with MarkdownRenderer(
max_line_length=max_line_length,
normalize_whitespace=True,
) as renderer:
doc = mistletoe.Document(readme)

if short:
doc.children = remove_subheadings(doc.children, min_level=1)

return renderer.render(doc)


def _available_datasets_on_disk(
year: int | str = "*",
institution: str = "*",
@@ -245,7 +199,7 @@ def _available_datasets_on_disk(
for search_path in search_paths:
for match in search_path.glob(pattern):
if match.is_dir() and (match / "data.csv").exists():
year, institution, subsite = match.name.split("-")
year, institution, subsite = match.name.split("-", maxsplit=2)
yield LyDataset(
year=year,
institution=institution,
@@ -401,40 +355,6 @@ def load_datasets(
yield dset_conf.get_dataframe(use_github=use_github, **kwargs)


def join_datasets(
year: int | str = "*",
institution: str = "*",
subsite: str = "*",
search_paths: list[Path] | None = None,
use_github: bool = False,
repo_name: str = _default_repo_name,
ref: str = "main",
**kwargs,
) -> pd.DataFrame:
"""Join matching datasets from the disk.

This uses the :py:func:`.load_datasets` function to load the datasets and then
concatenates them along the index axis. All arguments are also directly passed to
the :py:func:`.load_datasets` function.

>>> join_datasets(year="2023").shape
(705, 219)
>>> join_datasets(year="2023", use_github=True).shape
(705, 219)
"""
gen = load_datasets(
year=year,
institution=institution,
subsite=subsite,
search_paths=search_paths,
use_github=use_github,
repo_name=repo_name,
ref=ref,
**kwargs,
)
return pd.concat(list(gen), axis="index", ignore_index=True)


def _run_doctests() -> None:
"""Run the doctests."""
import doctest
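For the loader changes, a short sketch assuming the doctest signatures above still hold: the deleted `join_datasets` reduces to a `pd.concat` over the `load_datasets` generator, and `maxsplit=2` keeps hyphenated subsites intact. The folder name with a hyphenated subsite is hypothetical:

import pandas as pd
from lydata import load_datasets

# What the removed join_datasets(year="2023") boiled down to:
joined = pd.concat(load_datasets(year="2023"), axis="index", ignore_index=True)

# The reworked get_dataframe() picks its source up front from use_github:
remote_df = next(load_datasets(year="2023", use_github=True))

# Why maxsplit=2 matters when parsing dataset folder names:
year, institution, subsite = "2023-isb-oral-cavity".split("-", maxsplit=2)
# -> ("2023", "isb", "oral-cavity"); without maxsplit, unpacking into
#    three names would raise "too many values to unpack".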
2 changes: 1 addition & 1 deletion lydata/utils.py
@@ -142,7 +142,7 @@ def infer_all_levels(
return result.join(result.ly.infer_superlevels(**infer_superlevels_kwargs))


def enhance(
def infer_and_combine_levels(
dataset: pd.DataFrame,
infer_sublevels_kwargs: dict[str, Any] | None = None,
infer_superlevels_kwargs: dict[str, Any] | None = None,
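Finally, the renamed utility in use, sketched under the assumption that the default keyword arguments suffice:

from lydata import infer_and_combine_levels, load_datasets

dataset = next(load_datasets(year="2023"))
enhanced = infer_and_combine_levels(dataset)  # formerly lydata.utils.enhance()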