14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,19 @@

All notable changes to this project will be documented in this file.

## [0.2.0] - 2024-11-14

### Changed

- [**breaking**] Added, renamed, and removed several attributes and methods (see the migration sketch below):
- `LyDatasetConfig` is now just `LyDataset`
- the `path` property is now `path_on_disk`
- the `get_url()` method has been removed
- the `get_description()` method has been removed
  - added `get_content_file()` method to fetch and store remote content
- `load()` was renamed to `get_dataframe()`
- the `repo` argument was changed to `repo_name`
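
To make these renames concrete, here is a minimal before/after sketch; the dataset spec is illustrative, only the names come from the list above:

```python
from lydata.loader import LyDataset  # 0.1.x: LyDatasetConfig

dataset = LyDataset(year=2021, institution="usz", subsite="oropharynx")

df = dataset.get_dataframe()          # 0.1.x: dataset.load()
print(dataset.path_on_disk)           # 0.1.x: dataset.path
content = dataset.get_content_file()  # new; `get_url()` was removed
```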

## [0.1.2] - 2024-10-31

### 🐛 Bug Fixes
@@ -153,6 +166,7 @@ Initial implementation of the lyDATA library.
<!-- generated by git-cliff -->
<!-- markdownlint-disable-file MD024 -->

[0.2.0]: https://github.com/rmnldwg/lydata/compare/0.1.2..0.2.0
[0.1.2]: https://github.com/rmnldwg/lydata/compare/0.1.1..0.1.2
[0.1.1]: https://github.com/rmnldwg/lydata/compare/0.1.0..0.1.1
[0.1.0]: https://github.com/rmnldwg/lydata/compare/0.0.4..0.1.0
144 changes: 68 additions & 76 deletions lydata/loader.py
@@ -1,18 +1,18 @@
"""Provides functions to easily load lyDATA CSV tables as :py:class:`pandas.DataFrame`.

The loading itself is implemented in the :py:class:`.LyDatasetConfig` class, which
The loading itself is implemented in the :py:class:`.LyDataset` class, which
is a :py:class:`pydantic.BaseModel` subclass. It validates the unique specification
that identifies a dataset and then allows loading it from the disk (if present) or
from GitHub.

The :py:func:`available_datasets` function can be used to create a generator of such
:py:class:`.LyDatasetConfig` instances, corresponding to all available datasets that
:py:class:`.LyDataset` instances, corresponding to all available datasets that
are either found on disk or on GitHub.

Consequently, the :py:func:`load_datasets` function can be used to load all datasets
matching the given specs/pattern. It takes the same arguments as the function
:py:func:`available_datasets` but returns a generator of :py:class:`pandas.DataFrame`
instead of :py:class:`.LyDatasetConfig`.
instead of :py:class:`.LyDataset`.

Lastly, with the :py:func:`join_datasets` function, one can load and concatenate all
datasets matching the given specs/pattern into a single :py:class:`pandas.DataFrame`.
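
A minimal usage sketch of these three entry points (the glob patterns are illustrative):

```python
import pandas as pd

from lydata.loader import available_datasets, join_datasets, load_datasets

# Iterate over LyDataset specs that match the glob patterns:
for dataset in available_datasets(year="202*"):
    print(dataset.name)

# Load every matching dataset as its own DataFrame...
frames = list(load_datasets(institution="clb"))

# ...or load and concatenate all matches into a single DataFrame:
joined = join_datasets(subsite="oropharynx")
assert isinstance(joined, pd.DataFrame)
```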
@@ -33,10 +33,11 @@
import numpy as np # noqa: F401
import pandas as pd
from github import Auth, Github, Repository
from github.ContentFile import ContentFile
from mistletoe.block_token import Heading
from mistletoe.markdown_renderer import MarkdownRenderer
from mistletoe.token import Token
from pydantic import BaseModel, Field, constr
from pydantic import BaseModel, Field, PrivateAttr, constr

logger = logging.getLogger(__name__)
_default_repo_name = "rmnldwg/lydata"
@@ -47,7 +48,7 @@ class SkipDiskError(Exception):
"""Raised when the user wants to skip loading from disk."""


class LyDatasetConfig(BaseModel):
class LyDataset(BaseModel):
"""Specification of a dataset."""

year: int = Field(
@@ -58,7 +59,9 @@ class LyDatasetConfig(BaseModel):
institution: low_min1_str = Field(
description="Institution's short code. E.g., University Hospital Zurich: `usz`."
)
subsite: low_min1_str = Field(description="Subsite(s) this dataset covers.")
subsite: low_min1_str = Field(
description="Tumor subsite(s) patients in this dataset were diagnosed with.",
)
repo_name: low_min1_str = Field(
default=_default_repo_name,
description="GitHub `repository/owner`.",
@@ -67,44 +70,29 @@
default="main",
description="Branch/tag/commit of the repo.",
)
_content_file: ContentFile | None = PrivateAttr(default=None)

@property
def name(self) -> str:
"""Get the name of the dataset.

>>> conf = LyDatasetConfig(year=2023, institution="clb", subsite="multisite")
>>> conf = LyDataset(year=2023, institution="clb", subsite="multisite")
>>> conf.name
'2023-clb-multisite'
"""
return f"{self.year}-{self.institution}-{self.subsite}"

@property
def path(self) -> Path:
def path_on_disk(self) -> Path:
"""Get the path to the dataset.

>>> conf = LyDatasetConfig(year="2021", institution="usz", subsite="oropharynx")
>>> conf.path.exists()
>>> conf = LyDataset(year="2021", institution="usz", subsite="oropharynx")
>>> conf.path_on_disk.exists()
True
"""
install_loc = Path(__file__).parent.parent
return install_loc / self.name / "data.csv"

def get_url(self, file: str) -> str:
"""Get the URL to the dataset's directory, CSV file, or README file.

>>> LyDatasetConfig(
... year=2021,
... institution="clb",
... subsite="oropharynx",
... ref="6ac98d",
... ).get_url("data.csv")
'https://raw.githubusercontent.com/rmnldwg/lydata/6ac98d/2021-clb-oropharynx/data.csv'
"""
return (
"https://raw.githubusercontent.com/"
f"{self.repo_name}/{self.ref}/{self.name}/"
) + file

def get_repo(
self,
token: str | None = None,
@@ -118,11 +106,10 @@ def get_repo(
environment variables ``GITHUB_TOKEN`` or ``GITHUB_USER`` and
``GITHUB_PASSWORD``.

>>> conf = LyDatasetConfig(
>>> conf = LyDataset(
... year=2021,
... institution="clb",
... subsite="oropharynx",
... repo="rmnldwg/lydata",
... )
>>> conf.get_repo().full_name == conf.repo_name
True
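
As a sketch of the credential fallback described above (the token is assumed to live in your environment):

```python
import os

from lydata.loader import LyDataset

dataset = LyDataset(year=2021, institution="clb", subsite="oropharynx")

# Either pass a token explicitly...
repo = dataset.get_repo(token=os.getenv("GITHUB_TOKEN"))

# ...or let the method fall back to the `GITHUB_TOKEN` (or `GITHUB_USER`
# and `GITHUB_PASSWORD`) environment variables:
repo = dataset.get_repo()
print(repo.full_name)  # 'rmnldwg/lydata'
```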
@@ -133,39 +120,41 @@ def get_repo(
gh = Github(auth=auth)
return gh.get_repo(self.repo_name)

def get_description(
def get_content_file(
self,
token: str | None = None,
user: str | None = None,
password: str | None = None,
) -> str:
"""Get the description of the dataset.
) -> ContentFile:
"""Get the GitHub content file of the data CSV.

First, try to load it from the ``README.md`` file that should sit right next to
the ``data.csv`` file. If that fails, try to look for the ``README.md`` file in
the GitHub repository.
This method always tries to fetch the most recent version of the file.

See :py:func:`.get_repo` for how to authenticate with GitHub, if necessary.

>>> conf = LyDatasetConfig(year=2021, institution="clb", subsite="oropharynx")
>>> print(conf.get_description()) # doctest: +ELLIPSIS
# 2021 CLB Oropharynx
...
>>> conf = LyDataset(
... year=2023,
... institution="usz",
... subsite="hypopharynx-larynx",
... repo_name="rmnldwg/lydata.private",
... ref="2023-usz-hypopharynx-larynx",
... )
>>> conf.get_content_file()
ContentFile(path="2023-usz-hypopharynx-larynx/data.csv")
"""
readme_path = self.path.with_name("README.md")
if readme_path.exists():
with open(readme_path, encoding="utf-8") as readme:
return format_description(readme, short=True)
if self._content_file is not None:
if self._content_file.update():
logger.info(f"Content file of {self.name} was updated.")
return self._content_file

logger.info(f"Readme not found at {readme_path}. Searching on GitHub...")
repo = self.get_repo(token=token, user=user, password=password)
readme = repo.get_contents(f"{self.name}/README.md").decoded_content.decode()
return format_description(readme, short=True)
self._content_file = repo.get_contents(f"{self.name}/data.csv", ref=self.ref)
return self._content_file
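
A sketch of the caching behavior implemented above: the first call fetches the ContentFile from the GitHub API, later calls reuse the cached private attribute and only re-download when `update()` reports a change:

```python
from lydata.loader import LyDataset

dataset = LyDataset(year=2021, institution="clb", subsite="oropharynx")

content = dataset.get_content_file()       # fetches and caches
same_content = dataset.get_content_file()  # reuses the cached ContentFile
print(same_content.download_url)           # URL the CSV can be read from
```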

def load(
def get_dataframe(
self,
use_github: bool = False,
token: str | None = None,
user: str | None = None,
password: str | None = None,
**load_kwargs,
) -> pd.DataFrame:
"""Load the ``data.csv`` file from disk or from GitHub.
@@ -177,11 +166,11 @@ def load(
in the :py:attr:`~pandas.DataFrame.attrs` attribute of the returned
:py:class:`~pandas.DataFrame`.

>>> conf = LyDatasetConfig(year=2021, institution="clb", subsite="oropharynx")
>>> df_from_disk = conf.load()
>>> conf = LyDataset(year=2021, institution="clb", subsite="oropharynx")
>>> df_from_disk = conf.get_dataframe()
>>> df_from_disk.shape
(263, 82)
>>> df_from_github = conf.load(use_github=True)
>>> df_from_github = conf.get_dataframe(use_github=True)
>>> np.all(df_from_disk.fillna(0) == df_from_github.fillna(0))
np.True_
"""
@@ -190,15 +179,20 @@

try:
if use_github:
logger.info(f"Skipping loading from {self.path}.")
logger.info(f"Skipping loading from {self.path_on_disk}.")
raise SkipDiskError
df = pd.read_csv(self.path, **kwargs)
df = pd.read_csv(self.path_on_disk, **kwargs)

except (FileNotFoundError, pd.errors.ParserError, SkipDiskError) as err:
if isinstance(err, FileNotFoundError | pd.errors.ParserError):
logger.info(f"Could not load from {self.path}. Trying GitHub...")
logger.info(
f"Could not load from {self.path_on_disk}. Trying GitHub..."
)

df = pd.read_csv(self.get_url("data.csv"), **kwargs)
download_url = self.get_content_file(
token=token, user=user, password=password
).download_url
df = pd.read_csv(download_url, **kwargs)

df.attrs.update(self.model_dump())
return df
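
A sketch of the disk-first loading shown above; keyword arguments beyond `use_github` and the credentials are assumed to be forwarded to `pandas.read_csv`:

```python
from lydata.loader import LyDataset

dataset = LyDataset(year=2021, institution="clb", subsite="oropharynx")

df = dataset.get_dataframe()                        # disk first, GitHub fallback
df_remote = dataset.get_dataframe(use_github=True)  # skip the disk entirely

# The dataset spec travels with the frame in its `attrs`:
print(df.attrs["year"], df.attrs["institution"], df.attrs["subsite"])
```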
@@ -244,15 +238,15 @@ def _available_datasets_on_disk(
institution: str = "*",
subsite: str = "*",
search_paths: list[Path] | None = None,
) -> Generator[LyDatasetConfig, None, None]:
) -> Generator[LyDataset, None, None]:
pattern = f"{str(year)}-{institution}-{subsite}"
search_paths = search_paths or [Path(__file__).parent.parent]

for search_path in search_paths:
for match in search_path.glob(pattern):
if match.is_dir() and (match / "data.csv").exists():
year, institution, subsite = match.name.split("-")
yield LyDatasetConfig(
yield LyDataset(
year=year,
institution=institution,
subsite=subsite,
@@ -283,12 +277,12 @@ def _available_datasets_on_github(
year: int | str = "*",
institution: str = "*",
subsite: str = "*",
repo: str = _default_repo_name,
repo_name: str = _default_repo_name,
ref: str = "main",
) -> Generator[LyDatasetConfig, None, None]:
) -> Generator[LyDataset, None, None]:
gh = Github(auth=_get_github_auth())

repo = gh.get_repo(repo)
repo = gh.get_repo(repo_name)
contents = repo.get_contents(path="", ref=ref)

matches = []
@@ -300,7 +294,7 @@ def _available_datasets_on_github(

for match in matches:
year, institution, subsite = match.name.split("-", maxsplit=2)
yield LyDatasetConfig(
yield LyDataset(
year=year,
institution=institution,
subsite=subsite,
@@ -315,10 +309,10 @@ def available_datasets(
subsite: str = "*",
search_paths: list[Path] | None = None,
use_github: bool = False,
repo: str = _default_repo_name,
repo_name: str = _default_repo_name,
ref: str = "main",
) -> Generator[LyDatasetConfig, None, None]:
"""Generate :py:class:`.LyDatasetConfig` instances of available datasets.
) -> Generator[LyDataset, None, None]:
"""Generate :py:class:`.LyDataset` instances of available datasets.

The arguments ``year``, ``institution``, and ``subsite`` represent glob patterns
and all datasets matching these patterns can be iterated over using the returned
@@ -340,7 +334,7 @@ def available_datasets(
'2023-clb-multisite',
'2023-isb-multisite']
>>> avail_gen = available_datasets(
... repo="rmnldwg/lydata.private",
... repo_name="rmnldwg/lydata.private",
... ref="2024-umcg-hypopharynx-larynx",
... use_github=True,
... )
@@ -355,11 +349,9 @@
... ref="6ac98d",
... use_github=True,
... )
>>> sorted([ds.get_url("") for ds in avail_gen]) # doctest: +NORMALIZE_WHITESPACE
['https://raw.githubusercontent.com/rmnldwg/lydata/6ac98d/2024-hvh-oropharynx/']
"""
if not use_github:
if repo != _default_repo_name or ref != "main":
if repo_name != _default_repo_name or ref != "main":
warnings.warn(
"Parameters `repo` and `ref` are ignored, unless `use_github` "
"is set to `True`."
@@ -375,7 +367,7 @@
year=year,
institution=institution,
subsite=subsite,
repo=repo,
repo_name=repo_name,
ref=ref,
)

@@ -386,13 +378,13 @@ def load_datasets(
subsite: str = "*",
search_paths: list[Path] | None = None,
use_github: bool = False,
repo: str = _default_repo_name,
repo_name: str = _default_repo_name,
ref: str = "main",
**kwargs,
) -> Generator[pd.DataFrame, None, None]:
"""Load matching datasets from the disk.

It loads every dataset from the :py:class:`.LyDatasetConfig` instances generated by
It loads every dataset from the :py:class:`.LyDataset` instances generated by
the :py:func:`available_datasets` function, which also receives all arguments of
this function.
"""
@@ -402,11 +394,11 @@
subsite=subsite,
search_paths=search_paths,
use_github=use_github,
repo=repo,
repo_name=repo_name,
ref=ref,
)
for dset_conf in dset_confs:
yield dset_conf.load(use_github=use_github, **kwargs)
yield dset_conf.get_dataframe(use_github=use_github, **kwargs)
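
A sketch of consuming the generator (the pattern is illustrative):

```python
from lydata.loader import load_datasets

# Each yielded DataFrame carries its spec in `.attrs`:
for df in load_datasets(year=2023, use_github=True):
    print(df.attrs["institution"], df.shape)
```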


def join_datasets(
Expand All @@ -415,7 +407,7 @@ def join_datasets(
subsite: str = "*",
search_paths: list[Path] | None = None,
use_github: bool = False,
repo: str = _default_repo_name,
repo_name: str = _default_repo_name,
ref: str = "main",
**kwargs,
) -> pd.DataFrame:
@@ -436,7 +428,7 @@
subsite=subsite,
search_paths=search_paths,
use_github=use_github,
repo=repo,
repo_name=repo_name,
ref=ref,
**kwargs,
)