Source code for nomenclature.definition

import logging
from datetime import datetime
from pathlib import Path

import git
import pandas as pd
from pyam import IamDataFrame
from pyam.index import replace_index_labels
from pyam.utils import adjust_log_level, write_sheet

from nomenclature.codelist import (
    CodeList,
    MetaCodeList,
    RegionCodeList,
    ScenarioCodeList,
    VariableCodeList,
)
from nomenclature.config import NomenclatureConfig
from nomenclature.exceptions import (
    NomenclatureValidationError,
    TimeDomainError,
    UnknownCodeError,
    WrongUnitError,
)

logger = logging.getLogger(__name__)

SPECIAL_CODELIST = {
    "variable": VariableCodeList,
    "region": RegionCodeList,
    "scenario": ScenarioCodeList,
    "meta": MetaCodeList,
}



[docs]
class DataStructureDefinition:
    """
    Loads and manages codelists for IAMC data dimensions (e.g., variable, region, scenario).

    This class reads project definitions from a folder, initializes codelists for each dimension,
    and provides methods to validate scenario data, check aggregation consistency, and export
    codelists to Excel.
    """

    def __init__(self, path, dimensions=None):
        """
        Parameters
        ----------
        path : str or path-like
            The folder with the project definitions.
        dimensions : list of str, optional
            List of :meth:`CodeList` names. Each CodeList is initialized
            from a sub-folder of `path` of that name.
        """

        if not isinstance(path, Path):
            path = Path(path)

        self.project_folder = path.parent
        self.project = self.project_folder.name.split("-workflow")[0]

        if (file := self.project_folder / "nomenclature.yaml").exists():
            self.config = NomenclatureConfig.from_file(file=file)
        else:
            self.config = NomenclatureConfig()

        try:
            self.repo = git.Repo(self.project_folder)
        except git.InvalidGitRepositoryError:
            self.repo = None

        if not path.is_dir() and not (
            self.config.repositories
            or self.config.definitions.region.country
            or self.config.definitions.region.nuts
        ):
            raise NotADirectoryError(f"Definitions directory not found: {path}")

        self.dimensions = (
            ([dimensions] if isinstance(dimensions, str) else dimensions)
            or self.config.dimensions
            or [x.stem for x in path.iterdir() if x.is_dir()]
        )
        if not self.dimensions:
            raise ValueError("No dimensions specified.")

        for dim in self.dimensions:
            codelist_cls = SPECIAL_CODELIST.get(dim, CodeList)
            self.__setattr__(
                dim, codelist_cls.from_directory(dim, path / dim, self.config)
            )
            getattr(self, dim).check_illegal_characters(self.config)

        if empty := [d for d in self.dimensions if not getattr(self, d)]:
            raise ValueError(f"Empty codelist: {', '.join(empty)}")


[docs]
    def validate(self, df: IamDataFrame, dimensions: list | str | None = None) -> None:
        """Validate that the coordinates of `df` are defined in the codelists

        Parameters
        ----------
        df : :class:`pyam.IamDataFrame`
            Scenario data to be validated against the codelists of this instance.
        dimensions : list of str, str, optional
            Dimensions to perform validation (defaults to all dimensions of self)

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If `df` fails validation against any codelist.
        """

        exceptions: list[Exception] = []
        dimensions = [dimensions] if isinstance(dimensions, str) else dimensions
        for dimension in dimensions or self.dimensions:
            try:
                getattr(self, dimension).validate_df(
                    df,
                    dimension,
                    self.project,
                )
            except (UnknownCodeError, WrongUnitError) as e:
                exceptions.append(e)
        try:
            self.config.time_domain.validate_datetime(df)
        except* TimeDomainError as e:
            exceptions.append(e)
        if exceptions:
            raise NomenclatureValidationError("Validation failed, details:", exceptions)



[docs]
    def check_aggregate(self, df: IamDataFrame, **kwargs) -> pd.DataFrame:
        """Check for consistency of scenario data along the variable hierarchy

        Parameters
        ----------
        df : :class:`pyam.IamDataFrame`
            Scenario data to be checked for consistency along the variable hierarchy.
        kwargs : Tolerance arguments for comparison of values
            Passed to :any:`numpy.isclose` via :any:`pyam.IamDataFrame.check_aggregate`.

        Returns
        -------
        :class:`pandas.DataFrame`
            Data where a variable and its computed aggregate does not match.

        Raises
        ------
        ValueError
            If the :any:`DataStructureDefinition` does not have a *variable* dimension.
        """
        if "variable" not in self.dimensions:
            raise ValueError("Aggregation check requires 'variable' dimension.")

        lst = []

        with adjust_log_level(level="WARNING"):
            for code in df.variable:
                if code not in self.variable.mapping:
                    continue

                attr = self.variable.mapping[code]
                if attr.check_aggregate:
                    components = attr.components

                    # Check if multiple lists of components are given for a code
                    if isinstance(components, dict):
                        for name, _components in components.items():
                            error = df.check_aggregate(code, _components, **kwargs)
                            if error is not None:
                                error.dropna(inplace=True)
                                # Append components-name to variable column
                                error.index = replace_index_labels(
                                    error.index, "variable", [f"{code} [{name}]"]
                                )
                                lst.append(error)

                    # Else, use components provided as single list or pyam-default (None)
                    else:
                        error = df.check_aggregate(code, components, **kwargs)
                        if error is not None:
                            lst.append(error.dropna())

        if lst:
            # There may be empty dataframes due to `dropna()` above
            return pd.concat(lst)
        return pd.DataFrame()



[docs]
    def to_excel(self, excel_writer, **kwargs):
        """Write the codelists to an xlsx spreadsheet

        Parameters
        ----------
        excel_writer : str or :class:`pathlib.Path`
            File path as string or :class:`pathlib.Path`.
        **kwargs
            Passed to :class:`pandas.ExcelWriter`
        """
        if "engine" not in kwargs:
            kwargs["engine"] = "xlsxwriter"

        with pd.ExcelWriter(excel_writer, **kwargs) as writer:
            # Create dataframe with attributes of the DataStructureDefinition
            project = self.project_folder.absolute().parts[-1]
            arg_dict = {
                "project": project,
                "file_created": time_format(datetime.now()),
                "": "",
            }
            if self.repo is not None:
                arg_dict.update(git_attributes(project, self.repo))

            ret = make_dataframe(arg_dict)

            for key, value in self.config.repositories.items():
                ret = pd.concat(
                    [
                        ret,
                        make_dataframe(git_attributes(key, git.Repo(value.local_path))),
                    ]
                )

            write_sheet(writer, "project", ret)

            # Write codelist for each dimensions to own sheet
            for dim in self.dimensions:
                getattr(self, dim).to_excel(writer, dim, sort="asc")




def time_format(x):
    return x.strftime("%Y-%m-%d %H:%M:%S")


def git_attributes(name, repo):
    if repo.is_dirty():
        raise ValueError(f"Repository '{name}' is dirty")
    return {
        f"{name}.url": repo.remote().url,
        f"{name}.commit_hash": repo.commit(),
        f"{name}.commit_timestamp": time_format(repo.commit().committed_datetime),
    }


def make_dataframe(data):
    return (
        pd.DataFrame.from_dict(
            data,
            orient="index",
            columns=["value"],
        )
        .reset_index()
        .rename(columns={"index": "attribute"})
    )
Source code for nomenclature.definition

nomenclature

Navigation

Related Topics