import logging
from datetime import datetime
from pathlib import Path
import pandas as pd
import git
from pyam import IamDataFrame
from pyam.index import replace_index_labels
from pyam.logging import adjust_log_level
from pyam.utils import write_sheet
from nomenclature.codelist import (
CodeList,
RegionCodeList,
VariableCodeList,
MetaCodeList,
)
from nomenclature.config import NomenclatureConfig
from nomenclature.validation import validate
# module-level logger named after this module
logger = logging.getLogger(__name__)

# Dimensions that are handled by a dedicated CodeList subclass; any dimension
# not listed here falls back to the generic `CodeList` when it is looked up.
SPECIAL_CODELIST = {
    "variable": VariableCodeList,
    "region": RegionCodeList,
    "meta": MetaCodeList,
}
class DataStructureDefinition:
    """Definition of datastructure codelists for dimensions used in the IAMC format"""

    def __init__(self, path, dimensions=None):
        """
        Parameters
        ----------
        path : str or path-like
            The folder with the project definitions.
        dimensions : list of str, optional
            List of :meth:`CodeList` names. Each CodeList is initialized
            from a sub-folder of `path` of that name.
            Defaults to ``["region", "variable"]``.

        Raises
        ------
        NotADirectoryError
            If `path` is not a directory and the project configuration has
            neither `repositories` nor `definitions.region.country` set.
        ValueError
            If any of the initialized codelists is empty.
        """
        if not isinstance(path, Path):
            path = Path(path)

        # the project configuration and the git repository are looked up
        # in the parent folder of the definitions directory
        self.project_folder = path.parent

        if (file := self.project_folder / "nomenclature.yaml").exists():
            self.config = NomenclatureConfig.from_file(file=file)
        else:
            self.config = NomenclatureConfig()

        try:
            self.repo = git.Repo(self.project_folder)
        except (git.InvalidGitRepositoryError, git.NoSuchPathError):
            # A missing project folder must not mask the more informative
            # NotADirectoryError raised below, so it is treated like a folder
            # that is not under version control.
            self.repo = None

        if not path.is_dir() and not (
            self.config.repositories or self.config.definitions.region.country
        ):
            raise NotADirectoryError(f"Definitions directory not found: {path}")

        self.dimensions = dimensions or ["region", "variable"]
        for dim in self.dimensions:
            # use the dedicated CodeList subclass of a dimension if there is one
            codelist_cls = SPECIAL_CODELIST.get(dim, CodeList)
            codelist = codelist_cls.from_directory(dim, path / dim, self.config)
            setattr(self, dim, codelist)

        if empty := [d for d in self.dimensions if not getattr(self, d)]:
            raise ValueError(f"Empty codelist: {', '.join(empty)}")

    def validate(self, df: IamDataFrame, dimensions: list = None) -> None:
        """Validate that the coordinates of `df` are defined in the codelists

        Parameters
        ----------
        df : :class:`pyam.IamDataFrame`
            Scenario data to be validated against the codelists of this instance.
        dimensions : list of str, optional
            Dimensions to perform validation (defaults to all dimensions of self)

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If `df` fails validation against any codelist.
        """
        # `validate` resolves to the module-level function imported from
        # `nomenclature.validation`, not to this method
        validate(self, df, dimensions=dimensions or self.dimensions)

    def check_aggregate(self, df: IamDataFrame, **kwargs) -> "pd.DataFrame | None":
        """Check for consistency of scenario data along the variable hierarchy

        Parameters
        ----------
        df : :class:`pyam.IamDataFrame`
            Scenario data to be checked for consistency along the variable hierarchy.
        kwargs : Tolerance arguments for comparison of values
            Passed to :any:`numpy.isclose` via :any:`pyam.IamDataFrame.check_aggregate`.

        Returns
        -------
        :class:`pandas.DataFrame` or None
            Data where a variable and its computed aggregate does not match.

        Raises
        ------
        ValueError
            If the :any:`DataStructureDefinition` does not have a *variable* dimension.
        """
        if "variable" not in self.dimensions:
            raise ValueError("Aggregation check requires 'variable' dimension.")

        lst = []

        with adjust_log_level(level="WARNING"):
            for code in df.variable:
                # NOTE(review): a variable of `df` that is not defined in the
                # codelist presumably fails at this lookup - validate `df` first
                attr = self.variable.mapping[code]
                if not attr.check_aggregate:
                    continue

                components = attr.components

                # check if multiple lists of components are given for a code
                if isinstance(components, dict):
                    for name, _components in components.items():
                        error = df.check_aggregate(code, _components, **kwargs)
                        if error is None:
                            continue
                        error = error.dropna()
                        # append components-name to variable column
                        error.index = replace_index_labels(
                            error.index, "variable", [f"{code} [{name}]"]
                        )
                        lst.append(error)

                # else use components provided as single list or pyam-default (None)
                else:
                    error = df.check_aggregate(code, components, **kwargs)
                    if error is not None:
                        lst.append(error.dropna())

        if not lst:
            return None

        # there may be empty dataframes due to `dropna()` above
        error = pd.concat(lst)
        return None if error.empty else error

    def to_excel(self, excel_writer, **kwargs):
        """Write the codelists to an xlsx spreadsheet

        The first sheet ("project") lists the project name, the creation time of
        the file and the git attributes of the project and of each repository in
        the project configuration. Each dimension is written to its own sheet.

        Parameters
        ----------
        excel_writer : str or :class:`pathlib.Path`
            File path as string or :class:`pathlib.Path`.
        **kwargs
            Passed to :class:`pandas.ExcelWriter`
            (the `engine` defaults to "xlsxwriter").

        Raises
        ------
        ValueError
            If the project repository or any configured repository is dirty.
        """
        kwargs.setdefault("engine", "xlsxwriter")

        with pd.ExcelWriter(excel_writer, **kwargs) as writer:
            # create dataframe with attributes of the DataStructureDefinition
            project = self.project_folder.absolute().parts[-1]
            arg_dict = {
                "project": project,
                "file_created": time_format(datetime.now()),
                # empty row as a separator before the git attributes
                "": "",
            }
            if self.repo is not None:
                arg_dict.update(git_attributes(project, self.repo))

            # collect all frames first and concatenate once
            frames = [make_dataframe(arg_dict)]
            for key, value in self.config.repositories.items():
                frames.append(
                    make_dataframe(git_attributes(key, git.Repo(value.local_path)))
                )
            write_sheet(writer, "project", pd.concat(frames))

            # write codelist for each dimensions to own sheet
            for dim in self.dimensions:
                getattr(self, dim).to_excel(writer, dim, sort_by_code=True)
def time_format(x):
    """Format a datetime-like object as "YYYY-MM-DD HH:MM:SS"

    Parameters
    ----------
    x : :class:`datetime.datetime` or any object with a `strftime` method
        The point in time to be formatted.

    Returns
    -------
    str
    """
    pattern = "%Y-%m-%d %H:%M:%S"
    return x.strftime(pattern)
def git_attributes(name, repo):
    """Collect the url, commit hash and commit timestamp of a git repository

    Parameters
    ----------
    name : str
        Name of the repository, used as prefix of the keys of the returned dict.
    repo : :class:`git.Repo`
        The repository to be described.

    Returns
    -------
    dict
        The entries "<name>.url", "<name>.commit_hash" and
        "<name>.commit_timestamp", all given as strings.

    Raises
    ------
    ValueError
        If the repository is dirty.
    """
    if repo.is_dirty():
        raise ValueError(f"Repository '{name}' is dirty")

    url = repo.remote().url
    # resolve the commit only once so that hash and timestamp are consistent
    commit = repo.commit()
    return {
        f"{name}.url": url,
        # store the hash as plain string instead of the `git.Commit` object,
        # so that the value can be written to a spreadsheet cell
        f"{name}.commit_hash": commit.hexsha,
        f"{name}.commit_timestamp": time_format(commit.committed_datetime),
    }
def make_dataframe(data):
    """Cast a dictionary to a two-column dataframe of attributes and values

    Parameters
    ----------
    data : dict
        Mapping of attribute names to their values.

    Returns
    -------
    :class:`pandas.DataFrame`
        One row per item of `data`, with the keys in the column "attribute"
        and the values in the column "value".
    """
    # the keys of `data` become the index, the values the column "value"
    frame = pd.DataFrame.from_dict(data, orient="index", columns=["value"])
    # move the keys from the index to a regular column
    frame = frame.reset_index()
    return frame.rename(columns={"index": "attribute"})