# Source code for modelrunner.run.results

"""
Classes that describe results of simulations of models

.. codeauthor:: David Zwicker <david.zwicker@ds.mpg.de>
"""

from __future__ import annotations

import collections
import inspect
import itertools
import logging
import warnings
from pathlib import Path
from typing import Any, Iterator, List

import numpy as np
from tqdm.auto import tqdm

from ..model.base import ModelBase
from ..storage import Location, StorageGroup, StorageID, open_storage, storage_actions
from ..storage.access_modes import ModeType
from ..storage.attributes import Attrs
from ..storage.utils import encode_class


class MockModel(ModelBase):
    """Helper class storing parameter values when the original model is unavailable.

    Calling an instance is not supported; it only carries the `parameters` dict.
    """

    def __init__(self, parameters: dict[str, Any] | None = None):
        """
        Args:
            parameters (dict): A dictionary of parameters
        """
        # skip validity checks since the real model class is not available
        self.parameters = self._parse_parameters(parameters, check_validity=False)

    def __call__(self):
        # a mock model has no dynamics to run
        raise RuntimeError(f"{self.__class__.__name__} cannot be called")

    def __repr__(self):
        return f"{self.__class__.__name__}({self.parameters})"
class Result:
    """describes the result of a single model run together with auxiliary information

    Besides storing the final outcome of the model in
    :attr:`~modelrunner.run.results.Result.result`, the class also stores information
    about the original model in :attr:`~modelrunner.run.results.Result.model`,
    additional information in :attr:`~modelrunner.run.results.Result.info`, and
    potentially arbitrary objects that were added during the model run in
    :attr:`~modelrunner.run.results.Result.storage`.

    .. note::
        The result is represented as a hierarchical structure when saved using the
        :mod:`~modelrunner.storage`. The actual result is stored in the `result`
        group, whereas the model information can be found in the `_model` group.
        Additional information is stored in the root attribute. Thus, the full
        :class:`Result` can be read using :code:`storage[loc]`, where `loc` denotes
        the result location. If only the actual result is needed,
        :code:`storage[loc + "/result"]` can be read.
    """

    _format_version = 3
    """int: number indicating the version of the file format"""

    model: ModelBase
    """:class:`ModelBase`: Model that was run. This is a
    :class:`~modelrunner.run.results.MockModel` instance if details are not
    available"""

    result: Any
    """the final outcome of the model"""

    storage: StorageGroup | None
    """:class:`StorageGroup`: Storage that might contain additional information,
    e.g., stored during the model run"""

    info: dict[str, Any] | None
    """dict: Additional information for this result"""

    def __init__(
        self,
        model: ModelBase,
        result: Any,
        *,
        storage: StorageGroup | None = None,
        info: dict[str, Any] | None = None,
    ):
        """
        Args:
            model (:class:`ModelBase`): The model from which the result was obtained
            result: The actual result
            storage: A storage containing additional data from the model run
            info (dict): Additional information for this result
        """
        if not isinstance(model, ModelBase):
            raise TypeError("The model should be of type `ModelBase`")
        self.result = result
        self.model = model
        self.storage = storage
        self.info: Attrs = {} if info is None else info

    @property
    def data(self):
        """direct access to the underlying state data

        Deprecated alias of :attr:`result`.
        """
        # deprecated on 2024-04-13; `stacklevel=2` makes the warning point at the
        # caller's line instead of this property
        warnings.warn(
            "`.data` attribute was renamed to `.result`",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.result
[docs] @classmethod def from_data( cls, model_data: dict[str, Any], result, *, model: ModelBase | None = None, storage: StorageGroup | None = None, info: dict[str, Any] | None = None, ) -> Result: """create result from data Args: model_data (dict): The data identifying the model result: The actual result data model (:class:`ModelBase`): The model from which the result was obtained storage: A storage containing additional data from the model run info (dict): Additional information for this result Returns: :class:`Result`: The result object """ if model is None: model_cls: type[ModelBase] = MockModel else: model_cls = model if inspect.isclass(model) else model.__class__ if not model_data: warnings.warn("Model data not found") model = model_cls(model_data.get("parameters", {})) model.name = model_data.get("name") model.description = model_data.get("description") return cls(model, result, storage=storage, info=info)
@property def parameters(self) -> dict[str, Any]: return self.model.parameters
    @classmethod
    def from_file(
        cls,
        storage: StorageID,
        loc: Location = None,
        *,
        model: ModelBase | None = None,
    ):
        """load object from a file

        This function loads the results from a hierarchical storage. It also attempts
        to read information about the model that was used to create this result and
        additional data that might have been stored in a
        :attr:`~modelrunner.results.Result.storage` while the model was running.

        Args:
            storage (str or :class:`zarr.Store`):
                Path or instance describing the storage, which is either a file path
                or a :class:`zarr.Storage`.
            loc:
                The location where the result is stored in the storage. This should
                rarely be modified.
            model (:class:`~modelrunner.model.ModelBase`):
                The model which lead to this result
        """
        if isinstance(storage, (str, Path)) and (isinstance(loc, str) or loc is None):
            # check whether the file was written with an old format version
            from .compatibility.triage import result_check_load_old_version

            result = result_check_load_old_version(Path(storage), loc=loc, model=model)
            if result is not None:
                return result  # Result created from old version

        # assume that file was written with latest format version
        with open_storage(storage, mode="read") as storage_obj:
            attrs = storage_obj.read_attrs(loc)
            format_version = attrs.pop("format_version", None)
            if format_version == cls._format_version:
                # current version of storing results
                if "storage" in storage_obj:
                    # NOTE(review): presumably a separate handle is opened here so the
                    # extra data stays readable after `storage_obj` closes — confirm
                    data_storage = open_storage(storage, loc="storage", mode="read")
                else:
                    data_storage = None
                return cls.from_data(
                    model_data=storage_obj.get("_model", {}),
                    result=storage_obj.read_item("result", use_class=False),
                    model=model,
                    storage=data_storage,
                    info=attrs.pop("info", {}),  # load additional info,
                )
            else:
                raise RuntimeError(f"Cannot read format version {format_version}")
[docs] def to_file( self, storage: StorageID, loc: Location = None, *, mode: ModeType = "insert" ) -> None: """write the results to a file Note that this does only write the actual `results` but omits additional data that might have been stored in a storage that is associated with the results. Args: storage (:class:`StorageBase` or :class:`StorageGroup`): The storage where the group is defined. If this is a :class:`StorageGroup` itself, `loc` is interpreted relative to that group loc (str or list of str): Denotes the location (path) of the group within the storage mode (str or :class:`~modelrunner.storage.access_modes.ModeType`): The file mode with which the storage is accessed, which determines the allowed operations. Common options are "read", "full", "append", and "truncate". """ with open_storage(storage, loc=loc, mode=mode) as storage_obj: # collect attributes from the result attrs: Attrs = { # "model": dict(self.model._state_attributes), "format_version": self._format_version, "__class__": encode_class(self.__class__), } if self.info: attrs["info"] = self.info # write the actual data storage_obj.write_attrs([], attrs=attrs) storage_obj.write_object("_model", dict(self.model._state_attributes)) storage_obj.write_object("result", self.result)
# register read/write actions so that `Result` objects can be loaded and stored
# transparently through the generic storage interface
storage_actions.register("read_item", Result, Result.from_file)
storage_actions.register(
    "write_item", Result, lambda store, loc, result: result.to_file(store, loc)
)
class ResultCollection(List[Result]):
    """represents a collection of results

    Behaves like a list of :class:`Result` instances while adding convenience
    methods for loading results from a folder, filtering, grouping, sorting, and
    summarizing them as a dataframe.
    """
[docs] @classmethod def from_folder( cls, folder: str | Path, pattern: str = "*.*", model: ModelBase | None = None, *, strict: bool = False, progress: bool = False, ): """create results collection from a folder args: folder (str): Path to the folder that is scanned pattern (str): Filename pattern that is used to detect result files model (:class:`~modelrunner.model.ModelBase`): Base class from which models are initialized strict (bool): Whether to raise an exception or just emit a warning when a file cannot be read progress (bool): Flag indicating whether a progress bar is shown """ logger = logging.getLogger(cls.__name__) folder = Path(folder) if not folder.is_dir(): logger.warning(f"{folder} is not a directory") # iterate over all files and load them as a Result results = [] for path in tqdm(list(folder.glob(pattern)), disable=not progress): if path.is_file(): try: result = Result.from_file(path, model=model) except Exception as err: if strict: err.args = (str(err) + f"\nError reading file `{path}`",) raise else: logger.warning(f"Error reading file `{path}`") else: results.append(result) # raise a warning if no results were detected if not results: if pattern == "*.*": logger.warning("Did not find any files") else: logger.warning( f"Did not find any files. Is pattern `{pattern}` too restrictive?" ) return cls(results)
def __repr__(self): return f"{self.__class__.__name__}(<{len(self)} Results>)" __str__ = __repr__ def __add__(self, other: ResultCollection) -> ResultCollection: # type: ignore if isinstance(other, ResultCollection): return ResultCollection(super().__add__(other)) @property def same_model(self) -> bool: """bool: flag determining whether all results are from the same model""" if len(self) < 2: return True model_cls = self[0].model.__class__ keys = self[0].model.parameters.keys() return all( res.model.__class__ == model_cls and res.model.parameters.keys() == keys for res in self ) @property def parameters(self) -> dict[str, set[Any]]: """dict: the parameter values in this result collection Note that parameters that are lists in the individual models are turned into tuples, so they can be handled efficiently, e.g., in sets. """ params = collections.defaultdict(set) for result in self: for k, v in result.model.parameters.items(): if isinstance(v, list): v = tuple(v) # work around to make lists hashable params[k].add(v) return dict(params) @property def constant_parameters(self) -> dict[str, Any]: """dict: the parameters that are constant in this result collection""" return { k: next(iter(v)) # get the single item from the set for k, v in self.parameters.items() if len(v) == 1 } @property def varying_parameters(self) -> dict[str, list[Any]]: """dict: the parameters that vary in this result collection""" return {k: sorted(v) for k, v in self.parameters.items() if len(v) > 1}
[docs] def get(self, **kwargs) -> Result: """return a single result with the given parameters Warning: If there are multiple results compatible with the specified parameters, only the first one is returned. Args: **kwargs: Specify parameter values of result that is returned Returns: :class:`Result`: A single result from the collection """ # return the first result that matches the requirements for item in self: if all(item.parameters[k] == v for k, v in kwargs.items()): return item raise ValueError("Result not contained in collection")
[docs] def filtered(self, **kwargs) -> ResultCollection: r"""return a subset of the results Args: **kwargs: Specify parameter values of results that are retained Returns: :class:`ResultColelction`: The filtered collection """ # return a filtered result collection return self.__class__( item for item in self if all(item.parameters[k] == v for k, v in kwargs.items()) )
[docs] def groupby(self, *args) -> Iterator[tuple[dict[str, list[Any]], ResultCollection]]: r"""group results according to the given variables Args: *args: Specify parameters according to which the results are sorted Returns: generator that allows iterating over the groups. Each iteration returns a dictionary with the current parameters and the associated :class:`ResultCollection`. """ group_values = [self.parameters[name] for name in args] for group_value in itertools.product(*group_values): group_parameters = dict(zip(args, group_value)) subset = self.filtered(**group_parameters) if len(subset) > 0: yield group_parameters, subset
[docs] def sorted(self, *args, reverse: bool = False) -> ResultCollection: r"""return a sorted version of the results Args: *args: Specify parameters according to which the results are sorted reverse (bool): If True, sort in descending order Returns: :class:`ResultColelction`: The filtered collection """ def sort_func(item): """helper function for ordering the results""" return [item.parameters[name] for name in args] return self.__class__(sorted(self, key=sort_func, reverse=reverse))
[docs] def remove_duplicates(self) -> ResultCollection: """remove duplicates in the result collection""" # we cannot use a set for `seen`, since parameters might not always be hashable unique_results, seen = [], [] for result in self: if result.parameters not in seen: unique_results.append(result) seen.append(result.parameters) return self.__class__(unique_results)
@property def dataframe(self): """create a pandas dataframe summarizing the data""" # deprecated on 2023-10-19 warnings.warn("Property `dataframe` deprecated; use method `as_dataframe`") return self.as_dataframe()
[docs] def as_dataframe(self, *, enforce_same_model: bool = True): """create a pandas dataframe summarizing the data Args: enforce_same_model (bool): If True, forces all model results to derive from the same model """ import pandas as pd if enforce_same_model and not self.same_model: raise RuntimeError("Results are not from the same model") def get_data(result): """helper function to extract the data""" df_data = result.parameters.copy() # try obtaining the name of the result if result.info.get("name"): df_data.setdefault("name", result.info["name"]) elif hasattr(result.model, "name"): df_data.setdefault("name", result.model.name) # try interpreting the result data in a format understood by pandas data = result.result if np.isscalar(data): df_data["result"] = data elif isinstance(data, dict): for key, value in data.items(): if np.isscalar(value): df_data[key] = value elif isinstance(value, (list, tuple, np.ndarray)): df_data[key] = np.asarray(value) else: raise RuntimeError("Do not know how to interpret result") return df_data return pd.DataFrame([get_data(result) for result in self])