# Source code for remayn.result.result

import json
import pickle
import shutil
import time
from hashlib import md5
from pathlib import Path
from typing import Optional, Union
from uuid import uuid4

import numpy as np

from ..utils import NonDefaultStrMethodError, sanitize_json
from .result_data import ResultData


class Result:
    """Represents the result of an experiment.
    It contains the path where the experiment ResultData is stored, along with the
    experiment information.
    The ResultData is only loaded when needed to save memory and time.

    Attributes
    ----------
    base_path: Path
        Base path where all the experiments are stored.
    id: str
        Unique identifier of the experiment.
    config: Optional[dict]
        Dictionary containing the parameters used in the experiment. All the elements
        in the dictionary must be JSON serializable. Objects contained in this dict
        should implement a custom __str__ method.
    data\\_: Optional[ResultData]
        Contains the `ResultData` when loaded or None if it was not loaded yet.
        This attribute should not be accessed directly. Use get_data() instead to
        make sure that the ResultData is properly loaded before accessing it.
    data_md5sum_: Optional[str]
        md5sum of the ResultData file. It is None for a new Result that was neither
        saved nor loaded from disk yet.
    created_at: Optional[float]
        Timestamp when the experiment was created. It is None if the experiment was not
        saved yet.
    updated_at: Optional[float]
        Timestamp when the experiment was last updated. It is None if the experiment was
        not saved yet.
    """

    base_path: Path
    id: str
    config: Optional[dict]
    data_: Optional["ResultData"]
    data_md5sum_: Optional[str]
    created_at: Optional[float]
    updated_at: Optional[float]

    def __init__(
        self,
        base_path: Union[str, Path],
        id: Optional[str] = None,
        config: Optional[dict] = None,
    ):
        """Initializes the Result object.
        By default, it does not load the whole ResultData.

        Parameters
        ----------
        base_path: Union[str, Path]
            Base path where all the experiments are stored.
        id: Optional[str]
            Unique identifier of the experiment. Will be used for the file names. If
            None, a new unique identifier will be generated.
        config: Optional[dict]
            Dictionary containing the parameters used in the experiment.
        """

        self.base_path = Path(base_path)
        self.id = str(uuid4()) if id is None else id
        self.config = config
        self.data_ = None
        self.data_md5sum_ = None
        self.created_at = None
        self.updated_at = None

    def get_data_path(self):
        """Gets the path where the ResultData is stored.

        Returns
        -------
        Path
            Path where the ResultData is stored (``<base_path>/<id>.pkl``).
        """

        return self.base_path / f"{self.id}.pkl"

    def get_info_path(self):
        """Gets the path where the experiment information is stored.

        Returns
        -------
        Path
            Path where the experiment information is stored
            (``<base_path>/<id>.json``).
        """

        return self.base_path / f"{self.id}.json"

    def __str__(self):
        """Builds a human readable summary of the config and, when the ResultData is
        loaded, of the shapes and values it holds."""

        header = f"Config: {json.dumps(sanitize_json(self.config), indent=4)}"

        if self.data_ is None:
            return (
                f"{header}\n"
                f"Results info path: {self.get_info_path()} (data not loaded)\n"
            )

        data = self.data_

        def shape_of(array):
            return array.shape if array is not None else "N/A"

        def or_na(value):
            return value if value is not None else "N/A"

        lines = [
            header,
            f"Results info path: {self.get_info_path()}",
            f"Results data file: {self.get_data_path()}",
            "",
            f"Targets shape: {shape_of(data.targets)}",
            f"Predictions shape: {shape_of(data.predictions)}",
            f"Train targets shape: {shape_of(data.train_targets)}",
            f"Train predictions shape: {shape_of(data.train_predictions)}",
            f"Val targets shape: {shape_of(data.val_targets)}",
            f"Val predictions shape: {shape_of(data.val_predictions)}",
            "",
            f"Time: {or_na(data.time)}",
            f"Train history: {or_na(data.train_history)}",
            f"Val history: {or_na(data.val_history)}",
            f"Best params: {or_na(data.best_params)}",
        ]
        return "\n".join(lines) + "\n"

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other: Union["Result", dict]):
        """Compares two `Result` objects considering only their config.
        It returns False if the type of the other object is not a `Result` or a dict.

        Note that, because `__eq__` is defined without `__hash__`, instances of this
        class are not hashable.

        Parameters
        ----------
        other: Union[Result, dict]
            The other Result object or a dictionary containing the config of the
            other experiment.

        Returns
        -------
        bool
            True if the configs are equal and False otherwise.
        """

        if not isinstance(other, (Result, dict)):
            return False

        return self.compare_config(other)

    def compare_config(self, other: Union["Result", dict]) -> bool:
        """Compare the config of this Result with the config of `other` Result. It
        returns True if the configs are equal and False otherwise.

        Parameters
        ----------
        other: Union[Result, dict]
            The other Result object or a dictionary containing the config of the
            other experiment.

        Returns
        -------
        bool
            True if the configs are equal and False otherwise.

        Raises
        ------
        TypeError
            If `other` is neither a `Result` nor a dict.
        """

        if isinstance(other, Result):
            other_config = other.config
        elif isinstance(other, dict):
            other_config = other
        else:
            raise TypeError(
                f"Expected a Result or a dict, but got {type(other)} instead."
            )

        # Both sides go through sanitize_json so they are compared in the same form.
        return sanitize_json(self.config) == sanitize_json(other_config)

    def load_data(self, force=False):
        """Load the `ResultData` from the disk.
        This method reads the `ResultData` from the disk and stores it in the `data\\_`
        attribute. It also checks the integrity of the pickle file using the md5sum.
        This method is called automatically by get_data() when the `ResultData` is
        needed. However, you can call it manually to force the loading of the file.
        If the file was already loaded, this method does nothing, unless force=True is
        passed as an argument.

        Parameters
        ----------
        force: bool, optional, default=False
            If True, the file will be loaded even if it was already loaded.

        Raises
        ------
        FileNotFoundError
            If the `ResultData` does not exist.
        ValueError
            If the md5sum of the file does not match the one stored in the experiment
            information.
        """

        if self.data_ is not None and not force:
            return

        data_path = self.get_data_path()

        if not data_path.exists():
            raise FileNotFoundError(
                f"ResultData {data_path} does not exist."
                " The experiment is incomplete!"
            )

        with open(data_path, "rb") as f:
            content = f.read()

        if md5(content).hexdigest() != self.get_md5sum():
            raise ValueError(
                f"ResultData {data_path} integrity check failed."
                " The file may have been modified after the experiment."
            )

        # NOTE: unpickling can execute arbitrary code. The md5 check above only
        # detects a mismatch with the stored experiment info; it is not a defence
        # against untrusted files. Only load results from sources you trust.
        self.data_ = pickle.loads(content)

    def get_md5sum(self):
        """Gets the md5sum of the ResultData file, which is stored in the experiment
        information.

        Returns
        -------
        Optional[str]
            The md5sum of the ResultData file, or None if this Result was neither
            saved nor loaded from disk yet.
        """

        return self.data_md5sum_

    def get_data(self, force_reload=False):
        """Gets the `ResultData` of the experiment. If it was not loaded yet, it loads it
        from the disk. If the file was already loaded, it returns the stored object.
        This method should be used to access the `ResultData` instead of accessing the
        `data\\_` attribute directly.

        Parameters
        ----------
        force_reload: bool, optional, default=False
            If True, the `ResultData` will be reloaded even if it was already loaded.
            If False, it will only load the `ResultData` when it has not been loaded yet.

        Returns
        -------
        ResultData
            The `ResultData` object containing the results of the experiment. None if the
            `ResultData` is empty.
        """

        if self.data_ is None or force_reload:
            self.load_data(force=force_reload)

        return self.data_

    def set_data(self, data: "ResultData"):
        """Sets the ResultData of the experiment.
        This method should be used to set the ResultData instead of setting the `data\\_`
        attribute directly.

        Parameters
        ----------
        data: ResultData
            The ResultData object containing the results of the experiment.
        """

        self.data_ = data

    def get_experiment_info(self):
        """Gets all the experiment info as a dictionary, including experiment config,
        md5sum of the ResultData file and the creation / update timestamps.

        Returns
        -------
        dict
            Dictionary with the keys ``config``, ``data_md5sum``, ``created_at`` and
            ``updated_at``.
        """

        return {
            "config": self.config,
            "data_md5sum": self.get_md5sum(),
            "created_at": self.created_at,
            "updated_at": self.updated_at,
        }

    def save(self):
        """Saves this `Result` to the disk.
        It saves the experiment information in the file returned by get_info_path()
        and the `ResultData` in the pickle file returned by get_data_path().
        If the files already exist, they will be overwritten.
        If the directory where the files should be saved does not exist, it will be
        created.

        Nothing is written to disk (and the timestamps / md5sum of this object are
        left untouched) when the experiment information cannot be saved safely, so a
        previously saved version of this experiment is never damaged by a failed
        save.

        Returns
        -------
        Result
            The `Result` object itself.

        Raises
        ------
        ValueError
            If the experiment information contains elements whose string
            representation is not deterministic.
        """

        info_path = self.get_info_path()
        data_path = self.get_data_path()

        # Serialize in memory first: the md5sum is computed from exactly the bytes
        # that will be written, and no file is touched until validation succeeds.
        content = pickle.dumps(self.data_)

        previous_state = (self.data_md5sum_, self.created_at, self.updated_at)

        current_time = time.time()
        if self.created_at is None:
            self.created_at = current_time
        self.updated_at = current_time
        self.data_md5sum_ = md5(content).hexdigest()

        experiment_info = self.get_experiment_info()
        try:
            safe_info = sanitize_json(experiment_info, accept_default_str=False)
        except NonDefaultStrMethodError as err:
            # Roll back so the object still describes what is (or is not) on disk.
            self.data_md5sum_, self.created_at, self.updated_at = previous_state

            raise ValueError(
                "Experiment info contains some fields that are subject to change when"
                " the experiment is loaded from disk. Please, make sure that all the"
                " elements within the config are JSON serializable and show a"
                f" deterministic representation.\n{experiment_info=}"
            ) from err

        self.base_path.mkdir(parents=True, exist_ok=True)

        with open(data_path, "wb") as f:
            f.write(content)

        with open(info_path, "w", encoding="utf-8") as f:
            json.dump(safe_info, f, indent=4)

        return self

    def delete(self, missing_ok=False):
        """Deletes the experiment information file (json) and the ResultData file
        (pickle) from the disk.

        Parameters
        ----------
        missing_ok: bool, optional, default=False
            If True, the method will not raise an error if the files do not exist.

        Raises
        ------
        FileNotFoundError
            If the experiment information file or the `ResultData` file does not exist
            and `missing_ok` is False.

        Returns
        -------
        bool
            True if the files were deleted successfully.
        """

        self.get_info_path().unlink(missing_ok=missing_ok)
        self.get_data_path().unlink(missing_ok=missing_ok)

        return True

    @classmethod
    def load(cls, base_path: Union[str, Path], id: str) -> "Result":
        """Loads a `Result` from the disk.
        It loads the experiment information from the disk and creates a new `Result`
        object with it. The `ResultData` itself is not read here; it is loaded
        lazily by get_data().

        Parameters
        ----------
        base_path: Union[str, Path]
            Base path where all the experiments are stored.
        id: str
            Unique identifier of the experiment.

        Returns
        -------
        Result
            A new `Result` object (an instance of the class this method is called on)
            with the loaded experiment information.

        Raises
        ------
        FileNotFoundError
            If the experiment information file does not exist.
        ValueError
            If the experiment information file is not a valid json file.
            If the experiment information file does not contain a JSON object.
            If the experiment information file does not contain the 'config' key.
            If the experiment information file does not contain the 'data_md5sum' key.

        Examples
        --------
        >>> from remayn.result import Result
        >>> result = Result.load("./results", "123")
        """

        result = cls(base_path=base_path, id=id)

        info_path = result.get_info_path()

        try:
            with open(info_path, "r", encoding="utf-8") as f:
                # json.JSONDecodeError is a ValueError, as documented above.
                info = json.load(f)
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f"Experiment information file {info_path} does not exist."
            ) from err

        # A JSON scalar or list would otherwise fail below with a TypeError.
        if not isinstance(info, dict):
            raise ValueError(
                f"Experiment information file {info_path} does not contain a JSON"
                " object. It is not a valid experiment information file."
            )

        if "config" not in info:
            raise ValueError(
                f"Experiment information file {info_path} does not contain the 'config'"
                " key. It is not a valid experiment information file."
            )
        result.config = info["config"]

        if "data_md5sum" not in info:
            raise ValueError(
                f"Experiment information file {info_path} does not contain the"
                " 'data_md5sum' key. It is not a valid experiment information file."
            )
        result.data_md5sum_ = info["data_md5sum"]

        result.created_at = info.get("created_at")
        result.updated_at = info.get("updated_at")

        return result

    def copy_to(self, base_path: Union[str, Path], new_id: Optional[str] = None):
        """Copies the experiment information and the `ResultData` to a new directory.
        If this `Result` was already saved, its files are copied as they are on disk.
        Otherwise, a new `Result` with the same config and data is created and saved
        in the new directory. The new `Result` can have a new unique identifier.

        Parameters
        ----------
        base_path: Union[str, Path]
            Base path where the new experiment will be stored. Must be different from the
            current base path.
        new_id: Optional[str], optional, default=None
            Unique identifier of the new experiment. If None, the same identifier will be
            used.

        Returns
        -------
        Result
            A new `Result` object with the same data saved in the new directory.

        Raises
        ------
        ValueError
            If `base_path` points to the same directory as the current base path.

        Examples
        --------
        >>> from remayn.result import Result
        >>> result = Result.load("./results", "123")
        >>> new_result = result.copy_to("./new_results")
        """

        base_path = Path(base_path)

        # Compare resolved paths so that different spellings of the same directory
        # (relative vs absolute, "a/../a", symlinks) are detected as well.
        if self.base_path.resolve() == base_path.resolve():
            raise ValueError(
                "The new base path must be different from the current base path."
            )

        if new_id is None:
            new_id = self.id

        base_path.mkdir(parents=True, exist_ok=True)

        source_info_path = self.get_info_path()
        source_data_path = self.get_data_path()

        if source_info_path.exists():
            shutil.copy(source_info_path, base_path / f"{new_id}.json")

            if source_data_path.exists():
                shutil.copy(source_data_path, base_path / f"{new_id}.pkl")

            return type(self).load(base_path, new_id)

        # Never saved: build the copy from the in-memory state instead.
        new_result = type(self)(base_path, new_id, self.config)
        new_result.set_data(self.get_data())
        new_result.save()

        return new_result


def make_result(
    base_path: Union[str, Path],
    config: dict,
    targets: np.ndarray,
    predictions: np.ndarray,
    train_targets: Optional[np.ndarray] = None,
    train_predictions: Optional[np.ndarray] = None,
    val_targets: Optional[np.ndarray] = None,
    val_predictions: Optional[np.ndarray] = None,
    time: Optional[float] = None,
    train_history: Optional[np.ndarray] = None,
    val_history: Optional[np.ndarray] = None,
    best_params: Optional[dict] = None,
    best_model: Optional[object] = None,
):
    """Build an unsaved `Result` together with its `ResultData`.
    A `Result` with a freshly generated identifier is created for `config`, and a
    `ResultData` holding every array / value given here is attached to it. Nothing
    is written to disk by this function; call `save()` on the returned object to
    persist it.

    Parameters
    ----------
    base_path: Union[str, Path]
        Path of the main directory that will contain this `Result` and all the other
        results related to these experiments.
    config: dict
        Dictionary containing the parameters used in the experiment.
    targets: np.ndarray
        Array containing the targets of the experiment. Any shape can be used.
    predictions: np.ndarray
        Array containing the predictions of the experiment. Any shape can be used.
    train_targets: Optional[np.ndarray], optional, default=None
        Array containing the training targets of the experiment. Any shape can be used.
    train_predictions: Optional[np.ndarray], optional, default=None
        Array containing the training predictions of the experiment. Any shape can be
        used.
    val_targets: Optional[np.ndarray], optional, default=None
        Array containing the validation targets of the experiment. Any shape can be
        used.
    val_predictions: Optional[np.ndarray], optional, default=None
        Array containing the validation predictions of the experiment. Any shape can be
        used.
    time: Optional[float], optional, default=None
        Time spent to run the experiment.
    train_history: Optional[np.ndarray], optional, default=None
        Array containing the training history recorded during the training process. It
        should be a 1D array with the value of the error on each iteration.
    val_history: Optional[np.ndarray], optional, default=None
        Array containing the validation history recorded during the training process. It
        should be a 1D array with the value of the error on each iteration.
    best_params: Optional[dict], optional, default=None
        Dictionary containing the best parameters found during the experiment. It can be
        used in case that the experiment employs a cross-validation process. It can be
        left as None if the experiment does not use a cross-validation process or the
        cross-validation process is split across different experiments.
    best_model: Optional[object], optional, default=None
        Best model found during the experiment.

    Returns
    -------
    Result
        A new `Result` object with the given data. The `Result` is not saved in the
        disk. You can call the `save()` method to save it.

    Examples
    --------
    >>> import numpy as np
    >>> from remayn.result import make_result
    >>> targets = np.array([1, 2, 3])
    >>> predictions = np.array([1.1, 2.2, 3.3])
    >>> config = {"model": "linear_regression"}
    >>> result = make_result("results", config, targets, predictions)
    >>> result.save()
    """

    # No id is passed, so the Result generates a new unique one.
    new_result = Result(base_path=base_path, config=config)

    experiment_data = ResultData(
        targets=targets,
        predictions=predictions,
        train_targets=train_targets,
        train_predictions=train_predictions,
        val_targets=val_targets,
        val_predictions=val_predictions,
        time=time,
        train_history=train_history,
        val_history=val_history,
        best_params=best_params,
        best_model=best_model,
    )
    new_result.set_data(experiment_data)

    return new_result