Source code for remayn.result.result

import json
import pickle
import shutil
import time
from hashlib import md5
from pathlib import Path
from typing import Optional, Union
from uuid import uuid4

import numpy as np

from ..utils import NonDefaultStrMethodError, sanitize_json
from .result_data import ResultData


class Result:
    """Represents the result of an experiment.

    It holds the location where the experiment `ResultData` is stored together
    with the experiment information. The `ResultData` is only read from disk
    when it is requested, to save memory and time.

    Attributes
    ----------
    base_path: Path
        Base path where all the experiments are stored.
    id: str
        Unique identifier of the experiment. It is used as the stem of the
        file names (``<id>.json`` and ``<id>.pkl``).
    config: Optional[dict]
        Dictionary containing the parameters used in the experiment. All the
        elements in the dictionary must be JSON serializable. Objects contained
        in this dict should implement a custom __str__ method.
    data\\_: Optional[ResultData]
        Contains the `ResultData` when loaded or None if it was not loaded yet.
        This attribute should not be accessed directly. Use get_data() instead
        to make sure that the `ResultData` is loaded before accessing it.
    data_md5sum_: Optional[str]
        md5sum of the `ResultData` file. It is None for a new `Result` that has
        neither been saved nor loaded from an information file.
    created_at: Optional[float]
        Timestamp when the experiment was first saved. It is None if the
        experiment was not saved yet.
    updated_at: Optional[float]
        Timestamp when the experiment was last saved. It is None if the
        experiment was not saved yet.
    """

    base_path: Path
    id: str
    config: Optional[dict]
    data_: Optional["ResultData"]
    data_md5sum_: Optional[str]
    created_at: Optional[float]
    updated_at: Optional[float]

    # Defining __eq__ without __hash__ already makes instances unhashable;
    # this only states it explicitly. Equality depends on the mutable config.
    __hash__ = None

    def __init__(
        self,
        base_path: Union[str, Path],
        id: Optional[str] = None,
        config: Optional[dict] = None,
    ):
        """Initializes the Result object without loading any `ResultData`.

        Parameters
        ----------
        base_path: Union[str, Path]
            Base path where all the experiments are stored.
        id: Optional[str]
            Unique identifier of the experiment. Will be used for the file
            names. If None, a new unique identifier (uuid4) is generated.
        config: Optional[dict]
            Dictionary containing the parameters used in the experiment.
        """
        self.base_path = Path(base_path)
        self.id = str(uuid4()) if id is None else id
        self.config = config
        self.data_ = None
        self.data_md5sum_ = None
        self.created_at = None
        self.updated_at = None

    def get_data_path(self) -> Path:
        """Gets the path where the ResultData is stored.

        Returns
        -------
        Path
            ``base_path / "<id>.pkl"``.
        """
        return self.base_path / f"{self.id}.pkl"

    def get_info_path(self) -> Path:
        """Gets the path where the experiment information is stored.

        Returns
        -------
        Path
            ``base_path / "<id>.json"``.
        """
        return self.base_path / f"{self.id}.json"

    def __str__(self):
        """Builds a multi-line, human readable summary of this `Result`.

        The summary always contains the config and the info path. When the
        `ResultData` is loaded, the shapes of the stored arrays and the
        remaining fields are listed too ('N/A' for fields that are None).
        """
        lines = [
            f"Config: {json.dumps(sanitize_json(self.config), indent=4)}",
            f"Results info path: {self.get_info_path()}",
        ]

        data = self.data_
        if data is None:
            lines.append("(data not loaded)")
            return "\n".join(lines) + "\n"

        def describe(value, as_shape=False):
            # Missing fields are reported as 'N/A' instead of failing.
            if value is None:
                return "N/A"
            return value.shape if as_shape else value

        lines.append(f"Results data file: {self.get_data_path()}")

        array_fields = (
            ("Targets", "targets"),
            ("Predictions", "predictions"),
            ("Train targets", "train_targets"),
            ("Train predictions", "train_predictions"),
            ("Val targets", "val_targets"),
            ("Val predictions", "val_predictions"),
        )
        for label, attr in array_fields:
            shape = describe(getattr(data, attr), as_shape=True)
            lines.append(f"{label} shape: {shape}")

        plain_fields = (
            ("Time", "time"),
            ("Train history", "train_history"),
            ("Val history", "val_history"),
            ("Best params", "best_params"),
        )
        for label, attr in plain_fields:
            lines.append(f"{label}: {describe(getattr(data, attr))}")

        return "\n".join(lines) + "\n"

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other: Union["Result", dict]):
        """Compares two `Result` objects considering only their config.

        Parameters
        ----------
        other: Union[Result, dict]
            The other Result object or a dictionary containing the config of
            the other experiment.

        Returns
        -------
        bool
            True if the configs are equal and False otherwise. False is also
            returned when `other` is neither a `Result` nor a dict.
        """
        if not isinstance(other, (Result, dict)):
            return False
        return self.compare_config(other)

    def compare_config(self, other: Union["Result", dict]) -> bool:
        """Compares the config of this Result with the config of `other`.

        Both configs are passed through `sanitize_json` before being compared.

        Parameters
        ----------
        other: Union[Result, dict]
            The other Result object or a dictionary containing the config of
            the other experiment.

        Returns
        -------
        bool
            True if the configs are equal and False otherwise.

        Raises
        ------
        TypeError
            If `other` is neither a `Result` nor a dict.
        """
        if isinstance(other, Result):
            other_config = other.config
        elif isinstance(other, dict):
            other_config = other
        else:
            raise TypeError(
                f"Expected a Result or a dict, but got {type(other)} instead."
            )
        return sanitize_json(self.config) == sanitize_json(other_config)

    def load_data(self, force=False):
        """Loads the `ResultData` from the disk into the ``data_`` attribute.

        The md5sum of the pickle file is compared with the one returned by
        get_md5sum() before the content is unpickled. This method is called
        automatically by get_data() when the `ResultData` is needed. If the
        data is already loaded, nothing is done unless force=True.

        Parameters
        ----------
        force: bool, optional, default=False
            If True, the file will be loaded even if it was already loaded.

        Raises
        ------
        FileNotFoundError
            If the `ResultData` file does not exist.
        ValueError
            If the md5sum of the file does not match the stored one. This also
            happens when no md5sum is stored (it is None).
        """
        if self.data_ is not None and not force:
            return

        data_path = self.get_data_path()
        try:
            content = data_path.read_bytes()
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f"ResultData {data_path} does not exist."
                " The experiment is incomplete!"
            ) from err

        if md5(content).hexdigest() != self.get_md5sum():
            raise ValueError(
                f"ResultData {data_path} integrity check failed."
                " The file may have been modified after the experiment."
            )

        # NOTE(review): unpickling executes arbitrary code. The md5sum only
        # detects accidental modification; it is read from the sibling JSON
        # file, so it is not protection against untrusted result folders.
        self.data_ = pickle.loads(content)

    def get_md5sum(self):
        """Gets the stored md5sum of the ResultData file.

        Returns
        -------
        Optional[str]
            The value of ``data_md5sum_`` (None if no md5sum is known yet).
        """
        return self.data_md5sum_

    def get_data(self, force_reload=False):
        """Gets the `ResultData` of the experiment, loading it if needed.

        This method should be used to access the `ResultData` instead of
        accessing the ``data_`` attribute directly.

        Parameters
        ----------
        force_reload: bool, optional, default=False
            If True, the `ResultData` is read from disk again even if it was
            already loaded.

        Returns
        -------
        ResultData
            The `ResultData` object containing the results of the experiment.
            None if the stored `ResultData` is empty.
        """
        if self.data_ is None or force_reload:
            self.load_data(force=force_reload)
        return self.data_

    def set_data(self, data: "ResultData"):
        """Sets the ResultData of the experiment.

        This method should be used instead of setting the ``data_`` attribute
        directly. Nothing is written to disk until save() is called.

        Parameters
        ----------
        data: ResultData
            The ResultData object containing the results of the experiment.
        """
        self.data_ = data

    def get_experiment_info(self):
        """Gets the experiment information as a dictionary.

        Returns
        -------
        dict
            Dictionary with the keys 'config', 'data_md5sum', 'created_at' and
            'updated_at'.
        """
        return {
            "config": self.config,
            "data_md5sum": self.get_md5sum(),
            "created_at": self.created_at,
            "updated_at": self.updated_at,
        }

    def save(self):
        """Saves this `Result` to the disk.

        The experiment information is written to get_info_path() (JSON) and the
        `ResultData` to get_data_path() (pickle). Existing files are
        overwritten and the base directory is created if it does not exist.

        The experiment information is validated before any file is written, so
        a failed save leaves previously saved files and the in-memory
        timestamps/md5sum untouched.

        If the `ResultData` has not been loaded (``data_`` is None) but an
        md5sum is known and the data file exists, the data file is left as it
        is instead of being replaced by an empty pickle.

        Returns
        -------
        Result
            The `Result` object itself.

        Raises
        ------
        ValueError
            If `sanitize_json` rejects the experiment information
            (`NonDefaultStrMethodError`).
        """
        info_path = self.get_info_path()
        data_path = self.get_data_path()
        current_time = time.time()

        # Create directory
        self.base_path.mkdir(parents=True, exist_ok=True)

        # A Result returned by load() has data_ None until get_data() is
        # called. Pickling data_ in that state would overwrite the real
        # ResultData with None, so the existing file is kept instead.
        keep_existing_data = (
            self.data_ is None
            and self.data_md5sum_ is not None
            and data_path.exists()
        )
        if keep_existing_data:
            payload = None
            new_md5sum = self.data_md5sum_
        else:
            payload = pickle.dumps(self.data_)
            new_md5sum = md5(payload).hexdigest()

        # Remember the current state so that it can be restored on failure.
        previous_state = (self.data_md5sum_, self.created_at, self.updated_at)

        # Set time stamps and md5sum (read back by get_experiment_info()).
        if self.created_at is None:
            self.created_at = current_time
        self.updated_at = current_time
        self.data_md5sum_ = new_md5sum

        # Kept only because the error message below prints it; it is always
        # None when the exception is raised.
        safe_info = None
        try:
            safe_info = sanitize_json(
                self.get_experiment_info(), accept_default_str=False
            )
        except NonDefaultStrMethodError as err:
            self.data_md5sum_, self.created_at, self.updated_at = previous_state
            raise ValueError(
                "Experiment info contains some fields that are subject to change when"
                " the experiment is loaded from disk. Please, make sure that all the"
                " elements within the config are JSON serializable and show a"
                f" deterministic representation.\n{safe_info=}"
            ) from err

        # Save ResultData
        if payload is not None:
            data_path.write_bytes(payload)

        # Save experiment info
        with open(info_path, "w", encoding="utf-8") as f:
            json.dump(safe_info, f, indent=4)

        return self

    def delete(self, missing_ok=False):
        """Deletes the information file (json) and the ResultData file (pickle).

        The information file is removed first; if that fails, the data file is
        not touched.

        Parameters
        ----------
        missing_ok: bool, optional, default=False
            If True, the method will not raise an error if the files do not
            exist.

        Raises
        ------
        FileNotFoundError
            If the experiment information file or the `ResultData` file does
            not exist and `missing_ok` is False.

        Returns
        -------
        bool
            True if no error was raised.
        """
        self.get_info_path().unlink(missing_ok=missing_ok)
        self.get_data_path().unlink(missing_ok=missing_ok)
        return True

    @staticmethod
    def load(base_path: Union[str, Path], id: str) -> "Result":
        """Loads a `Result` from the disk.

        Only the experiment information file is read. The `ResultData` is not
        loaded here; it is read on the first call to get_data().

        Parameters
        ----------
        base_path: Union[str, Path]
            Base path where all the experiments are stored.
        id: str
            Unique identifier of the experiment.

        Returns
        -------
        Result
            A new `Result` object with config, md5sum and timestamps taken
            from the information file.

        Raises
        ------
        FileNotFoundError
            If the experiment information file does not exist.
        ValueError
            If the experiment information file is not a valid json file.
            If the experiment information file does not contain a JSON object.
            If the experiment information file does not contain the 'config'
            key.
            If the experiment information file does not contain the
            'data_md5sum' key.

        Examples
        --------
        >>> from remayn.result import Result
        >>> result = Result.load("./results", "123")
        """
        result = Result(base_path=base_path, id=id)
        info_path = result.get_info_path()

        try:
            # json.JSONDecodeError is a subclass of ValueError, so malformed
            # files surface as the documented ValueError.
            with open(info_path, "r", encoding="utf-8") as f:
                info = json.load(f)
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f"Experiment information file {info_path} does not exist."
            ) from err

        # Valid JSON that is not an object (number, null, ...) used to fail
        # with a TypeError on the membership tests below.
        if not isinstance(info, dict):
            raise ValueError(
                f"Experiment information file {info_path} does not contain a JSON"
                " object. It is not a valid experiment information file."
            )

        if "config" not in info:
            raise ValueError(
                f"Experiment information file {info_path} does not contain the 'config'"
                " key. It is not a valid experiment information file."
            )
        if "data_md5sum" not in info:
            raise ValueError(
                f"Experiment information file {info_path} does not contain the"
                " 'data_md5sum' key. It is not a valid experiment information file."
            )

        result.config = info["config"]
        result.data_md5sum_ = info["data_md5sum"]
        result.created_at = info.get("created_at")
        result.updated_at = info.get("updated_at")
        return result

    def copy_to(self, base_path: Union[str, Path], new_id: Optional[str] = None):
        """Copies this experiment to a new directory.

        If the information file of this `Result` exists on disk, the files are
        copied as they are (the data file only if it exists) and the copy is
        loaded from the new location. Otherwise a new `Result` is created from
        the config and the `ResultData` of this object and saved.

        Parameters
        ----------
        base_path: Union[str, Path]
            Base path where the new experiment will be stored. Must be
            different from the current base path.
        new_id: Optional[str], optional, default=None
            Unique identifier of the new experiment. If None, the same
            identifier will be used.

        Returns
        -------
        Result
            A new `Result` object stored in the new directory.

        Raises
        ------
        ValueError
            If `base_path` is equal to the current base path.

        Examples
        --------
        >>> from remayn.result import Result
        >>> result = Result.load("./results", "123")
        >>> new_result = result.copy_to("./new_results")
        """
        base_path = Path(base_path)
        if self.base_path == base_path:
            raise ValueError(
                "The new base path must be different from the current base path."
            )

        if new_id is None:
            new_id = self.id

        base_path.mkdir(parents=True, exist_ok=True)

        source_info_path = self.get_info_path()
        source_data_path = self.get_data_path()

        if not source_info_path.exists():
            # Nothing on disk yet: build the copy from the in-memory state.
            new_result = Result(base_path, new_id, self.config)
            new_result.set_data(self.get_data())
            new_result.save()
            return new_result

        shutil.copy(source_info_path, base_path / f"{new_id}.json")
        if source_data_path.exists():
            shutil.copy(source_data_path, base_path / f"{new_id}.pkl")
        return Result.load(base_path, new_id)
def make_result(
    base_path: Union[str, Path],
    config: dict,
    targets: np.ndarray,
    predictions: np.ndarray,
    train_targets: Optional[np.ndarray] = None,
    train_predictions: Optional[np.ndarray] = None,
    val_targets: Optional[np.ndarray] = None,
    val_predictions: Optional[np.ndarray] = None,
    time: Optional[float] = None,
    train_history: Optional[np.ndarray] = None,
    val_history: Optional[np.ndarray] = None,
    best_params: Optional[dict] = None,
    best_model: Optional[object] = None,
):
    """Builds an in-memory `Result` together with its `ResultData`.

    A `Result` with a freshly generated identifier is created for `config`
    and a `ResultData` holding all the given values is attached to it.
    Nothing is written to disk; call `save()` on the returned object to
    persist it.

    Parameters
    ----------
    base_path: Union[str, Path]
        Path of the main directory that will contain this `Result` and all the
        other results related to these experiments.
    config: dict
        Dictionary containing the parameters used in the experiment.
    targets: np.ndarray
        Array containing the targets of the experiment. Any shape can be used.
    predictions: np.ndarray
        Array containing the predictions of the experiment. Any shape can be
        used.
    train_targets: Optional[np.ndarray], optional, default=None
        Array containing the training targets of the experiment. Any shape can
        be used.
    train_predictions: Optional[np.ndarray], optional, default=None
        Array containing the training predictions of the experiment. Any shape
        can be used.
    val_targets: Optional[np.ndarray], optional, default=None
        Array containing the validation targets of the experiment. Any shape
        can be used.
    val_predictions: Optional[np.ndarray], optional, default=None
        Array containing the validation predictions of the experiment. Any
        shape can be used.
    time: Optional[float], optional, default=None
        Time spent to run the experiment.
    train_history: Optional[np.ndarray], optional, default=None
        Array containing the training history recorded during the training
        process. It should be a 1D array with the value of the error on each
        iteration.
    val_history: Optional[np.ndarray], optional, default=None
        Array containing the validation history recorded during the training
        process. It should be a 1D array with the value of the error on each
        iteration.
    best_params: Optional[dict], optional, default=None
        Dictionary containing the best parameters found during the experiment.
        It can be used when the experiment employs a cross-validation process.
        It can be left as None if the experiment does not use cross-validation
        or the cross-validation process is split across different experiments.
    best_model: Optional[object], optional, default=None
        Best model found during the experiment.

    Returns
    -------
    Result
        A new `Result` object with the given data. The `Result` is not saved
        in the disk. You can call the `save()` method to save it.

    Examples
    --------
    >>> import numpy as np
    >>> from remayn.result import make_result
    >>> targets = np.array([1, 2, 3])
    >>> predictions = np.array([1.1, 2.2, 3.3])
    >>> config = {"model": "linear_regression"}
    >>> result = make_result("results", config, targets, predictions)
    >>> result.save()
    """
    # No id is passed, so the Result generates a new one.
    new_result = Result(base_path=base_path, config=config)

    data_fields = {
        "targets": targets,
        "predictions": predictions,
        "train_targets": train_targets,
        "train_predictions": train_predictions,
        "val_targets": val_targets,
        "val_predictions": val_predictions,
        "time": time,
        "train_history": train_history,
        "val_history": val_history,
        "best_params": best_params,
        "best_model": best_model,
    }
    experiment_data = ResultData(**data_fields)
    new_result.set_data(experiment_data)

    return new_result