Source code for remayn.report.excel

from pathlib import Path
from typing import List, Optional, Union

import pandas as pd


[docs]def create_excel_summary_report(
    df: pd.DataFrame,
    destination_path: Union[str, Path],
    group_columns: List[str],
    *,
    individual_sheet_name: str = "Individual",
    average_sheet_name: str = "Average",
    std_sheet_name: str = "Std",
    excel_writer: Optional[pd.ExcelWriter] = None,
):
    """Creates a summary report of the given DataFrame and saves it to an Excel file.
    It groups the rows using the given `group_columns` and calculates the mean and
    standard deviation of the numeric columns for each group.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to create the report from.
    destination_path : Union[str, Path]
        The path to save the Excel file. It is ignored if `excel_writer` is provided.
    group_columns : List[str]
        The columns to group the rows by.
    individual_sheet_name : str, optional, default="Individual"
        The name of the sheet for the individual rows.
    average_sheet_name : str, optional, default="Average"
        The name of the sheet for the mean values.
    std_sheet_name : str, optional, default="Std"
        The name of the sheet for the standard deviation values.
    excel_writer : Optional[pd.ExcelWriter], optional, default=None
        A pd.ExcelWriter object that will be used to write the dataframes to an excel
        file. If None, a new Excel file will be created, by default None. Using an
        external ExcelWriter can be useful if you want to write additional sheets to
        the report excel file.

    Returns
    -------
    Path
        The path to the created Excel file.
    """

    destination_path = Path(destination_path)

    mean_df = df.groupby(group_columns).mean(numeric_only=True).reset_index()
    std_df = df.groupby(group_columns).std(numeric_only=True).reset_index()

    destination_path.parent.mkdir(parents=True, exist_ok=True)

    if excel_writer is None:
        with pd.ExcelWriter(destination_path) as writer:
            df.to_excel(writer, sheet_name=individual_sheet_name, index=False)
            mean_df.to_excel(writer, sheet_name=average_sheet_name, index=False)
            std_df.to_excel(writer, sheet_name=std_sheet_name, index=False)
    else:
        df.to_excel(excel_writer, sheet_name=individual_sheet_name, index=False)
        mean_df.to_excel(excel_writer, sheet_name=average_sheet_name, index=False)
        std_df.to_excel(excel_writer, sheet_name=std_sheet_name, index=False)

    return destination_path


[docs]def create_excel_columns_report(
    df: pd.DataFrame,
    destination_path: Union[str, Path],
    metric_columns: List[str],
    pivot_index: str,
    pivot_columns: List[str],
    *,
    excel_writer: Optional[pd.ExcelWriter] = None,
):
    """Create an Excel report with multiple sheets, each containing a pivot table of the
    given DataFrame. The pivot table is created by pivoting the DataFrame using the
    given `pivot_index` and `pivot_columns`. The values of the pivot table are the
    columns specified in `metric_columns`. Each sheet in the Excel file corresponds to
    a column in `metric_columns`.

    To create an Excel file that contains one sheet for each metric, one column for each
    combination of methodology and dataset, and one row for each seed, the metric_columns
    should be the list of metric columns, the pivot_index should be the seed column and
    the pivot_columns should be the methodology and dataset columns.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to create the report from.
    destination_path : Union[str, Path]
        The path to save the Excel file. It is ignored if `excel_writer` is provided.
    metric_columns : List[str]
        The columns to create pivot tables for.
    pivot_index : str
        The column to use as the index of the pivot table.
    pivot_columns : List[str]
        The columns to use as the columns of the pivot table.
    excel_writer : Optional[pd.ExcelWriter], optional, default=None
        A pd.ExcelWriter object that will be used to write the dataframes to an excel
        file. If None, a new Excel file will be created, by default None. Using an
        external ExcelWriter can be useful if you want to write additional sheets to
        the report excel file.

    Returns
    -------
    Path
        The path to the created Excel file.
    """

    destination_path = Path(destination_path)

    destination_path.parent.mkdir(parents=True, exist_ok=True)

    df = df.copy()
    for column in pivot_columns:
        df[column] = df[column].astype(str)

    def _create_pivot_df(column):
        pivot_df = (
            df.pivot(
                index=pivot_index,
                columns=pivot_columns,
                values=column,
            )
            .reset_index()
            .drop(columns=pivot_index, level=0)
        )
        pivot_df.columns = pivot_df.columns.map("_".join)

        return pivot_df

    if excel_writer is None:
        with pd.ExcelWriter(destination_path) as writer:
            for column in metric_columns:
                pivot_df = _create_pivot_df(column)

                pivot_df.to_excel(writer, sheet_name=column, index=False)
    else:
        for column in metric_columns:
            pivot_df = _create_pivot_df(column)

            pivot_df.to_excel(excel_writer, sheet_name=column, index=False)

    return destination_path