# Source code for sctoolbox.utils.tables

"""Table related functions."""

import warnings
import pandas as pd
from scipy.stats import zscore
import sctoolbox.utils as utils

# type hint imports
from beartype.typing import Optional, Any, Literal
from beartype import beartype



# [docs]
@beartype
def rename_categories(series: pd.Series) -> pd.Series:
    """
    Replace the category labels of a pandas series with consecutive numbers.

    The series is converted to a categorical and its categories are relabelled
    "1", "2", ..., "<number of categories>" (as strings), following the order
    of the categories.

    Parameters
    ----------
    series : pd.Series
        Pandas Series whose categories should be renamed.

    Returns
    -------
    pd.Series
        Categorical series with the renamed categories.
    """

    as_category = series.astype("category")

    # Map every existing label to its 1-based position, stored as a string
    old_labels = as_category.cat.categories.tolist()
    label_map = {old: str(position) for position, old in enumerate(old_labels, start=1)}

    return as_category.cat.rename_categories(label_map)




# [docs]
@beartype
def fill_na(df: pd.DataFrame,
            inplace: bool = True,
            replace: dict[str, Any] = {"bool": False, "str": "-", "float": 0, "int": 0, "category": ""}) -> Optional[pd.DataFrame]:
    """
    Fill all NA values in a pandas DataFrame depending on the column data type.

    Only columns that contain at least one NA value are touched. Columns whose
    dtype is not boolean, integer, float, string, object or categorical are
    left unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame object with NA values over multiple columns
    inplace : boolean, default True
        Whether the DataFrame object is modified inplace.
    replace :  dict[str, Any], default {"bool": False, "str": "-", "float": 0, "int": 0, "category": ""}
        dict that contains default values to replace NAs depending on data type.
        Missing keys fall back to the defaults above (a warning is issued).
        The given dict itself is never modified.

    Returns
    -------
    Optional[pd.DataFrame]
        DataFrame with replaced NA values if `inplace` is False, otherwise None.
    """

    if not inplace:
        df = df.copy()

    # Warn about every replacement value the caller did not provide
    replace_def = {"bool": False, "str": "-", "float": 0, "int": 0, "category": ""}
    for t in replace_def:
        if t not in replace:
            warnings.warn(f"Value for replace key '{t}' not given. Set to default value: '{replace_def[t]}'")

    # Merge into a fresh dict so neither the caller's dict nor the shared
    # default argument is mutated.
    fill_values = {**replace_def, **replace}

    dtype_checks = pd.api.types
    for nan_col in df.columns[df.isna().any()]:
        column = df[nan_col]
        dtype = column.dtype

        # The dtype predicates also recognise pandas' nullable extension dtypes
        # (e.g. "Int64", "Float64", "boolean", "string").
        if isinstance(dtype, pd.CategoricalDtype):
            value = fill_values["category"]
            # add_categories raises if the category already exists
            if value not in column.cat.categories:
                column = column.cat.add_categories(value)
        elif dtype_checks.is_bool_dtype(dtype):
            value = fill_values["bool"]
        elif dtype_checks.is_integer_dtype(dtype):
            value = fill_values["int"]
        elif dtype_checks.is_float_dtype(dtype):
            value = fill_values["float"]
        elif dtype_checks.is_object_dtype(dtype):
            # Infer the content type from the first non-NA entry; unknown types
            # and all-NA columns are treated like strings.
            non_na = column.dropna()
            o_type = "str" if non_na.empty else type(non_na.iloc[0]).__name__
            value = fill_values.get(o_type, fill_values["str"])
        elif dtype_checks.is_string_dtype(dtype):
            value = fill_values["str"]
        else:
            continue

        # Assign the filled column back instead of calling fillna(inplace=True)
        # on df[nan_col]: that is chained assignment, which does not update
        # `df` when pandas Copy-on-Write is active.
        df[nan_col] = column.fillna(value)

    if not inplace:
        return df
    return None



@beartype
def _sanitize_sheetname(s: str,
                        replace: str = "_") -> str:
    """
    Turn the given string into a valid excel sheetname.

    Characters that are not allowed in sheetnames are substituted via
    `utils.sanitize_string` and the result is cut to the first 31 characters.

    https://www.excelcodex.com/2012/06/worksheets-naming-conventions/

    Parameters
    ----------
    s : str
        String to sanitize
    replace : str, default "_"
        Replacement for the forbidden characters.

    Returns
    -------
    str
        Valid excel sheetname
    """

    forbidden_chars = ["\\", "/", "*", "?", ":", "[", "]"]
    cleaned = utils.sanitize_string(s, char_list=forbidden_chars, replace=replace)

    # Keep at most 31 characters
    return cleaned[:31]



# [docs]
@beartype
def write_excel(table_dict: dict[str, Any],
                filename: str,
                index: bool = False,
                **kwargs: Any) -> None:
    """
    Write a dictionary of tables to a single excel file with one table per sheet.

    Sheet names are derived from the dictionary keys via `_sanitize_sheetname`.

    Parameters
    ----------
    table_dict : dict[str, Any]
        Dictionary of tables in the format {<sheet_name1>: table, <sheet_name2>: table, (...)}.
    filename : str
        Path to output file.
    index : bool, default False
        Whether to include the index of the tables in file.
    **kwargs : Any
        Keyword arguments passed to pandas.DataFrame.to_excel. An `engine`
        argument is passed to the underlying pandas.ExcelWriter instead. If no
        engine is given, pandas selects one based on the file extension
        (for xlsx files: xlsxwriter if installed, otherwise openpyxl).

    Raises
    ------
    TypeError
        If `table_dict` contains items not of type DataFrame.
    """

    # Check if tables are pandas dataframes (before any file is created)
    for name, table in table_dict.items():
        if not isinstance(table, pd.DataFrame):
            raise TypeError(f"Table {name} is not a pandas DataFrame!")

    # The engine belongs to the writer: DataFrame.to_excel ignores its own
    # `engine` argument when it receives an already opened ExcelWriter.
    to_excel_kwargs = dict(kwargs)
    engine = to_excel_kwargs.pop("engine", None)

    # Write to excel
    with pd.ExcelWriter(filename, engine=engine) as writer:
        for name, table in table_dict.items():
            table.to_excel(writer, sheet_name=_sanitize_sheetname(str(name)), index=index, **to_excel_kwargs)




# [docs]
@beartype
def table_zscore(table: pd.DataFrame,
                 how: Literal["row", "col"] = "row") -> pd.DataFrame:
    """
    Compute z-scores for a table, either per row or per column.

    Parameters
    ----------
    table : pd.DataFrame
        Table to z-score.
    how : {'row', 'col'}
        Whether to z-score rows or columns.

    Returns
    -------
    pd.DataFrame
        Z-scored table.

    Raises
    ------
    Exception
        If `how` has invalid selection.
    """

    if how == "col":
        return table.apply(zscore)

    if how == "row":
        # z-score the columns of the transposed table, then flip it back
        flipped = table.T
        return flipped.apply(zscore).T

    # Will not be called due to beartype checking for input
    raise Exception(f"'{how}' is invalid for 'how' - it must be 'row' or 'col'.")