Source code for sctoolbox.tools.qc_filter

"""Tools for quality control."""
import numpy as np
import pandas as pd
import scanpy as sc
import multiprocessing as mp
import warnings
import anndata
import pkg_resources
import glob
from pathlib import Path
from sklearn.mixture import GaussianMixture
from kneed import KneeLocator
import matplotlib.pyplot as plt

from beartype import beartype
import numpy.typing as npt
from beartype.typing import Optional, Tuple, Union, Any, Literal
# import scrublet as scr

# toolbox functions
import sctoolbox
import sctoolbox.utils as utils
from sctoolbox.plotting import _save_figure
import sctoolbox.utils.decorator as deco
from sctoolbox._settings import settings
logger = settings.logger


###############################################################################
#                        PRE-CALCULATION OF QC METRICS                        #
###############################################################################

@deco.log_anndata
@beartype
def calculate_qc_metrics(adata: sc.AnnData,
                         percent_top: Optional[list[int]] = None,
                         inplace: bool = False,
                         **kwargs: Any) -> Optional[sc.AnnData]:
    """
    Calculate the qc metrics using `scanpy.pp.calculate_qc_metrics`.

    Parameters
    ----------
    adata : sc.AnnData
        Anndata object the quality metrics are added to.
    percent_top : Optional[list[int]], default None
        Which proportions of top genes to cover.
    inplace : bool, default False
        If the anndata object should be modified in place.
    **kwargs : Any
        Additional parameters forwarded to scanpy.pp.calculate_qc_metrics.

    Returns
    -------
    Optional[sc.AnnData]
        Returns anndata object with added quality metrics to .obs and .var.
        Returns None if `inplace=True`.

    See Also
    --------
    scanpy.pp.calculate_qc_metrics
        https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.calculate_qc_metrics.html

    Examples
    --------
    .. exec_code::

        import scanpy as sc
        import sctoolbox as sct

        adata = sc.datasets.pbmc3k()

        print("Columns in .obs before 'calculate_qc_metrics':", adata.obs.columns.tolist())

        sct.tools.calculate_qc_metrics(adata, inplace=True)

        print("Columns in .obs after 'calculate_qc_metrics':", adata.obs.columns.tolist())
    """
    # add metrics to copy of anndata
    if not inplace:
        adata = adata.copy()

    # remove n_genes from metrics before recalculation
    to_remove = [col for col in adata.obs.columns if col in ["n_genes", "log1p_n_genes", "n_features", "log1p_n_features"]]
    adata.obs.drop(columns=to_remove, inplace=True)

    # compute metrics
    sc.pp.calculate_qc_metrics(adata=adata, percent_top=percent_top, inplace=True, **kwargs)

    # Rename metrics
    adata.obs.rename(columns={"n_genes_by_counts": "n_genes",
                              "log1p_n_genes_by_counts": "log1p_n_genes",
                              "n_features_by_counts": "n_features",
                              "log1p_n_features_by_counts": "log1p_n_features"}, inplace=True)

    # return modified anndata
    if not inplace:
        return adata

@deco.log_anndata
@beartype
def predict_cell_cycle(adata: sc.AnnData,
                       species: Optional[str],
                       s_genes: Optional[str | list[str]] = None,
                       g2m_genes: Optional[str | list[str]] = None,
                       inplace: bool = True) -> Optional[sc.AnnData]:
    """
    Assign a score and a phase to each cell depending on the expression of cell cycle genes.

    Parameters
    ----------
    adata : sc.AnnData
        Anndata object to predict cell cycle on.
    species : Optional[str]
        The species of the data. Available species are: human, mouse, rat and zebrafish.
        If both s_genes and g2m_genes are given, species is ignored and may be set to None.
    s_genes : Optional[str | list[str]], default None
        If no species is given or the desired species is not supported, you can provide a list of genes for the
        S-phase, or a txt file containing one gene per row. If only s_genes is provided and species is supported,
        the default g2m_genes list will be used; otherwise the function will not run.
    g2m_genes : Optional[str | list[str]], default None
        If no species is given or the desired species is not supported, you can provide a list of genes for the
        G2M-phase, or a txt file containing one gene per row. If only g2m_genes is provided and species is supported,
        the default s_genes list will be used; otherwise the function will not run.
    inplace : bool, default True
        If True, add the new columns to the original anndata object.

    Returns
    -------
    Optional[sc.AnnData]
        If inplace is False, return a copy of the anndata object with the new columns in the obs table.

    Raises
    ------
    FileNotFoundError
        If a file given via s_genes/g2m_genes does not exist.
    ValueError
        1: If s_genes or g2m_genes is neither None, a list, nor a path to a file.
        2: If no cell cycle genes are available for the given species.
        3: If the given species is not supported and s_genes/g2m_genes are not given.
    """
    if not inplace:
        adata = adata.copy()

    # Check if the given s_genes/g2m_genes are lists/paths/None
    genes_dict = {"s_genes": s_genes, "g2m_genes": g2m_genes}
    for key, genes in genes_dict.items():
        if genes is not None:
            # check if genes is a file or list
            if isinstance(genes, str):
                if Path(genes).is_file():  # check if file exists
                    genes = utils.read_list_file(genes)
                else:
                    raise FileNotFoundError(f'The file {genes} was not found!')
            elif isinstance(genes, np.ndarray):
                genes = list(genes)
            elif not isinstance(genes, list):
                raise ValueError(f"Please provide a list of genes or a path to a list of genes to s_genes/g2m_genes! Type of {key} is {type(genes)}")

            # Save genes
            if key == "s_genes":
                s_genes = genes
            elif key == "g2m_genes":
                g2m_genes = genes

    # if two lists are given, use both and ignore species
    if s_genes is not None and g2m_genes is not None:
        species = None

    # get gene list for species
    elif species is not None:
        species = species.lower()

        # get path of directory where cell cycle gene lists are saved
        genelist_dir = pkg_resources.resource_filename("sctoolbox", "data/gene_lists/")

        # check if given species is available
        available_files = glob.glob(genelist_dir + "*_cellcycle_genes.txt")
        available_species = utils.clean_flanking_strings(available_files)
        if species not in available_species:
            logger.debug("Species was not found in available species!")
            logger.debug(f"genelist_dir: {genelist_dir}")
            logger.debug(f"available_files: {available_files}")
            logger.debug(f"All files in dir: {glob.glob(genelist_dir + '*')}")
            raise ValueError(f"No cellcycle genes available for species '{species}'. Available species are: {available_species}")

        # get cell cycle gene lists
        path_cellcycle_genes = genelist_dir + f"{species}_cellcycle_genes.txt"
        cell_cycle_genes = pd.read_csv(path_cellcycle_genes, header=None, sep="\t", names=['gene', 'phase']).set_index('gene')
        logger.debug(f"Read {len(cell_cycle_genes)} cell cycle genes from file: {path_cellcycle_genes}")

        # if one list is given as input, get the other list from the gene lists dir
        if s_genes is not None:
            logger.info("g2m_genes list is missing! Using default list instead")
            g2m_genes = cell_cycle_genes[cell_cycle_genes['phase'].isin(['g2m_genes'])].index.tolist()
        elif g2m_genes is not None:
            logger.info("s_genes list is missing! Using default list instead")
            s_genes = cell_cycle_genes[cell_cycle_genes['phase'].isin(['s_genes'])].index.tolist()
        else:
            s_genes = cell_cycle_genes[cell_cycle_genes['phase'].isin(['s_genes'])].index.tolist()
            g2m_genes = cell_cycle_genes[cell_cycle_genes['phase'].isin(['g2m_genes'])].index.tolist()

    else:
        raise ValueError("Please provide either a supported species or lists of genes!")

    # Scale the data before scoring
    sdata = sc.pp.scale(adata, copy=True)

    # Score the cells by S phase or G2M phase
    sc.tl.score_genes_cell_cycle(sdata, s_genes=s_genes, g2m_genes=g2m_genes)

    # add results to adata
    adata.obs['S_score'] = sdata.obs['S_score']
    adata.obs['G2M_score'] = sdata.obs['G2M_score']
    adata.obs['phase'] = sdata.obs['phase']

    if not inplace:
        return adata

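# Usage sketch for predict_cell_cycle (doctest-style comments; an AnnData named
# `adata` with gene names matching the species is assumed):
# >>> predict_cell_cycle(adata, species="human", inplace=True)
# >>> adata.obs[["S_score", "G2M_score", "phase"]].head()
# If both gene lists are supplied (as lists or txt files), species is ignored:
# >>> predict_cell_cycle(adata, species=None, s_genes="s_genes.txt", g2m_genes="g2m_genes.txt")
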
@deco.log_anndata
@beartype
def estimate_doublets(adata: sc.AnnData,
                      threshold: float = 0.25,
                      inplace: bool = True,
                      plot: bool = True,
                      groupby: Optional[str] = None,
                      threads: int = 4,
                      fill_na: bool = True,
                      **kwargs: Any) -> Optional[sc.AnnData]:
    """
    Estimate doublet cells using scrublet.

    Adds the columns "doublet_score" and "predicted_doublet" to adata.obs, as well as a "scrublet" key to adata.uns.

    Parameters
    ----------
    adata : sc.AnnData
        Anndata object to estimate doublets for.
    threshold : float, default 0.25
        Threshold for doublet detection.
    inplace : bool, default True
        Whether to estimate doublets inplace or not.
    plot : bool, default True
        Whether to plot the doublet score distribution.
    groupby : Optional[str], default None
        Key in adata.obs to use for batching during doublet estimation. If threads > 1, the batches are distributed
        across threads; otherwise each batch is run sequentially.
    threads : int, default 4
        Number of threads to use.
    fill_na : bool, default True
        If True, replace NA values returned by scrublet with 0 (score) and False (prediction). Scrublet returns NA
        if it cannot calculate a doublet score for a barcode. Note that an NA score does not mean the barcode is
        not a doublet; setting this parameter to True simply assumes that it is not.
    **kwargs : Any
        Additional arguments are passed to scanpy.external.pp.scrublet.

    Notes
    -----
    'groupby' should be set if the adata consists of multiple samples, as this improves the doublet estimation.

    Returns
    -------
    Optional[sc.AnnData]
        If inplace is False, the function returns a copy of the adata object.
        If inplace is True, the function returns None.
    """
    if inplace is False:
        adata = adata.copy()

    # Estimate doublets
    if groupby is not None:
        all_groups = adata.obs[groupby].astype("category").cat.categories.tolist()

        if threads > 1:
            pool = mp.Pool(threads, maxtasksperchild=1)  # maxtasksperchild to avoid memory leaks

            # Run scrublet for each sub data
            logger.info("Sending {0} batches to {1} threads".format(len(all_groups), threads))
            jobs = []
            for i, sub in enumerate([adata[adata.obs[groupby] == group] for group in all_groups]):

                # Clean up adata before sending to thread
                sub.uns = {}
                sub.layers = None

                job = pool.apply_async(_run_scrublet, (sub,), {"threshold": threshold, "verbose": False, **kwargs})
                jobs.append(job)
            pool.close()

            utils.monitor_jobs(jobs, "Scrublet per group")
            results = [job.get() for job in jobs]

        else:
            results = []
            for i, sub in enumerate([adata[adata.obs[groupby] == group] for group in all_groups]):
                logger.info("Scrublet per group: {}/{}".format(i + 1, len(all_groups)))
                res = _run_scrublet(sub, threshold=threshold, verbose=False, **kwargs)
                results.append(res)

        # Collect results for each element in tuples
        all_obs = [res[0] for res in results]
        all_uns = [res[1] for res in results]

        # Merge all simulated scores
        uns_dict = {"threshold": threshold, "doublet_scores_sim": np.array([])}
        for uns in all_uns:
            uns_dict["doublet_scores_sim"] = np.concatenate((uns_dict["doublet_scores_sim"], uns["doublet_scores_sim"]))

        # Merge all obs tables
        obs_table = pd.concat(all_obs)
        obs_table = obs_table.loc[adata.obs_names.tolist(), :]  # Sort obs to match original order

    else:
        # Run scrublet on the full adata
        obs_table, uns_dict = _run_scrublet(adata, threshold=threshold, **kwargs)

    # Save scores to object  # ImplicitModificationWarning
    adata.obs["doublet_score"] = obs_table["doublet_score"]
    adata.obs["predicted_doublet"] = obs_table["predicted_doublet"]
    adata.uns["scrublet"] = uns_dict

    if fill_na:
        adata.obs[["doublet_score", "predicted_doublet"]] = (
            utils.fill_na(adata.obs[["doublet_score", "predicted_doublet"]], inplace=False))

    # Check if all values in column are of type boolean
    if adata.obs["predicted_doublet"].dtype != "bool":
        logger.warning("Could not estimate doublets for every barcode. Columns can contain NaN values.")

    # Plot the distribution of scrublet scores
    if plot is True:
        sc.external.pl.scrublet_score_distribution(adata)

    # Return adata (or None if inplace)
    if inplace is False:
        return adata

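# Usage sketch for estimate_doublets (doctest-style; a raw-count AnnData with a
# "sample" column in .obs is assumed):
# >>> estimate_doublets(adata, groupby="sample", threads=4, inplace=True)
# >>> adata.obs["predicted_doublet"].value_counts()
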
@beartype
def _run_scrublet(adata: sc.AnnData,
                  **kwargs: Any) -> Tuple[pd.DataFrame, dict[str, Union[np.ndarray, float, dict[str, float]]]]:
    """
    Thread-safe wrapper for running scrublet, which also takes care of catching any warnings.

    Parameters
    ----------
    adata : sc.AnnData
        Anndata object to estimate doublets for.
    **kwargs : Any
        Additional arguments are passed to scanpy.external.pp.scrublet.

    Returns
    -------
    Tuple[pd.DataFrame, dict[str, Union[np.ndarray, float, dict[str, float]]]]
        Tuple containing .obs and .uns["scrublet"] of the adata object after scrublet.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, message="Received a view of an AnnData*")
        warnings.filterwarnings("ignore", category=anndata.ImplicitModificationWarning, message="Trying to modify attribute `.obs`*")  # because adata is a view

        # X = adata.X
        # scrub = scr.Scrublet(X)
        # doublet_scores, predicted_doublets = scrub.scrub_doublets()
        # adata.obs["doublet_score"] = doublet_scores
        # adata.obs["predicted_doublet"] = predicted_doublets

        sc.external.pp.scrublet(adata, copy=False, **kwargs)

    return (adata.obs, adata.uns["scrublet"])

@deco.log_anndata
@beartype
def predict_sex(adata: sc.AnnData,
                groupby: str,
                gene: str = "Xist",
                gene_column: Optional[str] = None,
                threshold: float = 0.3,
                plot: bool = True,
                save: Optional[str] = None,
                **kwargs: Any) -> None:
    """
    Predict sex based on expression of Xist (or another gene).

    Parameters
    ----------
    adata : sc.AnnData
        An anndata object to predict sex for.
    groupby : str
        Column in adata.obs to group by.
    gene : str, default "Xist"
        Name of a female-specific gene to use for estimating Male/Female split.
    gene_column : Optional[str], default None
        Name of the column in adata.var that contains the gene names. If not provided, adata.var.index is used.
    threshold : float, default 0.3
        Threshold for the minimum fraction of cells expressing the gene for the group to be considered "Female".
    plot : bool, default True
        Whether to plot the distribution of gene expression per group.
    save : Optional[str], default None
        If provided, the plot will be saved to this path.
    **kwargs : Any
        Additional arguments are passed to scanpy.pl.violin.

    Notes
    -----
    adata.X will be converted to numpy.ndarray if it is of type numpy.matrix.

    Returns
    -------
    None
    """
    # Normalize data before estimating expression
    logger.info("Normalizing adata")
    adata_copy = adata.copy()  # ensure that adata is not changed during normalization
    sc.pp.normalize_total(adata_copy, target_sum=None)
    sc.pp.log1p(adata_copy)

    # Get expression of gene per cell
    if gene_column is None:
        gene_names_lower = [s.lower() for s in adata_copy.var.index]
    else:
        gene_names_lower = [s.lower() for s in adata_copy.var[gene_column]]
    gene_index = [i for i, gene_name in enumerate(gene_names_lower) if gene_name == gene.lower()]
    if len(gene_index) == 0:
        logger.info("Selected gene is not present in the data. Prediction is skipped.")
        return

    # If adata.X is of type matrix convert to ndarray
    if isinstance(adata_copy.X, np.matrix):
        adata_copy.X = adata_copy.X.getA()

    # Try to flatten for adata.X np.ndarray; if not, flatten for scipy sparse matrix
    try:
        adata_copy.obs["gene_expr"] = adata_copy.X[:, gene_index].flatten()
    except AttributeError:
        adata_copy.obs["gene_expr"] = adata_copy.X[:, gene_index].todense().A1

    # Estimate which samples are male/female
    logger.info("Estimating male/female per group")
    assignment = {}
    for group, table in adata_copy.obs.groupby(groupby):
        n_cells = len(table)
        n_expr = sum(table["gene_expr"] > 0)
        frac = n_expr / n_cells
        if frac >= threshold:
            assignment[group] = "Female"
        else:
            assignment[group] = "Male"

    # Add assignment to adata.obs
    df = pd.DataFrame().from_dict(assignment, orient="index")
    df.columns = ["predicted_sex"]
    if "predicted_sex" in adata.obs.columns:
        adata.obs.drop(columns=["predicted_sex"], inplace=True)
    adata.obs = adata.obs.merge(df, left_on=groupby, right_index=True, how="left")

    # Plot overview if chosen
    if plot:
        logger.info("Plotting violins")
        groups = adata.obs[groupby].unique()
        n_groups = len(groups)
        fig, axarr = plt.subplots(1, 2, sharey=True,
                                  figsize=[5 + len(groups) / 5, 4],
                                  gridspec_kw={'width_ratios': [min(4, n_groups), n_groups]})

        # Plot histogram of all values
        axarr[0].hist(adata_copy.obs["gene_expr"], bins=30, orientation="horizontal", density=True, color="grey")
        axarr[0].invert_xaxis()
        axarr[0].set_ylabel(f"Normalized {gene} expression")

        # Plot violins per group + color for female cells
        sc.pl.violin(adata_copy, keys="gene_expr", groupby=groupby, jitter=False, ax=axarr[1], show=False, order=groups, **kwargs)
        axarr[1].set_xticklabels(groups, rotation=45, ha="right")
        axarr[1].set_ylabel("")
        xlim = axarr[1].get_xlim()
        for i, group in enumerate(groups):
            if assignment[group] == "Female":
                color = "red"
                alpha = 0.3
            else:
                color = None
                alpha = 0
            axarr[1].axvspan(i - 0.5, i + 0.5, color=color, zorder=0, alpha=alpha, linewidth=0)
        axarr[1].set_xlim(xlim)
        axarr[1].set_title("Prediction of female groups")

        _save_figure(save)

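# Usage sketch for predict_sex (doctest-style; mouse data with a "sample"
# column is assumed, "Xist" being the default female-specific marker):
# >>> predict_sex(adata, groupby="sample", gene="Xist", threshold=0.3)
# >>> adata.obs["predicted_sex"].value_counts()
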
###############################################################################
#                      STEP 1: FINDING AUTOMATIC CUTOFFS                      #
###############################################################################


@beartype
def _get_thresholds(data: npt.ArrayLike,
                    max_mixtures: int = 5,
                    n_std: int | float = 3,
                    plot: bool = True) -> dict[str, float]:
    """
    Get automatic min/max thresholds for input data array.

    The function will fit a gaussian mixture model, and find the thresholds based on the mean and standard deviation
    of the largest mixture in the model.

    Parameters
    ----------
    data : npt.ArrayLike
        Array of data to find thresholds for.
    max_mixtures : int, default 5
        Maximum number of gaussian mixtures to fit.
    n_std : int | float, default 3
        Number of standard deviations from the distribution mean to set as min/max thresholds.
    plot : bool, default True
        If True, will plot the distribution of BIC and the fit of the gaussian mixtures to the data.

    Returns
    -------
    dict[str, float]
        Dictionary with min and max thresholds.
    """
    # Get numpy array if input was pandas series or list
    data_type = type(data).__name__
    if data_type == "Series":
        data = data.values
    elif data_type == "list":
        data = np.array(data)

    # Attempt to reshape values
    data = data.reshape(-1, 1)

    # Fit data with gaussian mixtures
    n_list = list(range(1, max_mixtures + 1))  # 1 -> max mixtures per model
    models = [None] * len(n_list)
    for i, n in enumerate(n_list):
        models[i] = GaussianMixture(n).fit(data)

    # Evaluate quality of models
    # AIC = [m.aic(data) for m in models]
    BIC = [m.bic(data) for m in models]

    # Choose best number of mixtures
    try:
        kn = KneeLocator(n_list, BIC, curve='convex', direction='decreasing')
        M_best = models[kn.knee - 1]  # -1 to get index
    except Exception:
        # Knee could not be found; use the normal distribution estimated using one gaussian
        M_best = models[0]

    # Which is the largest component? And what are the mean/variance of this distribution?
    weights = M_best.weights_
    i = np.argmax(weights)
    dist_mean = M_best.means_[i][0]
    dist_std = np.sqrt(M_best.covariances_[i][0][0])

    # Threshold estimation
    thresholds = {"min": dist_mean - dist_std * n_std,
                  "max": dist_mean + dist_std * n_std}

    # ------ Plot if chosen ------ #
    if plot:
        fig, axarr = plt.subplots(1, 2, figsize=(7, 3), constrained_layout=True)
        axarr = axarr.ravel()

        # Plot distribution of BIC
        # plt.plot(n_list, AIC, color="red", label="AIC")
        axarr[0].plot(n_list, BIC, color="blue")
        axarr[0].set_xlabel("Number of mixtures")
        axarr[0].set_ylabel("BIC")

        # Plot distribution of gaussian mixtures
        min_x = min(data)
        max_x = max(data)
        x = np.linspace(min_x, max_x, 1000).reshape(-1, 1)
        logprob = M_best.score_samples(x)
        responsibilities = M_best.predict_proba(x)
        pdf = np.exp(logprob)
        pdf_individual = responsibilities * pdf[:, np.newaxis]

        axarr[1].hist(data, density=True)
        axarr[1].set_xlabel("Value")
        axarr[1].set_ylabel("Density")
        for i in range(M_best.n_components):
            w = weights[i] * 100
            axarr[1].plot(x, pdf_individual[:, i], label=f"Component {i+1} ({w:.0f}%)")
        axarr[1].axvline(thresholds["min"], color="red", linestyle="--")
        axarr[1].axvline(thresholds["max"], color="red", linestyle="--")
        axarr[1].legend(bbox_to_anchor=(1.05, 1), loc=2)  # locate legend outside of plot

    return thresholds

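# Sketch of the threshold logic on synthetic data (doctest-style; the output is
# only indicative, exact values depend on the random draw):
# >>> rng = np.random.default_rng(0)
# >>> data = np.concatenate([rng.normal(5, 1, 1000), rng.normal(12, 1, 50)])
# >>> _get_thresholds(data, plot=False)  # approx. mean +/- 3 std of the main component
# {'min': 2.0..., 'max': 8.0...}
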
@beartype
def automatic_thresholds(adata: sc.AnnData,
                         which: Literal["obs", "var"] = "obs",
                         groupby: Optional[str] = None,
                         columns: Optional[list[str]] = None) -> dict[str, dict[str, Union[float, dict[str, float]]]]:
    """
    Get automatic thresholds for multiple data columns in adata.obs or adata.var.

    Parameters
    ----------
    adata : sc.AnnData
        Anndata object to find thresholds for.
    which : Literal["obs", "var"], default "obs"
        Which data to find thresholds for. Either "obs" or "var".
    groupby : Optional[str], default None
        Group rows by the column given in 'groupby' to find thresholds independently per group.
    columns : Optional[list[str]], default None
        Columns to calculate automatic thresholds for. If None, will take all numeric columns.

    Returns
    -------
    dict[str, dict[str, Union[float, dict[str, float]]]]
        A dict containing thresholds for each data column, either grouped by groupby or directly containing
        "min" and "max" per column.

    Raises
    ------
    ValueError
        If 'groupby' is not a column in the chosen table.
    """
    # Find out which data to find thresholds for
    if which == "obs":
        table = adata.obs
    elif which == "var":
        table = adata.var

    # Establish which columns to find thresholds for
    if columns is None:
        columns = list(table.select_dtypes(np.number).columns)

    # Check groupby
    if groupby is not None:
        if groupby not in table.columns:
            raise ValueError(f"Column '{groupby}' was not found in adata.{which}.columns")

    # Get threshold per data column (and per group if chosen)
    thresholds = {}
    for col in columns:

        if groupby is None:
            data = table[col].values
            data[np.isnan(data)] = 0
            d = _get_thresholds(data, plot=False)
            thresholds[col] = d

        else:
            thresholds[col] = {}  # initialize to fill in per group
            for group, subtable in table.groupby(groupby):
                data = subtable[col].values
                data[np.isnan(data)] = 0
                d = _get_thresholds(data, plot=False)
                thresholds[col][group] = d

    return thresholds

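# Usage sketch (doctest-style; assumes calculate_qc_metrics was run first, so
# that "n_genes" and "total_counts" exist in .obs):
# >>> thresholds = automatic_thresholds(adata, which="obs", columns=["n_genes", "total_counts"], groupby="sample")
# >>> thresholds["n_genes"]  # one {"min": ..., "max": ...} dict per sample
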
@beartype
def thresholds_as_table(threshold_dict: dict[str, dict[str, float | int | dict[str, int | float]]]) -> pd.DataFrame:
    """
    Show the threshold dictionary as a table.

    Parameters
    ----------
    threshold_dict : dict[str, dict[str, float | int | dict[str, int | float]]]
        Dictionary with thresholds.

    Returns
    -------
    pd.DataFrame
        Table with one row per parameter (and group), and columns for the minimum and maximum thresholds.
    """
    rows = []
    for column in threshold_dict:

        if "min" in threshold_dict[column] or "max" in threshold_dict[column]:
            row = [column, np.nan, threshold_dict[column].get("min", np.nan), threshold_dict[column].get("max", np.nan)]
            rows.append(row)
        else:
            for group in threshold_dict[column]:
                row = [column, group, threshold_dict[column][group].get("min", np.nan), threshold_dict[column][group].get("max", np.nan)]
                rows.append(row)

    # Assemble table
    df = pd.DataFrame(rows)
    if len(df) > 0:  # df can be empty if no valid thresholds were input
        df.columns = ["Parameter", "Group", "Minimum", "Maximum"]

        # Remove group column if no thresholds had groups
        if df["Group"].isnull().sum() == df.shape[0]:
            df.drop(columns="Group", inplace=True)

        # Remove duplicate rows
        df.drop_duplicates(inplace=True)

    return df

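# Doctest-style sketch of the expected table layout:
# >>> thresholds_as_table({"n_genes": {"min": 200.0, "max": 6000.0}})
#   Parameter  Minimum  Maximum
# 0   n_genes    200.0   6000.0
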
###############################################################################
#                        STEP 2: DEFINE CUSTOM CUTOFFS                        #
###############################################################################


@beartype
def _validate_minmax(d: dict) -> None:
    """Validate that the dict 'd' contains only the keys 'min' and 'max'."""
    allowed = set(["min", "max"])
    keys = set(d.keys())
    not_allowed = keys - allowed

    if len(not_allowed) > 0:
        raise ValueError("Keys {0} not allowed".format(not_allowed))

@beartype
def validate_threshold_dict(table: pd.DataFrame,
                            thresholds: dict[str, dict[str, int | float] | dict[str, dict[str, int | float]]],
                            groupby: Optional[str] = None) -> None:
    """
    Validate threshold dictionary.

    Thresholds can be in the format:

    .. code-block:: python

        thresholds = {"chrM_percent": {"min": 0, "max": 10},
                      "total_reads": {"min": 1000}}

    Or per group in 'groupby':

    .. code-block:: python

        thresholds = {"chrM_percent": {"Sample1": {"min": 0, "max": 10},
                                       "Sample2": {"max": 5}},
                      "total_reads": {"min": 1000}}

    Parameters
    ----------
    table : pd.DataFrame
        Table to validate thresholds for.
    thresholds : dict[str, dict[str, int | float] | dict[str, dict[str, int | float]]]
        Dictionary of thresholds to validate.
    groupby : Optional[str], default None
        Column for grouping thresholds.

    Raises
    ------
    ValueError
        If the threshold dict is not valid.
    """
    if groupby is not None:
        groups = set(table[groupby])  # group values, not the series index

    # Check if all columns in thresholds are available
    threshold_columns = thresholds.keys()
    not_found = [col for col in threshold_columns if col not in table.columns]
    if len(not_found) > 0:
        raise ValueError("Column(s) '{0}' given in thresholds are not found in table".format(not_found))

    # Check the format of individual column thresholds
    for col in thresholds:

        if groupby is None:  # expecting one threshold for all cells
            _validate_minmax(thresholds[col])

        else:  # expecting either one threshold or a threshold for each sample
            for key in thresholds[col]:
                if key in groups:
                    minmax_dict = thresholds[col][key]
                    _validate_minmax(minmax_dict)
                else:  # this is a minmax threshold
                    _validate_minmax(thresholds[col])

@deco.log_anndata
@beartype
def get_thresholds_wrapper(adata: sc.AnnData,
                           manual_thresholds: dict,
                           only_automatic_thresholds: bool = True,
                           groupby: Optional[str] = None) -> dict[str, dict[str, Union[float | int, dict[str, float | int]]]]:
    """
    Get the thresholds for the filtering.

    Parameters
    ----------
    adata : sc.AnnData
        Anndata object to find QC thresholds for.
    manual_thresholds : dict
        Dictionary containing manually set thresholds.
    only_automatic_thresholds : bool, default True
        If True, ignore the manual thresholds and set all thresholds automatically.
    groupby : Optional[str], default None
        Group cells by column in adata.obs.

    Returns
    -------
    dict[str, dict[str, Union[float | int, dict[str, float | int]]]]
        Dictionary containing the thresholds.
    """
    manual_thresholds = get_keys(adata, manual_thresholds)

    if only_automatic_thresholds:
        keys = list(manual_thresholds.keys())
        thresholds = automatic_thresholds(adata, which="obs", columns=keys, groupby=groupby)
        return thresholds
    else:
        if groupby:
            samples = list(adata.obs[groupby].unique())  # unique groups in order of appearance

        # thresholds which are not set by the user are set automatically
        for key, value in manual_thresholds.items():
            if value.get('min') is None or value.get('max') is None:
                auto_thr = automatic_thresholds(adata, which="obs", columns=[key], groupby=groupby)
                manual_thresholds[key] = auto_thr[key]
            elif groupby:
                # expand the global manual threshold to each group
                thresholds = {}
                for sample in samples:
                    thresholds[sample] = value
                manual_thresholds[key] = thresholds

        return manual_thresholds

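# Usage sketch (doctest-style; "n_genes" is assumed to be a column in .obs):
# >>> manual = {"n_genes": {"min": 200, "max": None}}
# >>> get_thresholds_wrapper(adata, manual, only_automatic_thresholds=False, groupby="sample")
# Partially set thresholds (here the missing "max") are replaced by automatic ones per sample.
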
@beartype
def get_keys(adata: sc.AnnData,
             manual_thresholds: dict[str, Any]) -> dict[str, dict[str, Union[float | int, dict[str, float | int]]]]:
    """
    Get threshold dictionary with keys that overlap with adata.obs.columns.

    Parameters
    ----------
    adata : sc.AnnData
        Anndata object.
    manual_thresholds : dict[str, Any]
        Dictionary with adata.obs columns as keys.

    Returns
    -------
    dict[str, dict[str, Union[float | int, dict[str, float | int]]]]
        Dictionary restricted to keys that overlap with adata.obs columns.
    """
    m_thresholds = {}
    legend = adata.obs.columns
    for key, value in manual_thresholds.items():
        if key in legend:
            m_thresholds[key] = value
        else:
            logger.info('column: ' + key + ' not found in adata.obs')

    return m_thresholds

@beartype
def get_mean_thresholds(thresholds: dict[str, Any]) -> dict[str, Any]:
    """Convert grouped thresholds to global thresholds by taking the mean across groups."""
    global_thresholds = {}
    for key, adict in thresholds.items():
        global_thresholds[key] = {}

        if "min" in adict or "max" in adict:  # already a global threshold
            global_thresholds[key] = adict
        else:
            min_values = [v.get("min", None) for v in adict.values() if "min" in v]
            if len(min_values) > 0:
                global_thresholds[key]["min"] = np.mean(min_values)

            max_values = [v.get("max", None) for v in adict.values() if "max" in v]
            if len(max_values) > 0:
                global_thresholds[key]["max"] = np.mean(max_values)

    return global_thresholds

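# Doctest-style sketch: grouped thresholds are averaged into global ones.
# >>> get_mean_thresholds({"n_genes": {"s1": {"min": 100.0}, "s2": {"min": 300.0}}})
# {'n_genes': {'min': 200.0}}
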
###############################################################################
#                          STEP 3: APPLYING CUTOFFS                           #
###############################################################################
@deco.log_anndata
@beartype
def apply_qc_thresholds(adata: sc.AnnData,
                        thresholds: dict[str, Any],
                        which: Literal["obs", "var"] = "obs",
                        groupby: Optional[str] = None,
                        inplace: bool = True) -> Optional[sc.AnnData]:
    """
    Apply QC thresholds to anndata object.

    Parameters
    ----------
    adata : sc.AnnData
        Anndata object to filter.
    thresholds : dict[str, Any]
        Dictionary of thresholds to apply.
    which : Literal["obs", "var"], default "obs"
        Which table to filter on. Must be one of "obs" / "var".
    groupby : Optional[str], default None
        Column in table to group by.
    inplace : bool, default True
        Change adata inplace or return a changed copy.

    Returns
    -------
    Optional[sc.AnnData]
        Anndata object with QC thresholds applied if inplace is False, otherwise None.

    Raises
    ------
    ValueError
        1: If the keys in thresholds do not match the columns in adata.[which].
        2: If grouped thresholds contain groups that are not found in adata.obs[groupby].
        3: If thresholds do not contain min or max values.
    """
    table = adata.obs if which == "obs" else adata.var

    # Cells or genes? For naming in log prints
    if which == "obs":
        name = "cells"
    else:
        name = ".var features"

    # Check if all columns are found in adata
    not_found = list(set(thresholds) - set(table.columns))
    if len(not_found) > 0:
        logger.info("{0} threshold columns were not found in adata and could therefore not be applied. These columns are: {1}".format(len(not_found), not_found))
    thresholds = {k: thresholds[k] for k in thresholds if k not in not_found}

    if len(thresholds) == 0:
        raise ValueError(f"The thresholds given do not match the columns given in adata.{which}. Please adjust the 'which' parameter if needed.")

    if groupby is not None:
        groups = table[groupby].unique()

    # Check that thresholds contain min/max
    for column, d in thresholds.items():
        if 'min' not in d and 'max' not in d:
            if groupby is not None:
                keys = d.keys()
                not_found = list(set(keys) - set(groups))
                if len(not_found) > 0:
                    raise ValueError(f"{len(not_found)} groups from thresholds were not found in adata.obs[{groupby}]. These groups are: {not_found}")
            else:
                raise ValueError("Error in threshold format: Thresholds must contain min or max per column, or a threshold per group in groupby")

    # Apply thresholds
    for column, d in thresholds.items():

        # Update size of table
        table = adata.obs if which == "obs" else adata.var

        # Collect boolean array of rows to select of table
        global_threshold = False  # can be overwritten if thresholds are global
        excluded = np.array([False] * len(table))
        if groupby is not None:
            if "min" in d or "max" in d:
                global_threshold = True
            else:
                for group in d:
                    minmax_dict = d[group]
                    group_bool = table[groupby] == group
                    if "min" in minmax_dict:
                        excluded = excluded | (group_bool & (table[column] < minmax_dict["min"]))
                    if "max" in minmax_dict:
                        excluded = excluded | (group_bool & (table[column] > minmax_dict["max"]))
        else:
            global_threshold = True

        # Select using a global threshold
        if global_threshold is True:
            minmax_dict = d
            if "min" in minmax_dict:
                excluded = excluded | (table[column] < minmax_dict["min"])  # if already excluded, or if excluded by min
            if "max" in minmax_dict:
                excluded = excluded | (table[column] > minmax_dict["max"])

        # Apply filtering
        included = ~excluded
        if inplace:
            # NOTE: these are private anndata functions, so they might change without warning!
            if which == "obs":
                adata._inplace_subset_obs(included)
            else:
                adata._inplace_subset_var(included)
        else:
            if which == "obs":
                adata = adata[included]
            else:
                adata = adata[:, included]  # filter on var

        logger.info(f"Filtering based on '{column}' from {len(table)} -> {sum(included)} {name}")

    if inplace is False:
        return adata

###############################################################################
#                        STEP 4: ADDITIONAL FILTERING                         #
###############################################################################


@beartype
def _filter_object(adata: sc.AnnData,
                   filter: str | list[str],
                   which: Literal["obs", "var"] = "obs",
                   remove_bool: bool = True,
                   inplace: bool = True) -> Optional[sc.AnnData]:
    """Filter an adata object based on a filter on either obs (cells) or var (genes). Is called by filter_cells and filter_genes."""
    # Decide which element type (genes/cells) we are dealing with
    if which == "obs":
        table = adata.obs
        table_name = "adata.obs"
        element_name = "cells"
    else:
        table = adata.var
        table_name = "adata.var"
        element_name = "genes"
    n_before = len(table)

    # filter is either a string (boolean column in the table) or a list of genes/cells to remove
    if isinstance(filter, str):
        if filter not in table.columns:
            raise ValueError(f"Column {filter} not found in {table_name}.columns")

        if table[filter].dtype.name != "bool":
            raise ValueError(f"Column {filter} contains values that are not of type boolean")

        boolean = table[filter].values
        if remove_bool is True:
            boolean = ~boolean

    else:
        # Check if all genes/cells are found in adata
        not_found = list(set(filter) - set(table.index))
        if len(not_found) > 0:
            logger.info(f"{len(not_found)} {element_name} were not found in adata and could therefore not be removed. These {element_name} are: {not_found}")

        boolean = ~table.index.isin(filter)

    # Remove genes/cells from adata; 'boolean' marks the elements to keep
    if inplace:
        if which == "obs":
            adata._inplace_subset_obs(boolean)
        elif which == "var":
            adata._inplace_subset_var(boolean)
    else:
        if which == "obs":
            adata = adata[boolean]
        elif which == "var":
            adata = adata[:, boolean]

    n_after = adata.shape[0] if which == "obs" else adata.shape[1]
    filtered = n_before - n_after
    logger.info(f"Filtered out {filtered} {element_name} from adata. New number of {element_name} is: {n_after}")

    if inplace is False:
        return adata

@deco.log_anndata
@beartype
def filter_cells(adata: sc.AnnData,
                 cells: str | list[str],
                 remove_bool: bool = True,
                 inplace: bool = True) -> Optional[sc.AnnData]:
    """
    Remove cells from anndata object.

    Parameters
    ----------
    adata : sc.AnnData
        Anndata object to filter.
    cells : str | list[str]
        A column in .obs containing boolean indicators or a list of cells to remove from object .obs table.
    remove_bool : bool, default True
        Is used if 'cells' is a column in the .obs table. If True, remove cells that are True.
        If False, remove cells that are False.
    inplace : bool, default True
        If True, filter inplace. If False, return filtered adata object.

    Returns
    -------
    Optional[sc.AnnData]
        If inplace is False, returns the filtered Anndata object. If inplace is True, returns None.
    """
    ret = _filter_object(adata, cells, which="obs", remove_bool=remove_bool, inplace=inplace)

    return ret

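# Usage sketch (doctest-style; assumes estimate_doublets added the boolean
# column "predicted_doublet" to .obs):
# >>> filter_cells(adata, "predicted_doublet", remove_bool=True)  # drop predicted doublets
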
@deco.log_anndata
@beartype
def filter_genes(adata: sc.AnnData,
                 genes: str | list[str],
                 remove_bool: bool = True,
                 inplace: bool = True) -> Optional[sc.AnnData]:
    """
    Remove genes from adata object.

    Parameters
    ----------
    adata : sc.AnnData
        Annotated data matrix object to filter.
    genes : str | list[str]
        A column in .var containing boolean indicators or a list of genes to remove from object .var table.
    remove_bool : bool, default True
        Is used if 'genes' is a column in the .var table. If True, remove genes that are True.
        If False, remove genes that are False.
    inplace : bool, default True
        If True, filter inplace. If False, return filtered adata object.

    Returns
    -------
    Optional[sc.AnnData]
        If inplace is False, returns the filtered Anndata object. If inplace is True, returns None.
    """
    ret = _filter_object(adata, genes, which="var", remove_bool=remove_bool, inplace=inplace)

    return ret

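# Usage sketch (doctest-style; the "is_mito" column is a hypothetical example):
# >>> adata.var["is_mito"] = adata.var_names.str.startswith("MT-")
# >>> filter_genes(adata, "is_mito")  # remove mitochondrial genes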