Source code for sctoolbox.tools.highly_variable

"""Tools to calculate and annotate highly variable genes."""
import numpy as np
from kneed import KneeLocator
import statsmodels.api as sm
import matplotlib.pyplot as plt
import warnings
import scanpy as sc

from beartype.typing import Optional, Tuple, Any
from beartype import beartype

import sctoolbox.utils as utils
from sctoolbox.plotting.general import _save_figure
from sctoolbox._settings import settings
import sctoolbox.utils.decorator as deco
logger = settings.logger


[docs] @deco.log_anndata @beartype def annot_HVG(anndata: sc.AnnData, min_mean: float = 0.0125, max_iterations: int = 10, hvg_range: Tuple[int, int] = (1000, 5000), step: int | float = 10, inplace: bool = True, save: Optional[str] = None, **kwargs: Any) -> Optional[sc.AnnData]: """ Annotate highly variable genes (HVG). Tries to annotate in given range of HVGs, by gradually in-/ decreasing min_mean of scanpy.pp.highly_variable_genes. Parameters ---------- anndata : sc.AnnData Anndata object to annotate. min_mean : float, default 0.0125 Starting min_mean parameter for finding HVGs. max_iterations : int, default 10 Maximum number of min_mean adjustments. hvg_range : Tuple[int, int], default (1000, 5000) Number of HVGs should be in the given range. Will issue a warning if result is not in range. Default limits are chosen as proposed by https://doi.org/10.15252/msb.20188746. step : int | float, default 10 Value min_mean is adjusted by in each iteration. Will divide min_value (below range) or multiply (above range) by this value. inplace : bool, default False Whether the anndata object is modified inplace. save : Optional[str], default None Path to save the plot to. If None, the plot is not saved. **kwargs : Any Additional arguments forwarded to scanpy.pp.highly_variable_genes(). Notes ----- Logarithmized data is expected. Returns ------- Optional[sc.AnnData] Adds annotation of HVG to anndata object. Information is added to Anndata.var["highly_variable"]. If inplace is False, the function returns None Else returns a chagned copy of the input anndata object. """ adata_m = anndata if inplace else anndata.copy() logger.info("Annotating highy variable genes (HVG)") # adjust min_mean to get a HVG count in a certain range for i in range(max_iterations + 1): sc.pp.highly_variable_genes(adata_m, min_mean=min_mean, inplace=True, **kwargs) # counts True values in column hvg_count = sum(adata_m.var.highly_variable) # adjust min_mean # skip adjustment if in last iteration if i < max_iterations and hvg_count < hvg_range[0]: min_mean /= step elif i < max_iterations and hvg_count > hvg_range[1]: min_mean *= step + 0.00001 # This .000001 is to avoid an infinit loop if the current mean lie in the above if. else: break # warn if outside of range if hvg_count < hvg_range[0] or hvg_count > hvg_range[1]: warnings.warn(f"Number of HVGs not in range. Range is {hvg_range} but counted {hvg_count}.") else: _ = sc.pl.highly_variable_genes(anndata, show=False) # Plot dispersion of HVG _save_figure(save) logger.info("Total HVG=" + str(anndata.var["highly_variable"].sum())) if not inplace: return adata_m
[docs] @deco.log_anndata @beartype def get_variable_features(adata: sc.AnnData, max_cells: Optional[float | int] = None, min_cells: Optional[float | int] = 0, show: bool = True, inplace: bool = True) -> Optional[sc.AnnData]: """ Get the highly variable features of anndata object. Adds the column "highly_variable" to adata.var. If show is True, the plot is shown. Parameters ---------- adata : sc.AnnData The anndata object containing counts for variables. max_cells : Optional[float | int], default None The maximum variability score to set as threshold. Defaults to knee estimated threshold. min_cells : Optional[float | int], default 0 The minimum variability score to set as threshold. show : bool, default True Show plot of variability scores and thresholds. inplace : bool, default True If True, the anndata object is modified. Otherwise, a new anndata object is returned. Notes ----- Designed for scATAC-seq data Raises ------ KeyError If adata.var['n_cells_by_counts'] is not available. Returns ------- Optional[sc.AnnData] If inplace is False, the function returns None If inplace is True, the function returns an anndata object. """ if 'n_cells_by_counts' not in adata.var.columns: raise KeyError("Required column adata.var['n_cells_by_counts'] missing. Please run scanpy.pp.calculate_qc_metrics.") utils.check_module("kneed") utils.check_module("statsmodels") if inplace is False: adata = adata.copy() # get number of cells per feature n_cells = adata.var['n_cells_by_counts'].sort_values(ascending=False) x = np.arange(len(n_cells)) if max_cells is None: # Subset data to reduce computational time target = 10000 step = int(len(n_cells) / target) if step > 0: idx_selection = np.arange(len(n_cells), step=step) n_cells = n_cells[idx_selection] x = x[idx_selection] # Smooth using lowess (prevents early finding of knees due to noise) n_cells = sm.nonparametric.lowess(n_cells, x, return_sorted=False, frac=0.05) # Find knee kneedle = KneeLocator(x, n_cells, curve="convex", direction="decreasing", online=False) max_cells = kneedle.knee_y # Set "highly_variable" column in var adata.var["highly_variable"] = (adata.var['n_cells_by_counts'] <= max_cells) & (adata.var['n_cells_by_counts'] >= min_cells) # Create plot if show is True: fig, ax = plt.subplots() ax.set_xlabel("Ranked features") ax.set_ylabel("Number of cells") ax.plot(x, n_cells) # Horizontal line at knee ax.axhline(max_cells, linestyle="--", color="r") xlim = ax.get_xlim() ax.text(xlim[1], max_cells, " {0:.2f}".format(max_cells), fontsize=12, ha="left", va="center", color="red") ax.axhline(min_cells, linestyle="--", color="b") ax.text(xlim[1], min_cells, " {0:.2f}".format(min_cells), fontsize=12, ha="left", va="center", color="blue") # Return the copy of the adata if inplace is False: return adata