Source code for sctoolbox.tools.multiomics

"""Tools for multiomics analysis."""
import scanpy as sc
import pandas as pd
from functools import reduce
import warnings

from beartype import beartype
from beartype.typing import Literal

import sctoolbox.utils as utils


[docs] @beartype def merge_anndata(anndata_dict: dict[str, sc.AnnData], join: Literal["inner", "outer"] = "inner") -> sc.AnnData: """ Merge two h5ad files for dual cellxgene deplyoment. Parameters ---------- anndata_dict : dict[str, sc.AnnData] Dictionary with labels as keys and anndata objects as values. join : Literal['inner', 'outer'], default 'inner' Set how to join cells of the adata objects: ['inner', 'outer']. This only affects the cells since the var/gene section is simply added. - 'inner': only keep overlapping cells. - 'outer': keep all cells. This will add placeholder cells/dots to plots currently disabled. Returns ------- sc.AnnData Merged anndata object. Notes ----- Important: Depending on the size of the anndata objects the function takes around 60 to 300 GB of RAM! To save RAM and runtime the function generates a minimal anndata object. Only .X, .var, .obs and .obsm are kept. Layers, .varm, etc is removed. Raises ------ ValueError: If no indices of both adata.obs tables are overlapping. """ if join == "outer": warnings.warn("'outer' join is currently not supported. Proceeding with 'inner' ...") join = "inner" # Generate minimal anndata objects minimal_adata_dict = dict() for label, adata in anndata_dict.items(): minimal_adata_dict[label] = sc.AnnData(X=adata.X, obs=adata.obs, var=adata.var, obsm=dict(adata.obsm)) if not adata.obs.index.is_unique: warnings.warn(f"Obs index of {label} dataset is not unqiue. Running .obs_names_make_unique()..") minimal_adata_dict[label].obs_names_make_unique() if not adata.var.index.is_unique: warnings.warn(f"Var index of {label} dataset is not unqiue. Running .var_names_make_unique()..") minimal_adata_dict[label].var_names_make_unique() # Get cell barcode (obs) intersection all_obs_indices = [single_adata.obs.index for single_adata in list(anndata_dict.values())] obs_intersection = set.intersection(*map(set, all_obs_indices)) obs_intersection = sorted(list(obs_intersection)) if not obs_intersection: raise ValueError("No overlapping indices among the .obs tables. " + "The barcodes of the cells must be identical for all datasets") obs_list = list() obsm_dict = dict() # Add prefix to obsm dict keys, var index and obs columns for label, adata in minimal_adata_dict.items(): # prefix to var index adata.var.index = label + "_" + adata.var.index adata.var.columns = label + "_" + adata.var.columns # prefix to obs columns adata.obs.columns = label + "_" + adata.obs.columns # Reorder adata cells adata = adata[obs_intersection, :] for obsm_key, matrix in dict(adata.obsm).items(): obsm_dict[f"X_{label}_{obsm_key.removeprefix('X_')}"] = matrix # save new adata to dict minimal_adata_dict[label] = adata # save obs in list obs_list.append(adata.obs) # Merge X and var merged_adata = sc.concat(minimal_adata_dict, join="outer", label="source", axis=1) # Merge obs merged_adata.obs = reduce(lambda left, right: pd.merge(left, right, how=join, left_index=True, right_index=True), obs_list) merged_adata.obsm = obsm_dict utils.fill_na(merged_adata.obs) utils.fill_na(merged_adata.var) if len(merged_adata.var) <= 50: warnings.warn("The adata object contains less than 51 genes/var entries. " + "CellxGene will not work. Please add dummy genes to the var table.") return merged_adata