Source code for sctoolbox.tools.gene_correlation

"""Tools for gene-gene correlation."""
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.stats import spearmanr
from scipy.stats import norm
import statsmodels.api
from joblib import Parallel, delayed
from tqdm import tqdm

from beartype import beartype
from beartype.typing import Optional

from sctoolbox.utils.checker import check_columns
from sctoolbox.utils.adata import get_adata_subsets
from sctoolbox.plotting.embedding import umap_marker_overview
from sctoolbox._settings import settings
logger = settings.logger


[docs] @beartype def correlate_conditions(adata: sc.AnnData, gene: str, condition_col: str, condition_A: str, condition_B: str) -> pd.DataFrame: """ Calculate the correlation of a gene expression over two conditions and compares the two conditions. Parameters ---------- adata : sc.AnnData Annotated adata object. gene : str Gene of interest. condition_col : str Column in adata.obs containing conditions. condition_A : str Name of the first condition. condition_B : str Name of the second condition. Returns ------- pd.DataFrame Dataframe containing the correlation of a gene expression over two conditions. Raises ------ ValueError If one or both condition columns are not in adata.obs. """ # Subset adata on conditions check_columns(adata.obs, columns=[condition_col], name="adata.obs") adata_subsets = get_adata_subsets(adata, groupby=condition_col) if not all(x in adata_subsets.keys() for x in [condition_A, condition_B]): raise ValueError(f"One or both conditions ({condition_A}, {condition_B}]) \ could not be found in adata.obs['{condition_col}']") # Calculate correlations corr_A_df = correlate_ref_vs_all(adata_subsets[condition_A], gene) corr_B_df = correlate_ref_vs_all(adata_subsets[condition_B], gene) # Compare correlations comparison = compare_two_conditons(corr_A_df, corr_B_df, adata_subsets[condition_A].shape[1], adata_subsets[condition_B].shape[1]) return comparison
[docs] @beartype def correlate_ref_vs_all(adata: sc.AnnData, ref_gene: str, correlation_threshold: float = 0.4, save: Optional[str] = None) -> pd.DataFrame: """ Calculate the correlation of the reference gene vs all other genes. Additionally, plots umap highlighting correlating gene expression. Parameters ---------- adata : sc.AnnData Annotated data matrix. ref_gene : str Reference gene. correlation_threshold : float, default 0.4 Threshold for correlation value. Correlating genes with a correlation >= threshold will be plotted if plot parameter is set to True. save : str, default None Path to save the figure to. Returns ------- pd.DataFrame Dataframe containing correlation of refrence gene to other genes. """ def spearmanr_of_gene(ref, gene, gene_name): """Get tuple of gene and spearman correlation of gene to reference.""" return (gene_name, spearmanr(ref, gene)) def map_correlation_strength(x): """Map correlation to describing strings.""" if 0 <= x < 0.2: return "very weak (0.00-0.19)" elif x < 0.4: return "weak (0.20-0.39)" elif x < 0.6: return "moderate (0.40-0.59)" elif x < 0.8: return "strong (0.60-0.79)" elif x <= 1: return "very strong (0.80-1.00)" elif np.isnan(x): return x else: raise ValueError("Invalid correlation value.") # Get expression values of reference gene ref = adata[:, ref_gene].to_df()[ref_gene] logger.info(f"Calculating the correlation to {ref_gene}") results = dict(Parallel(n_jobs=-1)(delayed(spearmanr_of_gene)(ref, adata[:, gene].to_df()[gene], gene) for gene in tqdm(adata.var.index.values.tolist()))) corr_df = pd.DataFrame.from_dict(results, orient="index") corr_df.columns = ["correlation", "p-value"] # Adjust p-values corr_df["padj"] = statsmodels.stats.multitest.multipletests(corr_df["p-value"], method="bonferroni")[1] # Add interpretation columns corr_df["correlation_sign"] = np.where(corr_df['correlation'] < 0, "negative", "positive") corr_df['correlation_strength'] = corr_df['correlation'].apply(map_correlation_strength) corr_df["reject_0?"] = np.where(corr_df['padj'] < 0.05, True, False) # Clean up after nan values corr_df.loc[corr_df.isnull().any(axis=1), :] = np.nan if save: to_plot = corr_df[corr_df["correlation"] > correlation_threshold].index.to_list() _ = umap_marker_overview(adata, to_plot, ncols=4, save=save, cbar_label="Relative expr.") return corr_df
[docs] @beartype def compare_two_conditons(df_cond_A: pd.DataFrame, df_cond_B: pd.DataFrame, n_cells_A: int, n_cells_B: int) -> pd.DataFrame: """ Compare two conditions. Parameters ---------- df_cond_A : pd.DataFrame Dataframe containing correlation analysis from correlate_ref_vs_all df_cond_B : pd.DataFrame Dataframe containing correlation analysis from correlate_ref_vs_all n_cells_A : int Number of cells within condition A n_cells_B : int Number of cells within condition B Returns ------- pd.DataFrame Dataframe containing single correlation and Fischer Z transformation """ def independent_corr(gene_row, n_xy, n_ab): """z-transforms correlation coefficient xy (of n cells) andab (of n2 cells) and p-value of the difference.""" if (1 - gene_row['correlation_A']) == 0 or (1 - gene_row['correlation_B']) == 0: return np.nan, np.nan # Fisher's r-to-Z Transformation xy_z = 0.5 * np.log((1 + gene_row['correlation_A']) / (1 - gene_row['correlation_A'])) ab_z = 0.5 * np.log((1 + gene_row['correlation_B']) / (1 - gene_row['correlation_B'])) # fisher1925 - denominator se_diff_r = np.sqrt(1 / (n_xy - 3) + 1 / (n_ab - 3)) # fisher1925 - numerator diff = xy_z - ab_z # fisher1925 - significance test z = abs(diff / se_diff_r) # two-tailed p-value, therefore *2 p = (1 - norm.cdf(z)) * 2 return z, p # Join both correlation tables df_cond = df_cond_A.join(df_cond_B, lsuffix='_A', rsuffix='_B') # Compare both correlation coefficients df_cond['comparison z-score'], df_cond['comparison p-value'] = \ zip(*df_cond.apply(independent_corr, args=(n_cells_A, n_cells_B), axis=1)) return df_cond