[ ]:

from sctoolbox.utilities import bgcolor

0A - Marker gene computation and analysis

1 - Description

Note: Requires notebook 04_clustering to be run prior to this notebook.

Marker genes, also known as differentially expressed genes (DEG), are genes that are predominantly expressed in a single group. This means that the expression of a small amount of genes is sufficient to assign a cell to a cell type, condition, etc. For example the expression of MERTK, FCGR3A, MRC1 would identify a cell as a Macrophage. Lists of cell type markers can be found here. However, before the identification of e.g. cell types computation of potential marker genes is necessary.

This notebook is aimed at computing lists of potential marker genes for the selected groups/ clusters and reviewing their performance on highlighting said groups. Two different methods are provided for the identification of group marker genes:

The rank_genes_groups() method from scanpy.
DESeq2 a method originally intended for bulk that will be run by first creating pseudobulks from the given groups.

2 - Loading packages

[ ]:

import scanpy as sc
import pandas as pd
import numpy as np
import sctoolbox.utilities as utils
import sctoolbox.tools as tools
import sctoolbox.plotting as pl
import matplotlib.backends.backend_pdf
utils.settings_from_config("config.yaml", key="0A")

3 - Loading adata

⬐ Fill in input data here ⬎

[ ]:

%bgcolor PowderBlue

anndata_file = "anndata_4.h5ad"

[ ]:

adata = utils.load_h5ad(anndata_file)

with pd.option_context("display.max.rows", 5, "display.max.columns", None):
    display(adata)
    display(adata.obs)
    display(adata.var)

4 - Automatic markers per cluster using rank_genes_groups

⬐ Fill in input data here ⬎

[ ]:

%bgcolor PowderBlue

# Column names of clustering columns
clustering_cols = ["leiden_0.1"]  # "leiden_0.2", "leiden_0.3", "leiden_0.4", "leiden_0.5", "leiden_0.6", "leiden_0.7", "leiden_0.8", "leiden_0.9"

# Marker genes

# Method for gene ranking
ranking_method = "t-test"
# Top X genes to be reported, None for all genes
n_genes = None

# Marker gene filter

# Minimum fraction of cells in a group that must express a gene to be considered as a marker gene
min_in_group_fraction = 0.25
# Minimum foldchange (+/-) to be considered as a marker gene
min_fold_change = 0.5
# Maximum fraction of cells in other groups that must express a gene to be considered as a marker gene
max_out_group_fraction = 0.8

# Plotting
n_genes_markerplot = 15
marker_style = "dots"  # Either `dots` or `heatmap`.

[ ]:

marker_tables = dict()
for clustering in clustering_cols:

    # Identify markers per cluster (adjust group fraction and fold change to filter genes)
    tools.run_rank_genes(adata, clustering,
                         min_in_group_fraction=min_in_group_fraction,
                         min_fold_change=min_fold_change,
                         max_out_group_fraction=max_out_group_fraction,
                         n_genes=n_genes,
                         ranking_method=ranking_method)

    # Plot dotplot of markers
    _ = pl.rank_genes_plot(adata, key=f"rank_genes_{clustering}_filtered",
                           n_genes=n_genes_markerplot, style=marker_style,
                           save=f"marker_genes_{marker_style}_{clustering}.pdf")

    # Write marker genes to table
    marker_table = tools.get_rank_genes_tables(adata, out_group_fractions=True,
                                               key=f"rank_genes_{clustering}_filtered",
                                               save_excel=f"cluster_marker_genes_{clustering}.xlsx")
    marker_tables[clustering] = marker_table

5 - Plot expression

This section of the notebook plots the marker gene expression for each selected clustering in the selected embedding scatterplot. There is also an option to specify a custom list of genes to be plotted.

⬐ Fill in input data here ⬎

[ ]:

%bgcolor PowderBlue

# Column names of clustering columns
marker_table_cols = ["leiden_0.1"]  # "leiden_0.2", "leiden_0.3", "leiden_0.4", "leiden_0.5", "leiden_0.6", "leiden_0.7", "leiden_0.8", "leiden_0.9"

# Top X marker to be plotted
n_marker = 5

# List of genes additionally shown in the embedding.
custom_gene_list = []

embedding = "umap"  # Either umap or tsne

[ ]:

if embedding == "umap":
    pl_method = sc.pl.umap
elif embedding == "tsne":
    pl_method = sc.pl.tsne
else:
    raise ValueError("Invalid embedding set.")

5.1 - Plot custom gene list

[ ]:

if custom_gene_list:
    pl_method(adata, color=custom_gene_list, cmap=pl.sc_colormap(), ncols=n_marker)
    pl._save_figure(f"{embedding}_custom_gene_list_expression.pdf")

5.2 - Plot cluster marker

[ ]:

for cluster_col in marker_table_cols:
    pdf = matplotlib.backends.backend_pdf.PdfPages(f"{utils.settings.figure_dir}/{embedding}_clustering_{cluster_col}_expression.pdf")
    for cluster, table in marker_tables[cluster_col].items():
        adata.obs['selection'] = [cluster if i else "NA" for i in adata.obs[cluster_col] == cluster]
        adata.uns['selection_colors'] = np.array(["#08bf36", "#c0c2c4"])
        marker = list(table["names"][:n_marker])
        title = [f"Cluster_{cluster} - {gene}" for gene in marker]
        fig = sc.pl.embedding(adata, basis="X_" + embedding, color= ["selection"] + marker, show=False,
                              ncols=n_marker + 1, title= ["Highlighted Cluster"] + title, return_fig=True)
        pdf.savefig(fig)
    pdf.close()

6 - DEG between conditions per cluster using scanpy rank gene groups

⬐ Fill in input data here ⬎

[ ]:

%bgcolor PowderBlue

clustering_col = "leiden_0.1"
condition_col = "chamber"

# Method for gene ranking
deg_ranking_method = "t-test"
# Top X genes to be reported, None for all genes
deg_n_genes = None

# Minimum fraction of cells in a group that must express a gene to be considered as a marker gene
deg_min_in_group_fraction = 0.25
# Minimum foldchange (+/-) to be considered as a marker gene
deg_min_fold_change = 0.5
# Maximum fraction of cells in other groups that must express a gene to be considered as a marker gene
deg_max_out_group_fraction = 0.8

# Plotting
n_genes_deg_plot = 15
deg_style = "dots"  # Either 'dots' or 'heatmap'

[ ]:

for cluster in set(adata.obs[clustering_col]):
    print(f"Cluster {cluster}")
    adata_sub = adata[adata.obs[clustering_col] == cluster]

    # Check if sample count is sufficent
    value_counts = adata_sub.obs[condition_col].value_counts()
    insufficient_size = [i for i in value_counts.index if value_counts[i] == 1]
    if insufficient_size: print(f"Removed conditions due to insufficent size {insufficient_size}")
    adata_sub = adata_sub[~adata_sub.obs[condition_col].isin(insufficient_size)]
    if len(set(adata_sub.obs[condition_col])) < 2:
        print(f"Skipped Cluster {cluster}")
        continue

    tools.run_rank_genes(adata_sub, condition_col,
                         min_in_group_fraction=deg_min_in_group_fraction,
                         min_fold_change=deg_min_fold_change,
                         max_out_group_fraction=deg_max_out_group_fraction,
                         n_genes=deg_n_genes,
                         ranking_method=deg_ranking_method)

    # Plot dotplot of markers
    _ = pl.rank_genes_plot(adata_sub, key=f"rank_genes_{condition_col}_filtered",
                           n_genes=n_genes_deg_plot, style=deg_style,
                           save=f"DEG_{deg_style}_{condition_col}_cluster_{cluster}.pdf")

    # Write marker genes to table
    deg_table = tools.get_rank_genes_tables(adata_sub, out_group_fractions=True,
                                               key=f"rank_genes_{condition_col}_filtered",
                                               save_excel=f"DEG_{condition_col}_cluster_{cluster}.xlsx")

## 7 - Run DEseq2 between conditions/samples

⬐ Fill in input data here ⬎

[ ]:

%bgcolor PowderBlue

# Adjust which columns to use for DEseq2
sample_col = "sample"
condition_col = "chamber"

[ ]:

# Normalize raw counts across cells
d = sc.pp.normalize_total(adata, layer="raw", inplace=False) # returns a dict
adata.layers["raw_norm"] = d["X"]
adata.layers["raw_norm"] = adata.layers["raw_norm"].ceil().astype(int)

[ ]:

# Run DEseq2
deseq_table = tools.run_deseq2(adata, sample_col, condition_col, layer="raw_norm")

[ ]:

deseq_table.head(10)

[ ]:

deseq_table.to_excel(f"{utils.settings.table_dir}/DEseq_{sample_col}_vs_{condition_col}.xlsx")

8 - Save adata

Fix error when saving filtered rank gene names by removing nan in adata.uns[RANK_GENES_KEY][‘names’]

[ ]:

for key in adata.uns.keys():
    if key.startswith("rank_genes"):
        names = list()
        for i in adata.uns[key]["names"]:
            names.append([j if j == j else "" for j in i])
        adata.uns[key]["names"] = pd.DataFrame(data=names).to_records(index=False)

[ ]:

utils.save_h5ad(adata, "anndata_5.h5ad")