[ ]:
import sctoolbox
from sctoolbox.utilities import bgcolor

04 - Embedding and clustering


1 - Description

1.1 Embedding

Embeddings are dimension reduction methods to transform high-dimensional data into lower-dimensional representations while preserving the inherent structure and relationships between individual cells. The sctoolbox supports the Uniform Manifold Approximation and Projection (UMAP) and the t-distributed stochastic neighbor embedding (t-SNE) methods for dimension reduction, with UMAP being set as the default value. To learn more about the differences between those methods and get more insight in the parameter selction have a look here for umap and here for t-SNE. ### 1.2 Clustering Single cell clustering is used to group individual cells into clusters based on similarities in their gene expression. The clustering allows to identify distinct cell types and characterize cellular heterogeneity within a population. The sctoolbox supports the leiden and the louvain clustering methods, with the leiden clustering algorithm being newer and recommended to use.


2 - Setup

[ ]:
import scanpy as sc
import matplotlib.pyplot as plt
import sctoolbox.utils as utils
import sctoolbox.tools as tools
import sctoolbox.plotting as pl
import pandas as pd

utils.settings_from_config("config.yaml", key="04")

# Set additional options for figures
sc.set_figure_params(vector_friendly=True, dpi_save=600, scanpy=False)

3 - Load anndata

[ ]:
adata = utils.load_h5ad("anndata_3.h5ad")

with pd.option_context("display.max.rows", 5, "display.max.columns", None):
    display(adata)
    display(adata.obs)
    display(adata.var)

4 - General input

⬐ Fill in input data here ⬎

[ ]:
%bgcolor PowderBlue

#Column to show in UMAPs
condition_column = "sample"

#Number of threads to use for multiprocessing
threads = 4

# Search embedding parameters (or set parameters later)
embedding = "umap"   #umap or tsne
search_parameters = True
dist_range = (0.1, 1, 0.2) # Set min_dist range for umap
spread_range = (1.0, 2.0, 0.5) # Set spread range for umap
n_components = 2 # Number of components for umap
perplexity_range = (30, 60, 10)        # perplexity range for tsne
learning_rate_range = (600, 1000, 200)   # learning_rate for tsne


# Search different clustering resolutions
search_clustering_parameters = True
clustering_method = "leiden" #leiden/louvain

5 - Calculate UMAP/TSNE and find best setting

  • NOTE: min_dist: distances between points to make the plot looks more ‘clustered’

  • NOTE: spread: The effective scale of embedded points value be de default is 1

[ ]:
if search_parameters:
    if embedding == "umap":
        pl.search_umap_parameters(adata,
                                  min_dist_range=dist_range,
                                  spread_range=spread_range,
                                  color=condition_column,
                                  n_components=n_components,
                                  threads=threads,
                                  save="UMAP_parameter_search.pdf")
    elif embedding == "tsne":
        pl.search_tsne_parameters(adata,
                                  perplexity_range=perplexity_range,
                                  learning_rate_range=learning_rate_range,
                                  color=condition_column,
                                  threads=threads,
                                  save="TSNE_parameter_search.pdf")

⬐ Fill in input data here ⬎

[ ]:
%bgcolor PowderBlue

# Final choice of spread / dist for umap
min_dist = 0.4
spread = 2.5

# Final choice of perplexity_range / perplexity_range for tsne
perplexity = 50
learning_rate = 800

[ ]:
# Calculate final embedding
if embedding == "umap":
    sc.tl.umap(adata, min_dist=min_dist, spread=spread, n_components=n_components)
elif embedding == "tsne":
    sc.tl.tsne(adata, perplexity=perplexity, learning_rate=learning_rate)

6 - Plot final embedding with quality measures

[ ]:
# Adjust qc columns to show in plot
qc_columns = [condition_column] + list(adata.uns["sctoolbox"]["obs_metrics"])
[ ]:
# Plot final umap/tsne with quality measures
sc.pl.embedding(adata, basis=embedding, color=qc_columns, ncols=3, show=False)
pl._save_figure("embedding_quality.pdf")
[ ]:
# Plot distribution of samples in embedding
_ = pl.plot_group_embeddings(adata, groupby=condition_column, embedding=embedding,
                             save="embedding_sample_distribution.pdf")

7 - Cell clustering


  • NOTE: resolution: controls the coarseness of the clustering. Higher values lead to more clusters.

[ ]:
if search_clustering_parameters:
    pl.search_clustering_parameters(adata, ncols=4, method=clustering_method,
                                    embedding=embedding,
                                    save="clustering_search.png")

⬐ Fill in input data here ⬎

[ ]:
%bgcolor PowderBlue

#Choose final resolution
clustering_column = "leiden_0.5"

7.1 - Reclustering (optional)

Here you can use the tools.recluster function to iteratively adjust clustering. The two cells below are provided as a template for either joining clusters (first cell) or splitting clusters (second cell) copy the cells as needed. Fill in the list at the top of the respective cell to activate joining or splitting.

[ ]:
%bgcolor PowderBlue

# combine multiple clusters into one
# skipped when empty
combine = [] # add cluster names

if combine:
    tools.recluster(adata=adata,
                    column=clustering_column,
                    clusters=combine,
                    task="join",
                    embedding=embedding,
                    key_added="recluster"
                   )

    clustering_column = "recluster"
[ ]:
%bgcolor PowderBlue

# split (recluster) one or more clusters
# skipped when empty
split = [] # add cluster names
resolution=0.15 # 0-1, small values create less clusters

if split:
    tools.recluster(adata=adata,
                    column=clustering_column,
                    clusters=split,
                    task="split",
                    resolution=resolution,
                    embedding=embedding,
                    key_added="recluster"
                   )

    clustering_column = "recluster"
[ ]:
# Create final clustering
adata.obs["clustering"] = utils.rename_categories(adata.obs[clustering_column])
adata.uns["sctoolbox"]["obs_metrics"] = list(adata.uns["sctoolbox"]["obs_metrics"]) + ["clustering"]

7.2 - Final clustering of cells

[ ]:
#Plot final leiden
sc.pl.embedding(adata, basis="X_" + embedding, color=[condition_column, "clustering"], show=False)
pl._save_figure("embedding_clustering.pdf")

8 - Plot distribution of cells across clusters

[ ]:
_ = pl.n_cells_barplot(adata, "clustering", groupby=condition_column,
                       save="cell_distribution_barplot.pdf")

9 - Generating 3D Object with UMAP coordinates in HTML

⬐ Fill in input data here ⬎

[ ]:
%bgcolor PowderBlue

#plot 3D html for the "clustering" adata.obs, change to individual leiden or other columns if needed
column_3d = "clustering"

[ ]:
if embedding == "umap" and n_components > 2:
    pl.plot_3D_UMAP(adata, column_3d, save=f"umap_3d_{column_3d}")
    html_file = sctoolbox.settings.full_figure_prefix + f"umap_3d_{column_3d}.html"

    from IPython.display import IFrame
    display(IFrame(src=html_file, width=800, height=400))

10 - Saving adata for next notebook

[ ]:
utils.save_h5ad(adata, "anndata_4.h5ad")
[ ]:
sctoolbox.settings.close_logfile()