[ ]:
import sctoolbox
from sctoolbox.utilities import bgcolor
04 - Embedding and clustering
1 - Description
1.1 Embedding
Embeddings are dimension reduction methods to transform high-dimensional data into lower-dimensional representations while preserving the inherent structure and relationships between individual cells.
The sctoolbox supports the Uniform Manifold Approximation and Projection (UMAP) and the t-distributed stochastic neighbor embedding (t-SNE) methods for dimension reduction, with UMAP being set as the default value. To learn more about the differences between those methods and get more insight in the parameter selction have a look here for
umap and here for t-SNE. ### 1.2 Clustering Single cell clustering is used to group individual cells into clusters based on similarities in their gene expression. The clustering allows to identify distinct cell types and characterize cellular heterogeneity within a population. The sctoolbox supports the
leiden and the louvain clustering methods, with the leiden clustering algorithm being newer and recommended to use.
2 - Setup
[ ]:
import sctoolbox
import sctoolbox.tools as tools
import sctoolbox.plotting as pl
import sctoolbox.utils as utils
import scanpy as sc
import pandas as pd
utils.settings_from_config("config.yaml", key="04")
sc.set_figure_params(vector_friendly=True, dpi_save=600, scanpy=False)
3 - Load anndata
[ ]:
adata = utils.load_h5ad("anndata_3.h5ad")
with pd.option_context("display.max.rows", 5, "display.max.columns", None):
display(adata)
display(adata.obs)
display(adata.var)
4 - General input
⬐ Fill in input data here ⬎
⬐ Fill in input data here ⬎
[ ]:
%bgcolor PowderBlue
#Column to show in UMAPs
condition_column = 'sample'
#Number of threads to use for multiprocessing
threads = 4
# Search embedding parameters (or set parameters later)
embedding = "umap" #umap or tsne
search_parameters = True
dist_range = (0.1, 0.3, 0.1) # Set min_dist range for umap
spread_range = (1.0, 2.0, 0.5) # Set spread range for umap
n_components = 2 # Number of components for umap
perplexity_range = (30, 60, 10) # perplexity range for tsne
learning_rate_range = (600, 1000, 200) # learning_rate for tsne
# Search different clustering resolutions
search_clustering_parameters = True
clustering_method = "leiden" #leiden/louvain
# Annotate regions to genes
GTF_PATH = "test_data/hg38_genes.gtf" # genes gtf file
5 - Calculate UMAP/TSNE and find best setting
NOTE: min_dist: distances between points to make the plot looks more ‘clustered’
NOTE: spread: The effective scale of embedded points value be de default is 1
[ ]:
if search_parameters:
if embedding == "umap":
pl.search_umap_parameters(adata,
min_dist_range=dist_range,
spread_range=spread_range,
color=condition_column,
n_components=n_components,
threads=threads,
save="UMAP_parameter_search.pdf")
elif embedding == "tsne":
pl.search_tsne_parameters(adata,
perplexity_range=perplexity_range,
learning_rate_range=learning_rate_range,
color=condition_column,
threads=threads,
save="TSNE_parameter_search.pdf")
⬐ Fill in input data here ⬎
⬐ Fill in input data here ⬎
[ ]:
%bgcolor PowderBlue
# Final choice of spread / dist for umap
min_dist = 0.2
spread = 1.5
# Final choice of perplexity_range / perplexity_range for tsne
perplexity = 50
learning_rate = 800
[ ]:
# Calculate final embedding
if embedding == "umap":
sc.tl.umap(adata, min_dist=min_dist, spread=spread, n_components=n_components)
elif embedding == "tsne":
sc.tl.tsne(adata, perplexity=perplexity, learning_rate=learning_rate)
6 - Cell clustering
NOTE: resolution: controls the coarseness of the clustering. Higher values lead to more clusters.
[ ]:
# plot different clustering resolutions
if search_clustering_parameters:
pl.search_clustering_parameters(adata, ncols=4, method=clustering_method)
⬐ Fill in input data here ⬎
⬐ Fill in input data here ⬎
[ ]:
%bgcolor PowderBlue
# choose final resolution
clustering_column = "leiden_0.5"
6.1 - Reclustering
Here you can use the tools.recluster
function to iteratively adjust clustering
[ ]:
%bgcolor PowderBlue
# combine multiple clusters into one
# skipped when empty
combine = [] # add cluster names
if combine:
tools.recluster(adata=adata,
column=clustering_column,
clusters=combine,
task="join",
embedding=embedding,
key_added="recluster"
)
clustering_column = "recluster"
[ ]:
%bgcolor PowderBlue
# split (recluster) one or more clusters
# skipped when empty
split = [] # add cluster names
resolution=0.15 # 0-1, small values create less clusters
if split:
tools.recluster(adata=adata,
column=clustering_column,
clusters=split,
task="split",
resolution=resolution,
embedding=embedding,
key_added="recluster"
)
clustering_column = "recluster"
[ ]:
# Create final clustering
adata.obs["clustering"] = utils.rename_categories(adata.obs[clustering_column])
6.2 - Final clustering of cells
[ ]:
#Plot final leiden
sc.pl.embedding(adata, basis="X_" + embedding, color=[condition_column, "clustering"], show=False)
pl._save_figure("embedding_clustering.pdf")
7 - Plot distribution of cells across clusters
[ ]:
_ = pl.n_cells_barplot(adata, "clustering", groupby=condition_column,
save="cell_distribution_barplot.pdf")
8 - Generating 3D Object with UMAP coordinates in HTML
⬐ Fill in input data here ⬎
⬐ Fill in input data here ⬎
[ ]:
%bgcolor PowderBlue
#plot 3D html for the "clustering" adata.obs, change to individual leiden or other columns if needed
column_3d = "clustering"
[ ]:
if embedding == "umap" and n_components > 2:
pl.plot_3D_UMAP(adata, column_3d, save=f"umap_3d_{column_3d}")
html_file = sctoolbox.settings.full_figure_prefix + f"umap_3d_{column_3d}.html"
from IPython.display import IFrame
display(IFrame(src=html_file, width=800, height=400))
9 - Annotate regions to genes
This function uses UROPA to annotate regions to genes with a gtf file containing the genes as reference.
[ ]:
tools.annotate_adata(adata,
GTF_PATH,
config=None,
best=True,
threads=6,
coordinate_cols=None,
temp_dir="tmp",
inplace=True)
[ ]:
adata.var
10 - Saving adata for next notebook
[ ]:
utils.save_h5ad(adata, "anndata_4.h5ad")
[ ]:
sctoolbox.settings.close_logfile()