[ ]:
import sctoolbox
from sctoolbox.utilities import bgcolor

02 - QC and filtering


1 - Description

Quality control

Before analysing the single‐cell gene expression data, we must ensure that all cellular barcode data correspond to viable cells. Cell QC is commonly performed based on three QC covariates: the number of counts per barcode (count depth), the number of genes per barcode, and the fraction of counts from mitochondrial genes per barcode […]. The distributions of these QC covariates are examined for outlier peaks that are filtered out by thresholding […]. These outlier barcodes can correspond to dying cells, cells whose membranes are broken, or doublets. For example, barcodes with a low count depth, few detected genes, and a high fraction of mitochondrial counts are indicative of cells whose cytoplasmic mRNA has leaked out through a broken membrane, and thus, only mRNA located in the mitochondria is still conserved […]. In contrast, cells with unexpectedly high counts and a large number of detected genes may represent doublets.

SOURCE: 10.15252/msb.20188746


2 - Setup

[ ]:
import sctoolbox.utilities as utils
import sctoolbox.tools.qc_filter as qc
import sctoolbox.marker_genes as marker_genes
import sctoolbox.plotting as pl
import matplotlib.pyplot as plt
import pandas as pd

utils.settings_from_config("config.yaml", key="02")

3 - Load anndata

Uses the anndata object written by the previous notebook.

[ ]:
adata = utils.load_h5ad("anndata_1.h5ad")

with pd.option_context("display.max.rows", 5, "display.max.columns", None):
    display(adata)
    display(adata.obs)
    display(adata.var)

4 - QC and filtering


⬐ Fill in input data here ⬎

[ ]:
%bgcolor PowderBlue

# Set the species of the data
species = "human"

# Set the column in adata.obs containing the biological condition to evaluate
condition_column = "sample"

# Set the column in adata.var containing gene names (or set to None to use adata.var index)
gene_column = None

# Absolute minimum number of genes for pre-selection of cells before QC plotting
min_genes = 1

# Decide whether to remove doublets using scrublet (True) or to skip doublet calculation (False)
filter_doublets = True
threads = 4
doublet_threshold = 0.2

# Whether to try to predicting the sex of samples using the expression of a female gene
predict_sex = True
female_gene = "Xist"  # name of the gene to use for the sex assignment

# Decide whether to estimate thresholds individual per condition (False) or globally (True)
global_threshold = False

# Removal of gene subsets
filter_mito = True
filter_ribo = False
filter_gender = False

# Optional: Plot STARsolo quality if a path is given
quant_folder = ""

[ ]:
#Ensure that condition column is a category
adata.obs[condition_column] = adata.obs[condition_column].astype("category")

4.1 - Show STARsolo quality (optional)

If the data was mapped using STARsolo, use the parameter to set the path to the STARsolo runs and plot quality measures across runs. The path must be a folder, e.g. “path/to/starsolo_output”, which contains folders per condition e.g. “cond1”, “cond2”, etc.

[ ]:
if quant_folder != "":
    _ = pl.plot_starsolo_quality(quant_folder, save="starsolo_quality.pdf")
    _ = pl.plot_starsolo_UMI(quant_folder, ncol=3, save="starsolo_cell_selection.pdf")

4.2 - Label genes

Mark genes on their general association. E.g. mitochondrial.

[ ]:
qc_vars = marker_genes.label_genes(adata, gene_column=gene_column, species=species)
qc_vars
[ ]:
adata.obs

4.3 - Calculate QC metrics

Create quality control metrics to filter the data on.

[ ]:
adata = qc.calculate_qc_metrics(adata, qc_vars=qc_vars)

4.4 - Calculate doublet scores

[ ]:
# Set filter for number of genes before calculating doublets
n_cells_before = len(adata)
adata = adata[adata.obs["n_genes"] >= min_genes]
n_cells_after = len(adata)
print(f"Filtered out {n_cells_before-n_cells_after} cells which had less than {min_genes} gene(s) expressed.")
[ ]:
if filter_doublets:

    qc.estimate_doublets(adata, groupby=condition_column, threads=threads, threshold=doublet_threshold)

    #Remove the duplicates from adata
    qc.filter_cells(adata, "predicted_doublet", remove_bool=True)

4.5 - Predict sex per sample

[ ]:
if predict_sex:
    qc.predict_sex(adata, groupby=condition_column, gene_column=gene_column, gene=female_gene,
                   save="female_prediction.pdf")

4.6 - Cell filtering


Low and high count depth indicates cells with low integrity and doublets, respectively (DOI: 10.15252/msb.20188746).

[ ]:
# Choose columns to be used for filtering
obs_columns = ["n_genes", "log1p_total_counts"]
obs_columns += ["pct_counts_" + var for var in qc_vars if var != "is_gender"]

utils.add_uns_info(adata, "obs_metrics", obs_columns, how="append")
obs_columns

4.6.1 - Estimate initial thresholds automatically

[ ]:
groupby = condition_column if global_threshold is False else None
initial_thresholds = qc.automatic_thresholds(adata, which="obs", groupby=groupby, columns=obs_columns)
qc.thresholds_as_table(initial_thresholds) # show thresholds

4.6.2 - Customize thresholds via sliders

(Rerun cell if plot is not shown)

[ ]:
%matplotlib widget

#Plot violins and sliders
obs_figure, obs_slider_dict = pl.quality_violin(adata, obs_columns,
                                            groupby=condition_column,
                                            which="obs",
                                            thresholds=initial_thresholds,
                                            global_threshold=global_threshold,
                                            title="Cell quality control (before)",
                                            save="cell_filtering.png")
obs_figure
[ ]:
# Get final thresholds
final_thresholds = pl.get_slider_thresholds(obs_slider_dict)
qc.thresholds_as_table(final_thresholds) # show thresholds
[ ]:
# Show pairwise comparisons of column values w/ thresholds (mean values in case thresholds are grouped)
%matplotlib inline
plt.close()  # close previous figure
if len(final_thresholds) > 1:
    mean_thresholds = qc.get_mean_thresholds(final_thresholds)
    _ = pl.pairwise_scatter(adata.obs, obs_columns, thresholds=mean_thresholds, save="cell_filtering_scatter.pdf")

4.6.3 - Apply final thresholds

Filter the anndata object based on the thresholds in the threshold table.

[ ]:
qc.apply_qc_thresholds(adata, which="obs", thresholds=final_thresholds, groupby=groupby)

4.6.4 - Show data after filtering

[ ]:
%matplotlib inline

#Plot violins and sliders
figure, slider_dict = pl.quality_violin(adata, obs_columns,
                                     groupby=condition_column,
                                     which="obs", ncols=3,
                                     global_threshold = global_threshold,
                                     title="Cell quality control (after)",
                                     save="cell_filtering_final.png")
figure

4.7 - Gene filtering


[ ]:
#Recalculate quality measures for genes
adata = qc.calculate_qc_metrics(adata)
[ ]:
#Remove genes with 0 count
zero_bool = adata.var["n_cells_by_counts"] == 0
adata = adata[:,~zero_bool]

⬐ Fill in input data here ⬎

[ ]:
%bgcolor PowderBlue

#Choose columns for quality control
var_columns = ["n_cells_by_counts", "log1p_mean_counts"]

4.7.1 - Customize thresholds via sliders

(Rerun cell if plot is not shown)

[ ]:
%matplotlib widget

#Plot violins and sliders
var_figure, var_slider_dict = pl.quality_violin(adata, var_columns,
                                            which="var",
                                            title="Gene quality control (before)",
                                            save="gene_filtering.png")
var_figure

4.7.2 - Apply gene filtering

[ ]:
# Get final thresholds
final_thresholds = pl.get_slider_thresholds(var_slider_dict)
qc.thresholds_as_table(final_thresholds) # show thresholds
[ ]:
qc.apply_qc_thresholds(adata, which="var", thresholds=final_thresholds)

4.7.3 - Show data after filtering

[ ]:
%matplotlib inline

#Plot violins and sliders
figure, slider_dict = pl.quality_violin(adata, var_columns,
                                        which="var", ncols=3,
                                        title="Gene quality control (after)",
                                        save="gene_filtering_final.png")
figure

4.7.4 - Filter additional marked genes

Remove genes that are labeled as e.g. mitochondrial genes.

[ ]:
#Remove mitochrondrial genes
if filter_mito is True:
    print("Removing mitochrondrial genes:")
    qc.filter_genes(adata, "is_mito")

#Remove ribosomal genes
if filter_ribo is True:
    print("Removing ribosomal genes:")
    qc.filter_genes(adata, "is_ribo")

#Remove gender genes
if filter_gender is True:
    print("Removing gender genes:")
    qc.filter_genes(adata, "is_gender")

5 - Save filtered adata

Store the final results

[ ]:
adata
[ ]:
#Saving the data
adata_output = "anndata_2.h5ad"
utils.save_h5ad(adata, adata_output)
[ ]:
sctoolbox.settings.close_logfile()