[ ]:
from sctoolbox.utilities import bgcolor
from sctoolbox import settings
Preparing adata for cellxgene / MaMPlan creation
1 - Description
1.1 - Preparing for cellxgene
This notebook prepares the anndata object for cellxgene. This preparation includes: - Removing unnecessary data to keep the resulting h5ad file as small as possible - Renaming columns for a nicer presentation in cellxgene - Converting unsupported datatypes to supported datatypes - Additional fixes for bugs between scanpy, anndata and cellxgene
1.2 - MaMPlan creation
See the MaMpok wiki for more detailed information about each parameter.
1.2.1 - Parameters
Parameter |
Description |
Options |
---|---|---|
project_id |
Project ID, e.g. ‘ext123’, ‘dst123’ |
str |
tool |
Select the cellxgene docker container. |
‘cellxgene-new’, ‘cellxgene-fix’, ‘cellxgene-vip-latest’ |
cluster |
Select the kubernetes cluster. |
‘BN’, ‘GI’, ‘GWDG’, ‘GWDGmanagt’, ‘BN_public’ |
organization |
Select organizations related to the project. Every user in one of the organizations will be able to access the dataset via the BCU repository. |
|
label |
Set label shown in the browser tab. |
str |
user |
List of users that, additionally to the organization, get access to the dataset via the BCU repository. |
List of LDAP user IDs |
owner |
Owner / Responsible person of the dataset. Set to public if public dataset. |
LDAP user ID or public |
analyst |
Analyst of the dataset. If None, analyst is set as current user. |
List of LDAP user ID; LDAP user ID or None |
pubmedid |
Pubmed ID of public datasets. |
Pubmed ID |
citation |
Citation of public dataset. |
str |
cpu_limit |
Set the limit of cpu cores that can be used by the deployment. |
int |
mem_limit |
Set the limit (in GB) of memory that can be used by the deployment. |
int |
cpu_request |
Set the requested amount of cpu cores that can be used by the deployment. |
int |
mem_request |
Set the requested amount (in GB) of memory that can be used by the deployment. |
int |
check_online |
If True, validate certain parameters using an online database. |
bool |
2 - Setup
[ ]:
import sctoolbox.utilities as utils
from packaging import version
import pandas as pd
3 - General Input
⬐ Fill in input data here ⬎
⬐ Fill in input data here ⬎
[ ]:
%bgcolor PowderBlue
# sctoolbox settings: input/output directories for anndata files and the log file path.
settings.adata_input_dir = "../adatas/"
settings.adata_output_dir = "../adatas/cellxgene/"
settings.log_file = "../logs/prepare_for_cellxgene_log.txt"
# File name of the anndata object to load (passed to utils.load_h5ad in section 4).
last_notebook_adata = "anndata_4.h5ad"
# Passed to the MaMPlan as `datatype`.
datatype = "scRNA"
# MaMPlan options (see the parameter table in section 1.2.1)
# If True, validate certain parameters using an online database.
check_online = True
## Project options
project_id = "Test-ID"  # also used as prefix for the output file names below
tool = "cellxgene-fix"  # other options: "cellxgene-new", "cellxgene-vip-latest"
cluster = "BN"
organization = ["AG-nerds"]
label = None
user = None
owner = "Test-owner"
analyst = None  # None -> the current user is used as analyst
## Options for public datasets
pubmedid = None
citation = None
## Options for computational resource management
### Limit
cpu_limit = None
mem_limit = None  # in GB
### Requested
cpu_request = None
mem_request = None  # in GB
# NOTE(review): mamplan_filename is not referenced by any later cell shown in this notebook — confirm it is still needed.
mamplan_filename = f"{project_id}_MaMPlan.yaml"
4 - Load anndata
[ ]:
# Load the anndata object whose file name was set in the general input cell.
# presumably resolved relative to settings.adata_input_dir — confirm against utils.load_h5ad
adata = utils.load_h5ad(last_notebook_adata)
# Show a summary of the loaded object.
display(adata)
5 - Prepare adata for cellxgene
In addition, every invalid or problematic datatype is checked for and cast to a fitting datatype if possible.
Note: Keep in mind that the resulting adata object should not be used for further analysis.
[ ]:
# Preview the object and its metadata tables before clean-up.
# Rows are truncated to 5 while all columns stay visible, so the user can
# pick the columns to keep/rename in the next cell.
preview_options = ("display.max.rows", 5, "display.max.columns", None)
with pd.option_context(*preview_options):
    for preview in (adata, adata.obs, adata.var):
        display(preview)
⬐ Fill in input data here ⬎
⬐ Fill in input data here ⬎
[ ]:
%bgcolor PowderBlue
# Columns of adata.obs (cell metadata) to keep in the cellxgene object.
keep_obs = [
    "sample",
    "batch",
    "celltype",
    "pct_counts_is_mito",
    "pct_counts_is_ribo",
    "phase",
    "clustering",
    "SCSA_pred_celltype",
    "marker_pred_celltype",
]

# Display names for adata.obs columns (original column name -> name shown in cellxgene).
rename_obs = {
    "sample": "Sample",
    "batch": "Batch",
    "celltype": "Celltype",
    # Label typo fixed: was "Mitochondiral content (%)".
    "pct_counts_is_mito": "Mitochondrial content (%)",
    "pct_counts_is_ribo": "Ribosomal content (%)",
    "phase": "Phase",
    "clustering": "Final Clustering",
    "SCSA_pred_celltype": "Predicted Celltype (SCSA)",
    "marker_pred_celltype": "Predicted Celltype (Marker)",
}

# Keep columns in adata.var (Gene metadata)
# An empty list removes all columns
keep_var = []
# Display names for adata.var columns (original column name -> new name).
rename_var = {}
5.1 - Add leiden columns
[ ]:
# Collect every adata.obs column whose name starts with "leiden".
leiden_cols = [col for col in adata.obs.columns if col.startswith("leiden")]
# Append only columns that are not listed yet, so re-running this cell (or
# listing a leiden column manually in keep_obs) does not create duplicate
# entries in keep_obs.
keep_obs += [col for col in leiden_cols if col not in keep_obs]
# Derive a display name from the column name, e.g. "leiden_0.5" -> "Leiden 0.5".
rename_obs |= {col: col.replace("_", " ").capitalize() for col in leiden_cols}
5.2 - Clean up adata
[ ]:
# Clean up adata for cellxgene using the column selections and display names
# configured in the cells above. inplace=True is passed and no return value is
# captured, so adata itself is expected to be modified by this call.
utils.prepare_for_cellxgene(
    adata,
    keep_obs=keep_obs,
    keep_var=keep_var,
    rename_obs=rename_obs,
    rename_var=rename_var,
    inplace=True,
)
[ ]:
# Preview the object and its metadata tables after clean-up
# (rows truncated to 5, all columns visible) to check the result.
preview_options = ("display.max.rows", 5, "display.max.columns", None)
with pd.option_context(*preview_options):
    for preview in (adata, adata.obs, adata.var):
        display(preview)
5.3 - Save adata
[ ]:
# Save the prepared adata; the file name is prefixed with the project ID.
# adata_output is also passed as `files` when the MaMPlan is created below.
adata_output = f"{project_id}_cellxgene.h5ad"
# presumably written below settings.adata_output_dir — confirm against utils.save_h5ad
utils.save_h5ad(adata, adata_output)
6 - Write MaMPlan
[ ]:
# MaMPlan creation needs mampok in at least this version.
min_mampok_version = "2.0.9"

try:
    import mampok
    import mampok.mamplan_creator as mc
except ModuleNotFoundError as err:
    # Keep the original error as the cause so the missing module stays visible.
    raise ModuleNotFoundError("Please install the latest mampok version.") from err

# Checked outside the try block so an outdated installation is reported
# directly instead of via a raised-and-recaught placeholder exception.
if version.parse(mampok.__version__) < version.parse(min_mampok_version):
    raise ModuleNotFoundError(
        "Please install the latest mampok version "
        f"(found {mampok.__version__}, required >= {min_mampok_version})."
    )
[ ]:
# Build the MaMPlan from the options set in the general input cell and the
# file name of the adata saved above.
mamplan = mc.SimpleMamplan(
    exp_id=project_id,
    files=adata_output,
    tool=tool,
    # Fall back to utils.get_user() when no analyst was configured.
    analyst=analyst or utils.get_user(),
    datatype=datatype,
    cluster=cluster,
    label=label,
    organization=organization,
    user=user,
    owner=owner,
    pubmedid=pubmedid,
    citation=citation,
    cpu_limit=cpu_limit,
    mem_limit=mem_limit,
    cpu_request=cpu_request,
    mem_request=mem_request,
    check_online=check_online,
)
[ ]:
# Write the MaMPlan; the target path is built from the adata output directory and the project ID.
# NOTE(review): mamplan_filename from the general input cell is not used here — confirm which path form save() expects.
mamplan.save(f"{settings.adata_output_dir}/{project_id}")
[ ]: