Tutorials¶

This page contains detailed tutorials for common scTOP workflows.

Tutorial 1: Basic Cell Type Identification¶

Goal: Score samples against a reference basis to identify cell types.

Step 1: Load Reference Basis¶

import sctop as top
import pandas as pd

# List available references
print(top.list_available_bases())

# Load Mouse Cell Atlas basis
basis, metadata = top.load_basis("MCKO legacy")

print(f"Basis: {basis.shape[0]} genes × {basis.shape[1]} cell types")
print(f"Cell types: {list(basis.columns)[:10]}...")  # First 10

Step 2: Load Your Data¶

import anndata as ad

# Load your scRNA-seq data
# Can be from: Scanpy, Cell Ranger, CSV, etc.
adata = ad.read_h5ad("your_sample.h5ad")

# OR load from CSV
# counts = pd.read_csv("counts.csv", index_col=0)  # genes × cells

print(f"Data: {adata.shape[0]} cells × {adata.shape[1]} genes")

Step 3: Process and Score¶

# Convert to DataFrame if using AnnData
if 'adata' in locals():
    counts = pd.DataFrame(
        adata.X.T if hasattr(adata.X, 'toarray') else adata.X.T,
        index=adata.var_names,
        columns=adata.obs_names
    )

# Process: normalize → log → rank → z-score
processed = top.process(counts)

# Score against basis
scores = top.score(basis, processed)

print(f"Scores: {scores.shape}")  # cell_types × cells

Step 4: Interpret Results¶

import matplotlib.pyplot as plt

# Single cell example
cell_id = scores.columns[0]
cell_scores = scores[cell_id].sort_values(ascending=False)

print(f"\n{cell_id} - Top 10 matches:")
print(cell_scores.head(10))

# Visualize
top.plot_highest(cell_scores, n=15)
plt.title(f'Cell Type Scores: {cell_id}')
plt.tight_layout()
plt.show()

Step 5: Visualize trajectories¶

::

import matplotlib.pyplot as plt

top.plot_two(: scores, cell_x=’Alveolar Type 1’, cell_y=’Alveolar Type 2’, gene=’Nkx2-1’

)

Tutorial 2: Creating a Custom Reference¶

Goal: Build your own reference basis from annotated data.

Step 1: Prepare Annotated Data¶

import anndata as ad
import sctop as top

# Load your atlas with cell type labels
adata = ad.read_h5ad("annotated_atlas.h5ad")

# Verify annotations
print("Cell type column:", 'cell_type' in adata.obs.columns)
print("\nCell type counts:")
print(adata.obs['cell_type'].value_counts().head(20))

Step 2: Quality Check¶

import matplotlib.pyplot as plt

# Check cell counts per type
counts = adata.obs['cell_type'].value_counts()

plt.figure(figsize=(12, 6))
counts.head(30).plot(kind='barh')
plt.xlabel('Number of Cells')
plt.title('Cells per Type (Top 30)')
plt.tight_layout()
plt.show()

# Decide on threshold (e.g., 100 cells minimum)
threshold = 100
types_above = counts[counts >= threshold]
print(f"\n{len(types_above)} types with ≥{threshold} cells")

Step 3: Create Basis¶

# Create basis with validation
results = top.create_basis(
    adata=adata,
    cell_type_column='cell_type',
    threshold=100,
    test_size=0.2,
    random_state=42,
    n_jobs=-1,
    plot_results=True
)

# Extract results
basis = results['basis']

Step 4: Review Performance¶

# Per-type accuracy
per_type = results['per_cell_type']

print("\nBest performing (Top 10):")
print(per_type.head(10)[['accuracy', 'total']])

print("\nWorst performing (Bottom 10):")
print(per_type.tail(10)[['accuracy', 'total']])

# Check confusion
cm = results['confusion_matrix']
labels = results['confusion_matrix_labels']

# Find most confused pairs
import numpy as np
np.fill_diagonal(cm, 0)
max_confusion_idx = np.unravel_index(cm.argmax(), cm.shape)
print(f"\nMost confused pair:")
print(f"  {labels[max_confusion_idx[0]]} → {labels[max_confusion_idx[1]]}")
print(f"  {cm[max_confusion_idx]} misclassifications")

Step 5: Improve Basis (Optional)¶

# If performance is poor, try:

# Option 1: Merge similar types (especially recommended for immune and stromal cells, or cell types that only differ over/under-expression of 1 gene)
# (Do this to your adata before create_basis)
mapping = {
    'T cell CD4+': 'T cell',
    'T cell CD8+': 'T cell',
    'T cell regulatory': 'T cell'
}
adata.obs['cell_type'] = adata.obs['cell_type'].replace(mapping)

# Option 2: Drop questionably-annotated types
# (Do this to your adata before create_basis)
to_drop = ['Unknown', 'Doublet', 'Mesenchymal Stem Cell']
adata = adata[~adata.obs['cell_type'].isin(to_drop), :]

# Option 3: Increase threshold for number of cells required per cell type
results = top.create_basis(
    adata=adata,
    cell_type_column='cell_type',
    threshold=200,  # More stringent
    test_size=0.2,
    random_state=42
)

# Option 4: Use ANOVA feature selection (last resort)
results = top.create_basis(
    adata=adata,
    cell_type_column='cell_type',
    threshold=100,
    do_anova=True,
    n_features=5000,
    test_size=0.2,
)

Step 6: Save Basis¶

# Save as HDF5
basis_adata = ad.AnnData(
    X=basis.T.values,
    obs=pd.DataFrame(index=basis.columns),
    var=pd.DataFrame(index=basis.index)
)
basis_adata.write_h5ad("my_custom_basis.h5ad")

# Save as CSV (for sharing)
basis.to_csv("my_basis.csv")

print(f"Saved basis: {basis.shape}")

Tutorial 3: Gene Contribution Analysis¶

Goal: Understand which genes drive cell type assignments.

Step 1: Compute Contributions¶

import sctop as top

# From Tutorial 1, we have:
# - basis: cell type reference
# - counts: sample expression data

# Compute contributions for specific types
contributions = top.compute_gene_contributions(
    data=counts,
    basis=basis,
    cell_types=['T cell', 'B cell', 'Macrophage'],
    process_data=True
)

print("Computed contributions for:")
for ct in contributions:
    print(f"  {ct}: {contributions[ct].shape}")

Step 2: Find Top Genes¶

# For each cell type
for cell_type in ['Alveolar Type 1 Cell', 'Alveolar Type 2 Cell', 'Basal Cell']:
    contrib = contributions[cell_type]

    top_genes = top.find_top_contributing_genes(
        contrib,
        n_genes=20,
        aggregate='mean'
    )

    print(f"\n{cell_type} - Top 20 marker genes:")
    for gene, score in top_genes.items():
        print(f"  {gene}: {score:.4f}")

Step 3: Visualize Contributions¶

# Single cell type heatmap
import seaborn as sns
import matplotlib.pyplot as plt

contrib = contributions['T cell']
top_genes = top.find_top_contributing_genes(contrib, n_genes=30).index

plt.figure(figsize=(12, 10))
sns.heatmap(
    contrib.loc[top_genes],
    cmap='YlOrRd',
    cbar_kws={'label': 'Contribution'}
)
plt.title('Top 30 T Cell Gene Contributions', fontsize=16)
plt.ylabel('Genes', fontsize=14)
plt.xlabel('Cells', fontsize=14)
plt.tight_layout()
plt.show()

Tutorial 4: Cross-Validation Workflow¶

Goal: Robustly evaluate basis quality using cross-validation.

Step 1: 5-Fold Cross-Validation¶

import sctop as top

results = top.create_basis(
    adata=adata,
    cell_type_column='cell_type',
    threshold=100,
    cv_folds=5,
    random_state=42,
    n_jobs=-1,
    plot_results=False  # We'll plot manually
)

# Cross-validation summary
cv_metrics = results['cv_avg_metrics']

print("Cross-Validation Results:")
print(f"  Accuracy: {cv_metrics['accuracy_mean']:.3f} ± {cv_metrics['accuracy_std']:.3f}")
print(f"  F1 (macro): {cv_metrics['f1_macro_mean']:.3f} ± {cv_metrics['f1_macro_std']:.3f}")

Step 2: Plot Fold-by-Fold Performance¶

import matplotlib.pyplot as plt
import numpy as np

cv_results = results['cv_results']

# Extract metrics per fold
folds = [r['fold'] for r in cv_results]
accuracies = [r['metrics']['accuracy'] for r in cv_results]
f1_scores = [r['metrics']['f1_weighted'] for r in cv_results]

# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

ax1.bar(folds, accuracies, color='steelblue', alpha=0.7)
ax1.axhline(np.mean(accuracies), color='red', linestyle='--', label='Mean')
ax1.set_xlabel('Fold')
ax1.set_ylabel('Accuracy')
ax1.set_title('Accuracy per Fold')
ax1.legend()
ax1.set_ylim(0, 1)

ax2.bar(folds, f1_scores, color='coral', alpha=0.7)
ax2.axhline(np.mean(f1_scores), color='red', linestyle='--', label='Mean')
ax2.set_xlabel('Fold')
ax2.set_ylabel('F1 Score (Weighted)')
ax2.set_title('F1 Score per Fold')
ax2.legend()
ax2.set_ylim(0, 1)

plt.tight_layout()
plt.show()

Step 3: Overall Confusion Matrix¶

from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

true_labels = results['all_true_labels']
pred_labels = results['all_predicted_labels']

# Plot confusion matrix (subset for readability)
unique_labels = sorted(set(true_labels))[:20]  # Top 20 types

fig, ax = plt.subplots(figsize=(14, 12))
ConfusionMatrixDisplay.from_predictions(
    true_labels,
    pred_labels,
    labels=unique_labels,
    normalize='true',
    xticks_rotation='vertical',
    ax=ax,
    values_format='.2f'
)
plt.title('Confusion Matrix (Top 20 Cell Types)', fontsize=16)
plt.tight_layout()
plt.show()

Best Practices¶

Data Quality¶

High-Quality Annotations: Ensure cell type labels are accurate and consistent.
Sufficient Cell Counts: Aim for at least 100 cells per type; more is better.
Use raw counts: scTOP expects raw counts, not normalized

Basis Quality¶

Check your basis quality by considering Top-1 accuracy and F1 scores. Seeing which cell types are confused for others is very informative, so make good use of the confusion matrix.

If the accuracy is bad:

Increase cell count threshold
Merge similar cell types
Remove rare or ambiguous types
Consider ANOVA feature selection as a last resort

Memory Management¶

For large datasets (>100k cells, >20k genes):

results = top.create_basis(
    adata=adata,
    cell_type_column='cell_type',
    threshold=100,
    n_jobs=4,  # Don't use all cores
    n_scoring_jobs=2,
    inner_chunk_size=500,  # Smaller chunks
    outer_chunks=100,  # More chunks
    do_anova=True,  # Reduce gene space
    n_features=5000
)

Processing speed vs memory tradeoff:

# Fast but memory-intensive
n_jobs=-1, n_scoring_jobs=8, inner_chunk_size=2000

# Slow but memory-efficient
n_jobs=2, n_scoring_jobs=1, inner_chunk_size=500