Overview

This quick start guide demonstrates the essential steps for evaluating cell type annotations using scTypeEval. For a comprehensive tutorial, see the main vignette.

library(scTypeEval)

Minimal Workflow

From a Count Matrix

library(Matrix)

# Simulate a small dataset: 500 genes x 100 cells of Poisson(5) counts.
# Every entry is drawn from the same distribution, so the cell-type labels
# assigned below carry no real signal.
set.seed(123)
counts <- Matrix(rpois(500 * 100, lambda = 5), nrow = 500, ncol = 100,
                 sparse = TRUE)
dimnames(counts) <- list(
  paste0("Gene", seq_len(nrow(counts))),
  paste0("Cell", seq_len(ncol(counts)))
)

# Per-cell annotations: four cell types of 25 consecutive cells each, with
# sample labels S1..S5 cycling along the cells (5 cells per type and sample).
metadata <- data.frame(
  celltype = rep(paste0("Type", c("A", "B", "C", "D")), each = 25),
  sample = paste0("S", rep(seq_len(5), times = 20)),
  row.names = colnames(counts)
)

# Build the scTypeEval object from the raw counts and the per-cell metadata
sceval <- create_scTypeEval(matrix = counts, metadata = metadata)

# Filter and normalise the counts, using the "celltype" metadata column as the
# annotation to evaluate and the "sample" column as the sample of origin
sceval <- run_processing_data(sceval, ident = "celltype", sample = "sample",
                              min_samples = 3, min_cells = 5)
#> # Processing data for single-cell ...
#>    Transforming and filtering count matrix...
#>    Normalizing count matrix via Log1p...
#> # Processing data for pseudobulk ...
#>    Transforming and filtering count matrix...
#>    Normalizing count matrix via Log1p...

# Select highly variable genes (HVGs) as the feature set
# NOTE(review): ngenes = 1000 is larger than the 500 genes in the example
# matrix; confirm how run_hvg() handles a request above the gene count.
sceval <- run_hvg(sceval, var_method = "basic", ngenes = 1000)
#> Not using black gene list
#> Computing HVG...

# Compute the PCA embedding, requesting 20 dimensions
sceval <- run_pca(sceval, ndim = 20)
#> 
#> Using HVG gene list.
#> Not using black gene list
#> # Computing PCA data for single-cell ...
#>    Filtering gene list...
#>    Filtering empty rows and cols...
#>    Computing PCA space...
#>    > Returning 20 dimensions for PCA
#> # Computing PCA data for pseudobulk ...
#>    Filtering gene list...
#>    Filtering empty rows and cols...
#>    Computing PCA space...
#>    > Returning 19 dimensions for PCA

# Euclidean dissimilarity between pseudobulk profiles, with reduction = TRUE
# (presumably the PCA space computed in the previous step)
sceval <- run_dissimilarity(sceval, method = "Pseudobulk:Euclidean",
                            reduction = TRUE)
#>    Running distance for euclidean...

# Silhouette-based consistency from the stored dissimilarity
# (the result holds one row per cell type)
results <- get_consistency(sceval,
                           dissimilarity_slot = "Pseudobulk:Euclidean",
                           consistency_metric = "silhouette")
#> Computing internal validation metrics for Pseudobulk:Euclidean ...
print(results)
#>       celltype      measure consistency_metric dissimilarity_method    ident
#> TypeA    TypeA  0.009929989         silhouette Pseudobulk:Euclidean celltype
#> TypeB    TypeB -0.011988536         silhouette Pseudobulk:Euclidean celltype
#> TypeC    TypeC -0.005448364         silhouette Pseudobulk:Euclidean celltype
#> TypeD    TypeD -0.020708784         silhouette Pseudobulk:Euclidean celltype

From a Seurat Object

library(Seurat)

# Wrap the example counts and metadata generated earlier in a Seurat object
seurat_obj <- Seurat::CreateSeuratObject(counts = counts, meta.data = metadata)

# The Seurat object is passed to create_scTypeEval() directly
sceval_seurat <- create_scTypeEval(seurat_obj)

# Continue with the standard workflow shown above, starting at run_processing_data()

From a SingleCellExperiment Object

library(SingleCellExperiment)

# Wrap the example counts and metadata generated earlier in a
# SingleCellExperiment, storing the matrix as the "counts" assay
sce <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = counts),
                                                  colData = metadata)

# The SingleCellExperiment is passed to create_scTypeEval() directly
sceval_sce <- create_scTypeEval(sce)

# Continue with the standard workflow shown above, starting at run_processing_data()

Common Use Cases

Compare Multiple Dissimilarity Methods

# Dissimilarity methods to compare
dissimilarity_methods <- c(
  "Pseudobulk:Euclidean",
  "Pseudobulk:Cosine",
  "WasserStein"
)

# Compute each dissimilarity in turn; every call stores its result in sceval
for (dissimilarity_method in dissimilarity_methods) {
  sceval <- run_dissimilarity(sceval, method = dissimilarity_method,
                              reduction = TRUE)
}
#>    Running distance for euclidean...
#>    Running distance for cosine...
#> Splitting matrices...
#> Computing pairwise WasserStein distance...

# Compare consistency across methods: passing several slot names at once
# returns the silhouette results for all of them in a single data frame
results_df <- get_consistency(sceval,
                              dissimilarity_slot = dissimilarity_methods,
                              consistency_metric = "silhouette")
#> Computing internal validation metrics for Pseudobulk:Euclidean ...
#> Computing internal validation metrics for Pseudobulk:Cosine ...
#> Computing internal validation metrics for WasserStein ...

results_df
#>        celltype      measure consistency_metric dissimilarity_method    ident
#> TypeA     TypeA  0.009929989         silhouette Pseudobulk:Euclidean celltype
#> TypeB     TypeB -0.011988536         silhouette Pseudobulk:Euclidean celltype
#> TypeC     TypeC -0.005448364         silhouette Pseudobulk:Euclidean celltype
#> TypeD     TypeD -0.020708784         silhouette Pseudobulk:Euclidean celltype
#> TypeA1    TypeA -0.003214443         silhouette    Pseudobulk:Cosine celltype
#> TypeB1    TypeB -0.002765134         silhouette    Pseudobulk:Cosine celltype
#> TypeC1    TypeC -0.013348235         silhouette    Pseudobulk:Cosine celltype
#> TypeD1    TypeD -0.022767009         silhouette    Pseudobulk:Cosine celltype
#> TypeA2    TypeA  0.034467020         silhouette          WasserStein celltype
#> TypeB2    TypeB -0.026486747         silhouette          WasserStein celltype
#> TypeC2    TypeC -0.026581239         silhouette          WasserStein celltype
#> TypeD2    TypeD -0.043528190         silhouette          WasserStein celltype

Evaluate Multiple Consistency Metrics

# Consistency metrics to evaluate on the same dissimilarity
consistency_metrics <- c(
  "silhouette",
  "NeighborhoodPurity",
  "Average_similarity"
)

# Passing several metric names at once returns all of them in one data frame
all_metrics <- get_consistency(sceval,
                               dissimilarity_slot = "Pseudobulk:Euclidean",
                               consistency_metric = consistency_metrics)
#> Computing internal validation metrics for Pseudobulk:Euclidean ...

all_metrics
#>        celltype      measure consistency_metric dissimilarity_method    ident
#> TypeA     TypeA  0.009929989         silhouette Pseudobulk:Euclidean celltype
#> TypeB     TypeB -0.011988536         silhouette Pseudobulk:Euclidean celltype
#> TypeC     TypeC -0.005448364         silhouette Pseudobulk:Euclidean celltype
#> TypeD     TypeD -0.020708784         silhouette Pseudobulk:Euclidean celltype
#> TypeA1    TypeA  0.320000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeB1    TypeB  0.240000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeC1    TypeC  0.240000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeD1    TypeD  0.200000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeA2    TypeA  0.504617271 Average_similarity Pseudobulk:Euclidean celltype
#> TypeB2    TypeB  0.500755570 Average_similarity Pseudobulk:Euclidean celltype
#> TypeC2    TypeC  0.501285885 Average_similarity Pseudobulk:Euclidean celltype
#> TypeD2    TypeD  0.498308342 Average_similarity Pseudobulk:Euclidean celltype

Visualize Results

# Heatmap of the stored Pseudobulk:Euclidean dissimilarities, sorted using the
# silhouette consistency
plot_heatmap(sceval, dissimilarity_slot = "Pseudobulk:Euclidean",
             sort_consistency = "silhouette")
#> Computing consistency metric for silhouette.
#> Consistency computed.


# PCA plot of the pseudobulk profiles (per sample and cell type)
plot_pca(sceval, reduction_slot = "pseudobulk")

Using Marker Genes Instead of HVGs

# Compute cell type marker genes with the "scran.findMarkers" method
sceval <- run_gene_markers(sceval, method = "scran.findMarkers",
                           ngenes_celltype = 50)
#> Not using black gene list
#> Computing cell type markers for celltype...

# Compute the dissimilarity on the marker genes: the gene list is referenced by
# the name of the method that produced it, and reduction is switched off
sceval <- run_dissimilarity(sceval,
                            method = "Pseudobulk:Euclidean",
                            gene_list = "scran.findMarkers",
                            reduction = FALSE)
#> 
#> Using scran.findMarkers gene list.
#> Not using black gene list
#>    Filtering gene list...
#>    Filtering empty rows and cols...
#>    Running distance for euclidean...

Focus on Specific Gene Sets

# Add custom gene list.
# With real data this would be a curated marker set, e.g.
# c("CD3D", "CD4", "CD8A", "CD19", "CD14", "NCAM1"). Those symbols are not rows
# of the synthetic example matrix (Gene1..Gene500), so a subset of the example
# genes stands in for the marker set here.
immune_genes <- paste0("Gene", seq_len(50))
sceval <- add_gene_list(
  sceval,
  gene_list = list("immune_markers" = immune_genes) # add a named list
)

# Run analysis on custom genes.
# reduction = FALSE is set as in the marker-gene example, so the distance is
# computed on the selected genes. Without it the call printed no
# "Using ... gene list." message, which suggests the custom list was ignored.
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Euclidean",
  gene_list = "immune_markers", # name of the list to use
  reduction = FALSE
)
#>    Running distance for euclidean...

Interpreting Results

What Low Scores Mean

Low consistency scores may indicate:

Ambiguous cell type boundaries between related types
Heterogeneous populations needing refinement
Annotation inconsistencies across samples

Next Steps for Low-Scoring Cell Types

Visualize using plot_heatmap() or plot_pca() to identify problematic samples
Investigate biological differences (e.g., disease vs. healthy)
Refine annotations by splitting or merging cell types

Available Methods and Metrics

Dissimilarity Methods

Pseudobulk:Euclidean - Euclidean distance on pseudobulk profiles
Pseudobulk:Cosine - Cosine distance on pseudobulk profiles
Pseudobulk:Pearson - Pearson correlation distance on pseudobulk profiles
WasserStein - Wasserstein distance between cell distributions
recip_classif:Match - Reciprocal classification matching
recip_classif:Score - Reciprocal classification scoring

Consistency Metrics

silhouette - Standard silhouette coefficient
2label_silhouette - Two-label silhouette variant
NeighborhoodPurity - K-nearest neighbor purity
ward_PropMatch - Ward clustering proportion match
Orbital_medoid - Medoid-based orbital metric
Average_similarity - Average within-group similarity

Tips and Best Practices

Always use multiple samples (minimum 3-5 per cell type)
Compare different methods - no single method is perfect
Use PCA reduction (reduction = TRUE) for speed - results are usually similar and computation is much faster
Start with HVGs - then try marker genes if needed
Check sample sizes - ensure adequate cells per type per sample
Interpret in context - consider biological heterogeneity

Getting Help

GitHub: https://github.com/carmonalab/scTypeEval
Issues: https://github.com/carmonalab/scTypeEval/issues
Main vignette: browseVignettes("scTypeEval")

Session Info

# Print the R version, platform and package versions used to build this guide
sessionInfo()
#> R version 4.6.1 (2026-06-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 26.04 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.32.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] SingleCellExperiment_1.35.2 SummarizedExperiment_1.43.0
#>  [3] Biobase_2.73.1              GenomicRanges_1.65.1       
#>  [5] Seqinfo_1.3.0               IRanges_2.47.2             
#>  [7] S4Vectors_0.51.5            BiocGenerics_0.59.10       
#>  [9] generics_0.1.4              MatrixGenerics_1.25.0      
#> [11] matrixStats_1.5.0           Seurat_5.5.1               
#> [13] SeuratObject_5.4.0          sp_2.2-1                   
#> [15] Matrix_1.7-5                scTypeEval_1.1.2           
#> [17] BiocStyle_2.41.0           
#> 
#> loaded via a namespace (and not attached):
#>   [1] RColorBrewer_1.1-3     sys_3.4.3              jsonlite_2.0.0        
#>   [4] magrittr_2.0.5         spatstat.utils_3.2-3   farver_2.1.2          
#>   [7] rmarkdown_2.31         vctrs_0.7.3            ROCR_1.0-12           
#>  [10] spatstat.explore_3.8-1 S4Arrays_1.13.0        htmltools_0.5.9       
#>  [13] BiocNeighbors_2.7.2    SparseArray_1.13.2     sass_0.4.10           
#>  [16] sctransform_0.4.3      parallelly_1.48.0      KernSmooth_2.23-26    
#>  [19] bslib_0.11.0           htmlwidgets_1.6.4      ica_1.0-3             
#>  [22] plyr_1.8.9             plotly_4.12.0          zoo_1.8-15            
#>  [25] cachem_1.1.0           buildtools_1.0.0       igraph_2.3.3          
#>  [28] mime_0.13              lifecycle_1.0.5        pkgconfig_2.0.3       
#>  [31] rsvd_1.0.5             R6_2.6.1               fastmap_1.2.0         
#>  [34] fitdistrplus_1.2-6     future_1.70.0          shiny_1.14.0          
#>  [37] digest_0.6.39          patchwork_1.3.2        tensor_1.5.1          
#>  [40] dqrng_0.4.1            RSpectra_0.16-2        irlba_2.3.7           
#>  [43] beachmat_2.29.0        labeling_0.4.3         progressr_1.0.0       
#>  [46] spatstat.sparse_3.2-0  httr_1.4.8             polyclip_1.10-7       
#>  [49] abind_1.4-8            compiler_4.6.1         withr_3.0.3           
#>  [52] S7_0.2.2               BiocParallel_1.47.0    fastDummies_1.7.6     
#>  [55] MASS_7.3-66            DelayedArray_0.39.3    bluster_1.23.0        
#>  [58] tools_4.6.1            lmtest_0.9-40          otel_0.2.0            
#>  [61] httpuv_1.6.17          future.apply_1.20.2    goftest_1.2-3         
#>  [64] glue_1.8.1             nlme_3.1-170           promises_1.5.0        
#>  [67] grid_4.6.1             Rtsne_0.17             cluster_2.1.8.2       
#>  [70] reshape2_1.4.5         gtable_0.3.6           spatstat.data_3.1-9   
#>  [73] tidyr_1.3.2            data.table_1.18.4      metapod_1.21.0        
#>  [76] ScaledMatrix_1.21.0    BiocSingular_1.29.0    XVector_0.53.0        
#>  [79] spatstat.geom_3.8-1    RcppAnnoy_0.0.23       ggrepel_0.9.8         
#>  [82] RANN_2.6.2             pillar_1.11.1          stringr_1.6.0         
#>  [85] limma_3.69.2           spam_2.11-4            RcppHNSW_0.7.0        
#>  [88] later_1.4.8            splines_4.6.1          dplyr_1.2.1           
#>  [91] lattice_0.22-9         survival_3.8-9         deldir_2.0-4          
#>  [94] tidyselect_1.2.1       locfit_1.5-9.12        scuttle_1.23.1        
#>  [97] maketools_1.3.2        miniUI_0.1.2           pbapply_1.7-4         
#> [100] transport_0.15-4       knitr_1.51             gridExtra_2.3.1       
#> [103] edgeR_4.11.4           scattermore_1.2        xfun_0.60             
#> [106] statmod_1.5.2          stringi_1.8.7          lazyeval_0.2.3        
#> [109] yaml_2.3.12            evaluate_1.0.5         codetools_0.2-20      
#> [112] tibble_3.3.1           BiocManager_1.30.27    cli_3.6.6             
#> [115] uwot_0.2.4             xtable_1.8-8           reticulate_1.46.0     
#> [118] jquerylib_0.1.4        Rcpp_1.1.2             globals_0.19.1        
#> [121] spatstat.random_3.5-0  png_0.1-9              spatstat.univar_3.2-0 
#> [124] parallel_4.6.1         ggplot2_4.0.3          dotCall64_1.2         
#> [127] scran_1.41.1           listenv_1.0.0          viridisLite_0.4.3     
#> [130] scales_1.4.0           ggridges_0.5.7         purrr_1.2.2           
#> [133] rlang_1.3.0            cowplot_1.2.0

Quick Start Guide for scTypeEval