This quick start guide demonstrates the essential steps for
evaluating cell type annotations using scTypeEval. For a
comprehensive tutorial, see the main vignette.
library(Matrix)
# Simulate a toy dataset: Poisson counts for 500 genes x 100 cells, with
# four cell types (25 cells each) spread evenly over five samples.
set.seed(123)
counts <- Matrix(
  rpois(500 * 100, lambda = 5),
  nrow = 500,
  ncol = 100,
  sparse = TRUE
)
# Label genes (rows) and cells (columns) in one assignment.
dimnames(counts) <- list(
  paste0("Gene", seq_len(nrow(counts))),
  paste0("Cell", seq_len(ncol(counts)))
)
# Per-cell annotations, keyed by cell name so the rows line up with the
# columns of `counts`.
metadata <- data.frame(
  celltype = rep(paste0("Type", LETTERS[1:4]), each = 25),
  sample = rep_len(paste0("S", 1:5), ncol(counts)),
  row.names = colnames(counts)
)
# Create object
# Build the scTypeEval object from the simulated count matrix and the
# per-cell metadata defined above.
sceval <- create_scTypeEval(matrix=counts, metadata=metadata)
# Process data
# `ident` and `sample` are set to the names of the two metadata columns
# (cell type annotation and sample of origin).
# NOTE(review): min_samples / min_cells look like filtering thresholds
# (samples per cell type, cells per group) -- confirm in ?run_processing_data.
sceval <- run_processing_data(
sceval,
ident = "celltype",
sample = "sample",
min_samples = 3,
min_cells = 5
)
# The log below shows that both a single-cell and a pseudobulk matrix are
# filtered and Log1p-normalized.
#> # Processing data for single-cell ...
#> Transforming and filtering count matrix...
#> Normalizing count matrix via Log1p...
#> # Processing data for pseudobulk ...
#> Transforming and filtering count matrix...
#> Normalizing count matrix via Log1p...
# Identify features
# Select highly variable genes (HVG); per the log, no gene blacklist is used.
# NOTE(review): ngenes = 1000 exceeds the 500 genes in the simulated matrix,
# so presumably every gene is retained -- confirm against ?run_hvg.
sceval <- run_hvg(sceval,
var_method = "basic",
ngenes = 1000)
#> Not using black gene list
#> Computing HVG...
# Run PCA
# Request 20 principal components. Per the log, PCA uses the HVG gene list
# and is computed separately for the single-cell and the pseudobulk data.
sceval <- run_pca(sceval, ndim = 20)
#>
#> Using HVG gene list.
#> Not using black gene list
#> # Computing PCA data for single-cell ...
#> Filtering gene list...
#> Filtering empty rows and cols...
#> Computing PCA space...
#> > Returning 20 dimensions for PCA
#> # Computing PCA data for pseudobulk ...
#> Filtering gene list...
#> Filtering empty rows and cols...
#> Computing PCA space...
#> > Returning 19 dimensions for PCA
# NOTE(review): only 19 dimensions come back for pseudobulk, presumably
# because 4 cell types x 5 samples give at most 20 pseudobulk profiles -- confirm.
# Compute dissimilarity
# NOTE(review): reduction = TRUE presumably makes the distance use the PCA
# space from run_pca() instead of gene-level values -- confirm in
# ?run_dissimilarity.
sceval <- run_dissimilarity(
sceval,
method = "Pseudobulk:Euclidean",
reduction = TRUE
)
#> Running distance for euclidean...
# Get consistency
# Score the annotation with the silhouette metric; `dissimilarity_slot`
# repeats the `method` string passed to run_dissimilarity() above.
results <- get_consistency(
sceval,
dissimilarity_slot = "Pseudobulk:Euclidean",
consistency_metric = "silhouette"
)
#> Computing internal validation metrics for Pseudobulk:Euclidean ...
# One row per cell type. The values are all close to zero; the counts are
# i.i.d. Poisson draws, so the labels carry no real signal.
print(results)
#> celltype measure consistency_metric dissimilarity_method ident
#> TypeA TypeA 0.009929989 silhouette Pseudobulk:Euclidean celltype
#> TypeB TypeB -0.011988536 silhouette Pseudobulk:Euclidean celltype
#> TypeC TypeC -0.005448364 silhouette Pseudobulk:Euclidean celltype
#> TypeD TypeD -0.020708784 silhouette Pseudobulk:Euclidean celltype
# Compute different dissimilarity methods
# Fill three dissimilarity slots on the same object: two pseudobulk
# distances (Euclidean, cosine) and a Wasserstein distance.
# This first call repeats the earlier Pseudobulk:Euclidean computation with
# identical arguments.
sceval <- run_dissimilarity(
sceval,
method = "Pseudobulk:Euclidean",
reduction = TRUE
)
#> Running distance for euclidean...
# Same pseudobulk input, cosine distance.
sceval <- run_dissimilarity(
sceval,
method = "Pseudobulk:Cosine",
reduction = TRUE
)
#> Running distance for cosine...
# Per the log, this method splits the matrices and computes pairwise
# Wasserstein distances.
sceval <- run_dissimilarity(
sceval,
method = "WasserStein",
reduction = TRUE
)
#> Splitting matrices...
#> Computing pairwise WasserStein distance...
# Compare consistency across methods
# Names of the three dissimilarity slots to score.
dissimilarity_methods <- c("Pseudobulk:Euclidean",
"Pseudobulk:Cosine",
"WasserStein")
# A character vector of slots scores every listed dissimilarity in one call;
# the output below stacks them in one table (4 cell types x 3 methods).
results_df <-
get_consistency(
sceval,
dissimilarity_slot = dissimilarity_methods, # compute for multiple dissimilarities
consistency_metric = "silhouette"
)
#> Computing internal validation metrics for Pseudobulk:Euclidean ...
#> Computing internal validation metrics for Pseudobulk:Cosine ...
#> Computing internal validation metrics for WasserStein ...
# Use the `dissimilarity_method` column to tell the methods apart.
results_df
#> celltype measure consistency_metric dissimilarity_method ident
#> TypeA TypeA 0.009929989 silhouette Pseudobulk:Euclidean celltype
#> TypeB TypeB -0.011988536 silhouette Pseudobulk:Euclidean celltype
#> TypeC TypeC -0.005448364 silhouette Pseudobulk:Euclidean celltype
#> TypeD TypeD -0.020708784 silhouette Pseudobulk:Euclidean celltype
#> TypeA1 TypeA -0.003214443 silhouette Pseudobulk:Cosine celltype
#> TypeB1 TypeB -0.002765134 silhouette Pseudobulk:Cosine celltype
#> TypeC1 TypeC -0.013348235 silhouette Pseudobulk:Cosine celltype
#> TypeD1 TypeD -0.022767009 silhouette Pseudobulk:Cosine celltype
#> TypeA2 TypeA 0.034467020 silhouette WasserStein celltype
#> TypeB2 TypeB -0.026486747 silhouette WasserStein celltype
#> TypeC2 TypeC -0.026581239 silhouette WasserStein celltype
#> TypeD2 TypeD -0.043528190 silhouette WasserStein celltype
# Compute multiple consistency metrics
# Metrics to evaluate on a single dissimilarity slot.
consistency_metrics <- c("silhouette",
"NeighborhoodPurity",
"Average_similarity")
# A character vector of metrics returns one stacked table, with the metric
# recorded in the `consistency_metric` column (see output below).
all_metrics <-
get_consistency(
sceval,
dissimilarity_slot = "Pseudobulk:Euclidean",
consistency_metric = consistency_metrics
)
#> Computing internal validation metrics for Pseudobulk:Euclidean ...
# The metrics are on different scales here, so compare cell types within a
# metric rather than across metrics.
all_metrics
#> celltype measure consistency_metric dissimilarity_method ident
#> TypeA TypeA 0.009929989 silhouette Pseudobulk:Euclidean celltype
#> TypeB TypeB -0.011988536 silhouette Pseudobulk:Euclidean celltype
#> TypeC TypeC -0.005448364 silhouette Pseudobulk:Euclidean celltype
#> TypeD TypeD -0.020708784 silhouette Pseudobulk:Euclidean celltype
#> TypeA1 TypeA 0.320000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeB1 TypeB 0.240000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeC1 TypeC 0.240000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeD1 TypeD 0.200000000 NeighborhoodPurity Pseudobulk:Euclidean celltype
#> TypeA2 TypeA 0.504617271 Average_similarity Pseudobulk:Euclidean celltype
#> TypeB2 TypeB 0.500755570 Average_similarity Pseudobulk:Euclidean celltype
#> TypeC2 TypeC 0.501285885 Average_similarity Pseudobulk:Euclidean celltype
#> TypeD2 TypeD 0.498308342 Average_similarity Pseudobulk:Euclidean celltype
# Heatmap of dissimilarities
# Draw the Pseudobulk:Euclidean dissimilarity matrix as a heatmap.
# Per the log, the silhouette consistency is computed as part of this call.
# NOTE(review): sort_consistency presumably orders the heatmap by that
# metric -- confirm in ?plot_heatmap.
plot_heatmap(
sceval,
dissimilarity_slot = "Pseudobulk:Euclidean",
sort_consistency = "silhouette"
)
#> Computing consistency metric for silhouette.
#> Consistency computed.
# Identify cell type markers
# Compute per-cell-type marker genes with the "scran.findMarkers" method.
# NOTE(review): ngenes_celltype = 50 presumably caps the markers kept per
# cell type -- confirm in ?run_gene_markers.
sceval <- run_gene_markers(
sceval,
method = "scran.findMarkers",
ngenes_celltype = 50
)
#> Not using black gene list
#> Computing cell type markers for celltype...
# Use markers for dissimilarity calculation
# With reduction = FALSE the log reports the scran.findMarkers gene list
# being used and filtered before the distance is computed.
sceval <- run_dissimilarity(
sceval,
method = "Pseudobulk:Euclidean",
gene_list = "scran.findMarkers", # gene list recently added
reduction = FALSE
)
#>
#> Using scran.findMarkers gene list.
#> Not using black gene list
#> Filtering gene list...
#> Filtering empty rows and cols...
#> Running distance for euclidean...
# Add custom gene list
# Example of a user-supplied gene set.
# NOTE(review): these symbols do not occur in the simulated matrix (its rows
# are Gene1..Gene500), so this is illustrative only -- use identifiers that
# are present in your own data.
immune_genes <- c("CD3D", "CD4", "CD8A", "CD19", "CD14", "NCAM1")
# Register the set under the name "immune_markers"; the name is what later
# calls refer to.
sceval <- add_gene_list(
sceval,
gene_list = list("immune_markers" = immune_genes) # add a named list
)
# Run analysis on custom genes
# NOTE(review): `reduction` is not set here, unlike the marker-based call
# that passes reduction = FALSE, and the recorded log shows no
# "Using ... gene list" message -- confirm the custom list is actually
# applied under the default `reduction`.
sceval <- run_dissimilarity(
sceval,
method = "Pseudobulk:Euclidean",
gene_list = "immune_markers" # name of the list to use
)
#> Running distance for euclidean...

Low consistency scores may indicate problematic samples; use
plot_heatmap() or plot_pca() to identify them.

browseVignettes("scTypeEval")

sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] SingleCellExperiment_1.35.1 SummarizedExperiment_1.43.0
#> [3] Biobase_2.73.1 GenomicRanges_1.65.0
#> [5] Seqinfo_1.3.0 IRanges_2.47.2
#> [7] S4Vectors_0.51.3 BiocGenerics_0.59.6
#> [9] generics_0.1.4 MatrixGenerics_1.25.0
#> [11] matrixStats_1.5.0 Seurat_5.5.0
#> [13] SeuratObject_5.4.0 sp_2.2-1
#> [15] Matrix_1.7-5 scTypeEval_1.1.0
#> [17] BiocStyle_2.41.0
#>
#> loaded via a namespace (and not attached):
#> [1] RColorBrewer_1.1-3 sys_3.4.3 jsonlite_2.0.0
#> [4] magrittr_2.0.5 spatstat.utils_3.2-3 farver_2.1.2
#> [7] rmarkdown_2.31 vctrs_0.7.3 ROCR_1.0-12
#> [10] spatstat.explore_3.8-1 S4Arrays_1.13.0 htmltools_0.5.9
#> [13] BiocNeighbors_2.7.2 SparseArray_1.13.2 sass_0.4.10
#> [16] sctransform_0.4.3 parallelly_1.47.0 KernSmooth_2.23-26
#> [19] bslib_0.11.0 htmlwidgets_1.6.4 ica_1.0-3
#> [22] plyr_1.8.9 plotly_4.12.0 zoo_1.8-15
#> [25] cachem_1.1.0 buildtools_1.0.0 igraph_2.3.1
#> [28] mime_0.13 lifecycle_1.0.5 pkgconfig_2.0.3
#> [31] rsvd_1.0.5 R6_2.6.1 fastmap_1.2.0
#> [34] fitdistrplus_1.2-6 future_1.70.0 shiny_1.13.0
#> [37] digest_0.6.39 patchwork_1.3.2 tensor_1.5.1
#> [40] dqrng_0.4.1 RSpectra_0.16-2 irlba_2.3.7
#> [43] beachmat_2.29.0 labeling_0.4.3 progressr_0.19.0
#> [46] spatstat.sparse_3.2-0 httr_1.4.8 polyclip_1.10-7
#> [49] abind_1.4-8 compiler_4.6.0 withr_3.0.2
#> [52] S7_0.2.2 BiocParallel_1.47.0 fastDummies_1.7.6
#> [55] MASS_7.3-65 DelayedArray_0.39.3 bluster_1.23.0
#> [58] tools_4.6.0 lmtest_0.9-40 otel_0.2.0
#> [61] httpuv_1.6.17 future.apply_1.20.2 goftest_1.2-3
#> [64] glue_1.8.1 nlme_3.1-169 promises_1.5.0
#> [67] grid_4.6.0 Rtsne_0.17 cluster_2.1.8.2
#> [70] reshape2_1.4.5 gtable_0.3.6 spatstat.data_3.1-9
#> [73] tidyr_1.3.2 data.table_1.18.4 metapod_1.21.0
#> [76] ScaledMatrix_1.21.0 BiocSingular_1.29.0 XVector_0.53.0
#> [79] spatstat.geom_3.8-1 RcppAnnoy_0.0.23 ggrepel_0.9.8
#> [82] RANN_2.6.2 pillar_1.11.1 stringr_1.6.0
#> [85] limma_3.69.1 spam_2.11-3 RcppHNSW_0.7.0
#> [88] later_1.4.8 splines_4.6.0 dplyr_1.2.1
#> [91] lattice_0.22-9 survival_3.8-6 deldir_2.0-4
#> [94] tidyselect_1.2.1 locfit_1.5-9.12 scuttle_1.23.1
#> [97] maketools_1.3.2 miniUI_0.1.2 pbapply_1.7-4
#> [100] transport_0.15-4 knitr_1.51 gridExtra_2.3
#> [103] edgeR_4.11.1 scattermore_1.2 xfun_0.57
#> [106] statmod_1.5.2 stringi_1.8.7 lazyeval_0.2.3
#> [109] yaml_2.3.12 evaluate_1.0.5 codetools_0.2-20
#> [112] tibble_3.3.1 BiocManager_1.30.27 cli_3.6.6
#> [115] uwot_0.2.4 xtable_1.8-8 reticulate_1.46.0
#> [118] jquerylib_0.1.4 Rcpp_1.1.1-1.1 globals_0.19.1
#> [121] spatstat.random_3.5-0 png_0.1-9 spatstat.univar_3.2-0
#> [124] parallel_4.6.0 ggplot2_4.0.3 dotCall64_1.2
#> [127] scran_1.41.1 listenv_0.10.1 viridisLite_0.4.3
#> [130] scales_1.4.0 ggridges_0.5.7 purrr_1.2.2
#> [133] rlang_1.2.0 cowplot_1.2.0