##Introduction

We’ll review the current practices for some of the common steps in scRNA-seq data analysis in the Sessions 1-2. We’ll discuss different practices for each step and the assumptions underlying various tools and their limitations. This document provides an exposure to one of the popular tools for such analysis. As we’ll discuss, the right choice of method in any application depends on a number of factors, including the biological systems under study and the characteristics of the data in hand. For the purpose of our workshop, we’ll limit the hands-on component to the Seurat package in R. In general, analysis might require multiple tools in different languages and/or novel development. For more, please see the slide deck in materials.

The following is based on this vignette from the Seurat developers.

Setup the working environment

library(dplyr)
library(Seurat)
library(patchwork)

Load the data

Please ensure that the directory named “pbmc3k_data” in the workshop materials is in the same directory as this .Rmd file.

data <- CreateSeuratObject(counts = Read10X("data"),
                           project = "Hello_scWorld",  #Name this whatever.
                           min.cells = 3,  # Don't keep genes observed in fewer than 3 cells
                           min.features = 200,  # Don't keep cells with fewer than 200 genes
                           names.delim = NULL)  # Don't try and parse the sample names

Filter poor quality or uninteresting cells

#Assess percent of mitochondrial counts in each cell 
data[["percent_mt"]] <- PercentageFeatureSet(object = data, 
                                             pattern = "^mt-")

#Violin plot
VlnPlot(object = data, 
        features = c("nFeature_RNA", 
                   "nCount_RNA", 
                   "percent_mt"), 
        ncol = 3)

#Other plotting otions
plot1 <- FeatureScatter(object = data, 
                        feature1 = "nCount_RNA", 
                        feature2 = "percent_mt")
plot2 <- FeatureScatter(object = data,
                        feature1 = "nCount_RNA", 
                        feature2 = "nFeature_RNA")
plot1 + plot2

#Subset data
data <- subset(x = data, 
               subset =  nFeature_RNA > 200 & 
                 nCount_RNA > 950000 & 
                 percent_mt < 20)

VlnPlot(object = data, 
        features = c("nFeature_RNA",
                   "nCount_RNA",
                   "percent_mt"),
        ncol = 3)

Normalization

data <- NormalizeData(object = data, 
                      normalization.method = "LogNormalize", 
                      scale.factor = 10000)

Feature selection

data <- FindVariableFeatures( object = data, 
                              selection.method = "vst", 
                              nfeatures = 2000)

#View the 10 most highly variable genes
top10 <- head(x = VariableFeatures(object = data), 10)
print(top10)
##  [1] "Gm10800" "Ccl21a"  "Ccl19"   "Gm21541" "Glycam1" "Gm2564"  "Ccl19.1"
##  [8] "Gm10801" "Ccl20"   "Gzma"
#Seurat allows plotting variable features with and without labels
plot1 <- VariableFeaturePlot(object = data)
plot2 <- LabelPoints(plot = plot1, 
                     points = top10, 
                     repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 
## Warning: Transformation introduced infinite values in continuous x-axis

plot2
## Warning: Transformation introduced infinite values in continuous x-axis

Dimensionality reduction

Linear dimensionality reduction

#By default Seurat only scales the variable features.
#Explicit input required to rescale all the genes
scale_genes <- rownames(data)
data <- ScaleData(object = data, 
                  features = scale_genes)
## Centering and scaling data matrix
# Use the highly variable genes to find principal components
data <- RunPCA(object = data,
               features = VariableFeatures(object = data),  
               verbose = FALSE)  

#Examine and visualize PCA results a few different ways
print(x = data[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  Serping1, Col3a1, Col1a2, Col1a1, Serpinf1 
## Negative:  Cd74, Tyrobp, H2-Aa, H2-Ab1, Fcer1g 
## PC_ 2 
## Positive:  Plvap, Lrg1, Cldn5, Ctla2a, Fabp4 
## Negative:  Lyz2, Cd68, Fcer1g, Tyrobp, Lgals3 
## PC_ 3 
## Positive:  Nkg7, Ctsw, Icos, Gm26917, Cd27 
## Negative:  Plvap, Lrg1, Ms4a6d, Lyz2, Tmem88 
## PC_ 4 
## Positive:  Slc45a2, Tyrp1, Car14, Dct, Tspan10 
## Negative:  Cfb, Lrg1, Clu, Gm26905, Plvap 
## PC_ 5 
## Positive:  Nkg7, Actb, Ctsw, AW112010, Gzmb 
## Negative:  Gm11168, Gm26870, Gm10717, Gm21738, Gm10719
DimPlot(data, reduction = "pca")

data <- JackStraw(object = data, num.replicate = 100)
data <- ScoreJackStraw(object = data, dims = 1:20)
JackStrawPlot(data, dims = 1:15)
## Warning: Removed 21000 rows containing missing values (`geom_point()`).

ElbowPlot(data)

VizDimLoadings(object = data, dims = 1:2, reduction = "pca")

Nonlinear dimensionality reduction

data <- RunUMAP(data, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 10:18:04 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:18:04 Read 5438 rows and found 10 numeric columns
## 10:18:04 Using Annoy for neighbor search, n_neighbors = 30
## 10:18:04 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:18:05 Writing NN index file to temp file /var/folders/1t/y84b3_bs2wq0l8ks9pr15jmw0000gq/T//RtmpEYxVdI/file687b79e58625
## 10:18:05 Searching Annoy index using 1 thread, search_k = 3000
## 10:18:06 Annoy recall = 100%
## 10:18:06 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:18:06 Initializing from normalized Laplacian + noise (using irlba)
## 10:18:07 Commencing optimization for 500 epochs, with 228306 positive edges
## 10:18:15 Optimization finished
data <- RunTSNE(data, dims = 1:10)

DimPlot(data, reduction = "umap")

DimPlot(data, reduction = "tsne")

Clustering

data <- FindNeighbors(data, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
data <- FindClusters(data, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5438
## Number of edges: 177873
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9331
## Number of communities: 18
## Elapsed time: 0 seconds
DimPlot(data, reduction = "umap", label = TRUE) 

DimPlot(data, reduction = "tsne", label = TRUE)

DimPlot(data, reduction = "pca", label = TRUE)

Save the Seurat object

test <- data[, 1:10 ]
saveRDS(test, file = "hello_scWorld.rds")

Find markers

# find all markers of cluster 1
cluster1.markers <- FindMarkers(data, 
                                ident.1 = 1, 
                                min.pct = 0.25)
head(cluster1.markers, n = 5)
##       p_val avg_log2FC pct.1 pct.2 p_val_adj
## Spi1      0   1.672875 0.914 0.208         0
## Cybb      0   1.862620 0.857 0.147         0
## Fcgr1     0   1.924539 0.572 0.034         0
## Cd68      0   3.266696 0.923 0.191         0
## Ccl6      0   4.016911 0.819 0.091         0
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(data, 
                                ident.1 = 5, 
                                ident.2 = c(0, 3), 
                                min.pct = 0.25)
head(cluster5.markers,
     n = 5)
##                p_val avg_log2FC pct.1 pct.2     p_val_adj
## Sparc   0.000000e+00   4.614014 0.927 0.019  0.000000e+00
## Igfbp7 5.582004e-293   5.709793 0.850 0.021 1.191870e-288
## Ptrf   3.510770e-272   3.342912 0.774 0.009 7.496196e-268
## Clu    1.710715e-265   3.763025 0.784 0.016 3.652719e-261
## Plvap  1.616214e-262   4.448397 0.740 0.004 3.450939e-258
# find markers for every cluster compared to all remaining cells, report only the positive ones
data.markers <- FindAllMarkers(data, 
                               only.pos = TRUE, 
                               min.pct = 0.25, 
                               logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
data.markers %>%
  group_by(., cluster) %>% 
  top_n(., n = 2, wt = avg_log2FC)
## # A tibble: 36 × 7
## # Groups:   cluster [18]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene   
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>  
##  1 0               2.73 0.735 0.139 0         0       Tcf7   
##  2 3.68e-180       2.45 0.907 0.55  7.85e-176 0       Vps37b 
##  3 0               5.82 0.984 0.332 0         1       Lyz2   
##  4 2.35e-202       4.02 0.407 0.049 5.01e-198 1       Chil3  
##  5 5.55e-239       2.82 0.811 0.24  1.19e-234 2       Plbd1  
##  6 1.13e-150       3.19 0.938 0.789 2.42e-146 2       Cst3   
##  7 0               3.76 0.745 0.112 0         3       Cxcr6  
##  8 0               3.40 0.838 0.155 0         3       Icos   
##  9 0               4.46 0.935 0.038 0         4       Gpihbp1
## 10 0               3.68 0.938 0.104 0         4       Esam   
## # … with 26 more rows

Additional visualizations

VlnPlot(data, features = c("Vps37b", "Tcf7"))

FeaturePlot(data, features = c("Vps37b", "Tcf7"))

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] patchwork_1.1.2    SeuratObject_4.1.3 Seurat_4.3.0       dplyr_1.0.10      
## 
## loaded via a namespace (and not attached):
##   [1] ggbeeswarm_0.7.1       Rtsne_0.16             colorspace_2.0-3      
##   [4] deldir_1.0-6           ellipsis_0.3.2         ggridges_0.5.4        
##   [7] rstudioapi_0.14        spatstat.data_3.0-0    farver_2.1.1          
##  [10] leiden_0.4.3           listenv_0.9.0          ggrepel_0.9.2         
##  [13] fansi_1.0.3            codetools_0.2-18       splines_4.1.2         
##  [16] cachem_1.0.6           knitr_1.41             polyclip_1.10-4       
##  [19] jsonlite_1.8.4         ica_1.0-3              cluster_2.1.4         
##  [22] png_0.1-8              uwot_0.1.14            shiny_1.7.4           
##  [25] sctransform_0.3.5      spatstat.sparse_3.0-0  compiler_4.1.2        
##  [28] httr_1.4.4             assertthat_0.2.1       Matrix_1.5-1          
##  [31] fastmap_1.1.0          lazyeval_0.2.2         limma_3.50.3          
##  [34] cli_3.5.0              later_1.3.0            htmltools_0.5.4       
##  [37] tools_4.1.2            igraph_1.3.5           gtable_0.3.1          
##  [40] glue_1.6.2             RANN_2.6.1             reshape2_1.4.4        
##  [43] Rcpp_1.0.9             scattermore_0.8        jquerylib_0.1.4       
##  [46] vctrs_0.5.1            nlme_3.1-161           spatstat.explore_3.0-5
##  [49] progressr_0.12.0       lmtest_0.9-40          spatstat.random_3.0-1 
##  [52] xfun_0.35              stringr_1.5.0          globals_0.16.2        
##  [55] mime_0.12              miniUI_0.1.1.1         lifecycle_1.0.3       
##  [58] irlba_2.3.5.1          goftest_1.2-3          future_1.30.0         
##  [61] MASS_7.3-58.1          zoo_1.8-11             scales_1.2.1          
##  [64] promises_1.2.0.1       spatstat.utils_3.0-1   parallel_4.1.2        
##  [67] RColorBrewer_1.1-3     yaml_2.3.6             reticulate_1.26       
##  [70] pbapply_1.6-0          gridExtra_2.3          ggrastr_1.0.1         
##  [73] ggplot2_3.4.0          sass_0.4.4             stringi_1.7.8         
##  [76] highr_0.9              rlang_1.0.6            pkgconfig_2.0.3       
##  [79] matrixStats_0.63.0     evaluate_0.19          lattice_0.20-45       
##  [82] ROCR_1.0-11            purrr_1.0.0            tensor_1.5            
##  [85] labeling_0.4.2         htmlwidgets_1.6.0      cowplot_1.1.1         
##  [88] tidyselect_1.2.0       parallelly_1.33.0      RcppAnnoy_0.0.20      
##  [91] plyr_1.8.8             magrittr_2.0.3         R6_2.5.1              
##  [94] generics_0.1.3         DBI_1.1.3              withr_2.5.0           
##  [97] pillar_1.8.1           fitdistrplus_1.1-8     survival_3.4-0        
## [100] abind_1.4-5            sp_1.5-1               tibble_3.1.8          
## [103] future.apply_1.10.0    crayon_1.5.2           KernSmooth_2.23-20    
## [106] utf8_1.2.2             spatstat.geom_3.0-3    plotly_4.10.1         
## [109] rmarkdown_2.19         grid_4.1.2             data.table_1.14.6     
## [112] digest_0.6.31          xtable_1.8-4           tidyr_1.2.1           
## [115] httpuv_1.6.7           munsell_0.5.0          beeswarm_0.4.0        
## [118] viridisLite_0.4.1      vipor_0.4.5            bslib_0.4.2

THE END.