#!/bin/bash
# Create data directory to store downloaded files
mkdir -p data/filtered_counts
# Metadata csv file
wget wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE160nnn/GSE160585/suppl/GSE160585%5Fmetadata%5Ffor%5Fpseudotime%5Fand%5Fpseudobulk%5FDGE.csv.gz -O data/meta.csv.gz
# Features, barcodes, and counts matrix
wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE160nnn/GSE160585/suppl/GSE160585%5Ffiltered%5Fraw%5Fcounts%5Ffor%5Fpseudotime%5Fand%5Fpseudobulk%5FDGE.tar.gz -O data/filtered_counts.tar.gz
# Unzip and decompress the files
tar -xvf data/filtered_counts.tar.gz -C data/filtered_counts
gunzip data/meta.csv.gz
Sample pre-processing
Approximate time: 15 minutes
Learning Objectives
- Understand the steps taken to generate the Seurat object used as input for the workshop.
Sample data
For this workshop, we will be working with a single-cell RNA-seq dataset from Tseng et al, 2021. The data is available on GEO GSE160585. The files we will need to create the fully processed Seurat object include:
- Metadata csv file
- Counts matrix
- List of features (genes)
- List of cell barcodes
We have an entire workshop of materials that go through the whole process step-by-step on how to generate a similarly fully annotated, filtered dataset from single-cell RNA-seq data. In this lesson we provide mostly code, so you can reproduce the object yourself. If you want an in-depth explanation of each step we encourage you to peruse the materials linked above.
Pre-processing steps
- Download and unzip the dataset from GEO using bash:
- Data wrangling of the metadata
library(tidyverse)
<- read.csv("data/meta.csv", row.names=1)
meta
# Celltype IDs have are formatted like:
# {celltype}_{cluster}
# Removing the underscore
$celltype <- sub("_.*", "", meta$cluster_id)
meta<- select(meta, -c(cluster_id))
meta
# The following columns in the metadata have duplicate values:
# - nCount_RNA = nUMI
# - nFeature_RNA = nGene
<- select(meta, -c(nUMI, nGene))
meta
# Rename columns for more clarity
<- meta %>%
meta rename(c("orig.ident"="sample", "sample"="condition"))
# Removing cluster resolutions that will not be used
<- c(
cols "integrated_snn_res.0.1",
"integrated_snn_res.0.4",
"integrated_snn_res.0.6",
"integrated_snn_res.0.8",
"integrated_snn_res.1",
# "integrated_snn_res.1.2",
"integrated_snn_res.1.4",
"integrated_snn_res.1.8",
"seurat_clusters"
)
<- meta %>% select(-c(cols))
meta
# Store clusters IDs as factors
$seurat_clusters <- meta$integrated_snn_res.1.2
meta<- sort(as.integer(unique(meta$seurat_clusters)))
sorted_cluster $seurat_clusters <- factor(meta$seurat_clusters, levels=sorted_cluster) meta
- Generate Seurat object using downloaded files as input
library(Seurat)
library(Matrix)
set.seed(1454944673L) # Using the same seed used in the paper
# Load metadata, barcodes, features, and matrix into R
<- read.csv("data/filtered_counts/barcodes_filtered_raw_counts_for_pseudotime_and_pseudobulk_DGE.tsv", header=FALSE)
barcodes <- read.csv("data/filtered_counts/genes_filtered_raw_counts_for_pseudotime_and_pseudobulk_DGE.tsv", header=FALSE)
features <- readMM("data/filtered_counts/filtered_raw_counts_for_pseudotime_and_pseudobulk_DGE.mtx")
counts
# Add gene and cell barcode information to count matrix
row.names(counts) <- features$V1
colnames(counts) <- barcodes$V1
# Create seurat object
<- CreateSeuratObject(
seurat
counts, project = "GSE160585",
assay = "RNA",
meta.data = meta)
In the next few steps we have provided the code to process the Seurat object. The parameters for each step were chosen based upon the descriptions provided in the Methods section of the paper.
- Log-normalization and highly variable genes
# Log normalization
<- NormalizeData(seurat)
seurat
# Identify the most variable genes
<- FindVariableFeatures(seurat,
seurat selection.method = "vst",
nfeatures = 3000,
verbose = FALSE)
- SCTransform and regress out cell cycle scores
# Split seurat object by sample
<- SplitObject(seurat, split.by = "condition")
split_seurat
# Allow R to use more memory
options(future.globals.maxSize = 4000 * 1024^2)
# Run SCTranform on each sample individually
for (i in 1:length(split_seurat)) {
# Regress out cell cycle scores
<- SCTransform(split_seurat[[i]],
split_seurat[[i]] vars.to.regress = c("S.Score", "G2M.Score"),
vst.flavor = "v2",
variable.features.n = 3000)
}
- CCA integration
# Select the most variable features to use for integration
<- SelectIntegrationFeatures(object.list = split_seurat,
integ_features nfeatures = 3000)
# Prepare the SCT list object for integration
<- PrepSCTIntegration(object.list = split_seurat,
split_seurat anchor.features = integ_features)
# Find best buddies - can take a while to run
<- FindIntegrationAnchors(object.list = split_seurat,
integ_anchors normalization.method = "SCT",
anchor.features = integ_features)
# Integrate across conditions
<- IntegrateData(anchorset = integ_anchors,
seurat_integrated normalization.method = "SCT")
# Rejoin the layers in the RNA assay that we split earlier
"RNA"]] <- JoinLayers(seurat_integrated[["RNA"]]) seurat_integrated[[
- PCA, nearest neighbors, UMAP
<- RunPCA(seurat_integrated, verbose = FALSE)
seurat_integrated <- RunUMAP(seurat_integrated, dims = 1:50)
seurat_integrated <- FindNeighbors(seurat_integrated, dims = 1:50) seurat_integrated
- Save seurat object
Idents(seurat_integrated) <- "condition"
DefaultAssay(seurat_integrated) <- "RNA"
saveRDS(seurat_integrated, "data/BAT_GSE160585.rds")