Made FastCAR into an actual R package, removed testing and running scripts
This commit is contained in:
parent
c3d24a4302
commit
c09281b51b
1
FastCAR
Submodule
1
FastCAR
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 2121cbd359630af53c002dd3e082621010111346
|
@ -11,6 +11,7 @@
|
|||||||
###############################################################################
|
###############################################################################
|
||||||
library(Matrix)
|
library(Matrix)
|
||||||
library(Seurat)
|
library(Seurat)
|
||||||
|
library(qlcMatrix)
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
||||||
# had to make this function to efficiently modify sparse matrices on a per gene basis
|
# had to make this function to efficiently modify sparse matrices on a per gene basis
|
||||||
|
@ -1,41 +0,0 @@
|
|||||||
###############################################################################
|
|
||||||
# FastCAR package
|
|
||||||
# Marijn Berg
|
|
||||||
# m.berg@umcg.nl
|
|
||||||
###############################################################################
|
|
||||||
# FastCAR removes ambient RNA from 10X data by looking at the profile of the
|
|
||||||
# empty droplets and removing per gene the highest value found in the ambient
|
|
||||||
# RNA if it's found to contaminate more than a certain threshold of droplets
|
|
||||||
###############################################################################
|
|
||||||
|
|
||||||
# This script runs a simple correction with standard settings
|
|
||||||
library(Matrix)
|
|
||||||
library(Seurat)
|
|
||||||
library(qlcMatrix)
|
|
||||||
source("R/FastCAR_Base.R")
|
|
||||||
|
|
||||||
emptyDropletCutoff = 100 # Droplets with fewer than this number of UMIs are considered empty
|
|
||||||
contaminationChanceCutoff = 0.05 # This is the maximum fraction of droplets containing the contamination
|
|
||||||
|
|
||||||
cellExpressionFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/filtered_feature_bc_matrix/")
|
|
||||||
fullMatrixFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/raw_feature_bc_matrix/")
|
|
||||||
|
|
||||||
# This folder will contain the corrected cell matrix
|
|
||||||
correctedMatrixFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/corrected_feature_bc_matrix")
|
|
||||||
|
|
||||||
# these are literally wrappers for the Seurat functions but it's good practice in case the function changes later
|
|
||||||
cellMatrix = read.cell.matrix(cellExpressionFolder)
|
|
||||||
fullMatrix = read.full.matrix(fullMatrixFolder)
|
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
# This is an optional function that will show the effects of different cutoff values
|
|
||||||
# start, stop, and by all refer to number of UMI, it determines the background ate steps of by, from start to stop
|
|
||||||
ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix, start = 10, stop = 500, by = 10, contaminationChanceCutoff = 0.05)
|
|
||||||
plot.ambient.profile(ambProfile)
|
|
||||||
###############################################################################
|
|
||||||
|
|
||||||
ambientProfile = determine.background.to.remove(fullMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff)
|
|
||||||
|
|
||||||
cellMatrix = remove.background(cellMatrix, ambientProfile)
|
|
||||||
|
|
||||||
write.corrected.matrix(cellMatrix, correctedMatrixFolder, ambientProfile)
|
|
@ -1,65 +0,0 @@
|
|||||||
###############################################################################
|
|
||||||
# FastCAR package
|
|
||||||
# Marijn Berg
|
|
||||||
# m.berg@umcg.nl
|
|
||||||
###############################################################################
|
|
||||||
# FastCAR removes ambient RNA from 10X data by looking at the profile of the
|
|
||||||
# empty droplets and removing per gene the highest value found in the ambient
|
|
||||||
# RNA if it's found to contaminate more than a certain threshold of droplets
|
|
||||||
###############################################################################
|
|
||||||
# This script contains functions to profile the Ambient RNA to suggest
|
|
||||||
# good settings to use to find genes to correct for
|
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
library(Matrix)
|
|
||||||
library(Seurat)
|
|
||||||
library(qlcMatrix)
|
|
||||||
###############################################################################
|
|
||||||
|
|
||||||
# describe the number of genes identified in the background
|
|
||||||
# and the number of genes failing the contaminiation chance threshold
|
|
||||||
#
|
|
||||||
describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){
|
|
||||||
genesInBackground = vector(mode = "numeric", length = length(seq(start, stop, by)))
|
|
||||||
genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
|
|
||||||
nEmptyDroplets = vector(mode = "numeric", length = length(seq(start, stop, by)))
|
|
||||||
|
|
||||||
ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating)
|
|
||||||
rownames(ambientDescriptions) = seq(start, stop, by)
|
|
||||||
for(emptyCutoff in seq(start, stop, by)){
|
|
||||||
nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
|
|
||||||
|
|
||||||
occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)
|
|
||||||
|
|
||||||
#probability of a background read of a gene ending up in a cell
|
|
||||||
probabiltyCellContaminationPerGene = occurences / nEmpty
|
|
||||||
nFailingThreshold = sum(probabiltyCellContaminationPerGene > contaminationChanceCutoff)
|
|
||||||
|
|
||||||
nGenes = sum(occurences != 0)
|
|
||||||
ambientDescriptions[as.character(emptyCutoff), c(1,2,3)] = c(nEmpty ,nGenes, nFailingThreshold)
|
|
||||||
}
|
|
||||||
return(ambientDescriptions)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
plot.ambient.profile = function(ambientProfile){
|
|
||||||
par(mfrow = c(3,1))
|
|
||||||
plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
|
|
||||||
main = "Total number of empty droplets at cutoffs",
|
|
||||||
xlab = "empty droplet UMI cutoff",
|
|
||||||
ylab = "Number of empty droplets")
|
|
||||||
|
|
||||||
plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
|
|
||||||
main = "Number of genes in ambient RNA",
|
|
||||||
xlab = "empty droplet UMI cutoff",
|
|
||||||
ylab = "Genes in empty droplets")
|
|
||||||
|
|
||||||
plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3],
|
|
||||||
main = "number of genes to correct",
|
|
||||||
xlab = "empty droplet UMI cutoff",
|
|
||||||
ylab = "Genes identified as contamination")
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
102
R/tmp.R
102
R/tmp.R
@ -1,102 +0,0 @@
|
|||||||
cutoffEmpty = 100 # set this to NA to determine dynamicallly
|
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
|
|
||||||
# remove this or things will keep getting added to it
|
|
||||||
rm(biopsySeurat, backGroundOverviews)
|
|
||||||
|
|
||||||
for(folder in dataFolders[1]){
|
|
||||||
gc()
|
|
||||||
donor = str_sub(folder, -7,-1)
|
|
||||||
sample = str_sub(folder, 19)
|
|
||||||
sample = str_sub(sample, 1, -9)
|
|
||||||
|
|
||||||
allExpression = Read10X(paste(folder, "/raw_feature_bc_matrix", sep = ""))
|
|
||||||
colnames(allExpression) = paste(datasetInfo[sample, "prefix"], colnames(allExpression), sep = "")
|
|
||||||
# change barcode ids to match r object
|
|
||||||
|
|
||||||
if(is.na(cutoffEmpty)){
|
|
||||||
##########################################
|
|
||||||
# determine the number of genes in the empty droplets at various cutoffs
|
|
||||||
|
|
||||||
foundNumberOfGenes = vector(mode = "numeric", length = length(seq(kneePointStart, kneePointStop, kneePointSteps)))
|
|
||||||
names(foundNumberOfGenes) = as.character(seq(kneePointStart, kneePointStop, kneePointSteps))
|
|
||||||
|
|
||||||
for(cutoffValue in seq(kneePointStart, kneePointStop, kneePointSteps)){
|
|
||||||
nExpressedCells = Matrix::rowSums(allExpression[, Matrix::colSums(allExpression) < cutoffValue])
|
|
||||||
nExpressedCells[nExpressedCells > 0] = 1
|
|
||||||
foundNumberOfGenes[as.character(cutoffValue)]= sum(nExpressedCells)
|
|
||||||
}
|
|
||||||
write.table(foundNumberOfGenes, file = paste("Foundgenes_cutoff_", donor, ".csv"))
|
|
||||||
##########################################
|
|
||||||
# Find the point where increase in the number of genes decreases the most
|
|
||||||
|
|
||||||
cutoffEmpty = which(diff(foundNumberOfGenes) == max(diff(foundNumberOfGenes)[find_peaks(diff(foundNumberOfGenes))]))
|
|
||||||
|
|
||||||
# cutoffEmpty = as.numeric(names(which(diff(foundNumberOfGenes) == min(diff(foundNumberOfGenes)))))[1] # have to add this at the end, multiple changes can be the same value
|
|
||||||
|
|
||||||
##########################################
|
|
||||||
png(paste("Empty_droplets_content_", donor, ".png", sep = ""), height = 400, width = 800)
|
|
||||||
plot(as.numeric(names(foundNumberOfGenes)), foundNumberOfGenes, main = paste(donor, " unique genes per cutoff"), xlab = "cutoff <")
|
|
||||||
segments(cutoffEmpty, 0 ,cutoffEmpty, 25000)
|
|
||||||
dev.off()
|
|
||||||
}
|
|
||||||
|
|
||||||
# read this in as a full matrix, haven't figgered out how to do this on a sparse matrix
|
|
||||||
cellExpression = allExpression[,cellIDs[cellIDs %in% colnames(allExpression)]]
|
|
||||||
|
|
||||||
if(!exists("biopsySeurat")){
|
|
||||||
biopsySeurat <<- CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy")
|
|
||||||
biopsySeurat[["orig.ident"]] = donor
|
|
||||||
biopsySeurat[["percent.mt"]] <- PercentageFeatureSet(biopsySeurat, pattern = "^MT-")
|
|
||||||
# biopsySeurat <- subset(biopsySeurat, subset = percent.mt < quantile(biopsySeurat@meta.data$percent.mt, c(.9)) )
|
|
||||||
}else{
|
|
||||||
tmpbiopsySeurat = CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy")
|
|
||||||
tmpbiopsySeurat[["orig.ident"]] = donor
|
|
||||||
|
|
||||||
tmpbiopsySeurat[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeurat, pattern = "^MT-")
|
|
||||||
# tmpbiopsySeurat <- subset(tmpbiopsySeurat, subset = percent.mt < quantile(tmpbiopsySeurat@meta.data$percent.mt, c(.9), na.rm = TRUE))
|
|
||||||
|
|
||||||
biopsySeurat = merge(biopsySeurat, tmpbiopsySeurat)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
backgroundTotal = Matrix::rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty])
|
|
||||||
backGroundMax = as.vector(rowMax(allExpression[names(backgroundTotal),Matrix::colSums(allExpression) < cutoffEmpty]))
|
|
||||||
|
|
||||||
nCell = ncol(cellExpression)
|
|
||||||
# droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
|
|
||||||
nEmpty = table((Matrix::colSums(allExpression) < cutoffEmpty) &(Matrix::colSums(allExpression) > 0))[2]
|
|
||||||
occurences = rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty] !=0)
|
|
||||||
|
|
||||||
#probability of a background read of a gene ending up in a cell
|
|
||||||
pCell = occurences / nEmpty
|
|
||||||
|
|
||||||
write.csv(pCell, file = paste("pCell_", donor, ".csv", sep = ""))
|
|
||||||
write.csv(backGroundMax, file = paste("bgMax_", donor, ".csv", sep = ""))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# set the background value for those genes that are unlikely to be a significant contaminant to 0 (zero)
|
|
||||||
backGroundMax[pCell < acceptableFractionContaminated] = 0
|
|
||||||
|
|
||||||
# remove the background
|
|
||||||
cellExpression = apply(cellExpression , 2, '-', backGroundMax)
|
|
||||||
cellExpression[cellExpression < 0] = 0
|
|
||||||
# cellExpression = as.sparse(cellExpression)
|
|
||||||
|
|
||||||
if(!exists("biopsySeuratAC")){
|
|
||||||
biopsySeuratAC <<- CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy")
|
|
||||||
biopsySeuratAC[["orig.ident"]] = donor
|
|
||||||
biopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(biopsySeuratAC, pattern = "^MT-")
|
|
||||||
# biopsySeuratAC <- subset(biopsySeuratAC, subset = percent.mt < quantile(biopsySeuratAC@meta.data$percent.mt, c(.9)) )
|
|
||||||
}else{
|
|
||||||
tmpbiopsySeuratAC = CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy")
|
|
||||||
tmpbiopsySeuratAC[["orig.ident"]] = donor
|
|
||||||
|
|
||||||
tmpbiopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeuratAC, pattern = "^MT-")
|
|
||||||
# tmpbiopsySeuratAC <- subset(tmpbiopsySeuratAC, subset = percent.mt < quantile(tmpbiopsySeuratAC@meta.data$percent.mt, c(.9), na.rm = TRUE))
|
|
||||||
|
|
||||||
biopsySeuratAC = merge(biopsySeuratAC, tmpbiopsySeuratAC)
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue
Block a user