Compare commits
1 Commits
master
...
67afc86242
Author | SHA1 | Date | |
---|---|---|---|
67afc86242 |
@ -1,2 +0,0 @@
|
||||
^.*\.Rproj$
|
||||
^\.Rproj\.user$
|
4
.gitignore
vendored
4
.gitignore
vendored
@ -1,4 +0,0 @@
|
||||
.Rproj.user
|
||||
.Rhistory
|
||||
.RData
|
||||
.Ruserdata
|
Binary file not shown.
Before Width: | Height: | Size: 96 KiB |
Binary file not shown.
Before Width: | Height: | Size: 98 KiB |
192
R/FastCAR_Base.R
192
R/FastCAR_Base.R
@ -7,15 +7,11 @@
|
||||
# empty droplets and removing per gene the highest value found in the ambient
|
||||
# RNA if it's found to contaminate more than a certain threshold of droplets
|
||||
###############################################################################
|
||||
# TODO
|
||||
#
|
||||
# This file contains base functions
|
||||
###############################################################################
|
||||
library(Matrix)
|
||||
library(Seurat)
|
||||
library(qlcMatrix)
|
||||
library(pheatmap)
|
||||
library(ggplot2)
|
||||
library(gridExtra)
|
||||
|
||||
###############################################################################
|
||||
|
||||
@ -26,21 +22,13 @@ library(gridExtra)
|
||||
# p: the where in 'i' and 'x' a new column starts
|
||||
# Dimnames: The names of the rows and columns
|
||||
remove.background = function(geneCellMatrix, ambientRNAprofile){
|
||||
# do some input checks first
|
||||
if(!("dgCMatrix" %in% class(geneCellMatrix))){
|
||||
cat("geneCellMatrix should be a sparse matrix of the \"dgCMatrix\" class")
|
||||
stop()
|
||||
}
|
||||
|
||||
|
||||
# Here is the actual functionality
|
||||
for(gene in names(ambientRNAprofile[ambientRNAprofile > 0])){
|
||||
for(gene in names(ambientProfile[ambientProfile > 0])){
|
||||
# Determine the locations where the gene is not zero, therefore referenced in i
|
||||
iLocs = which(geneCellMatrix@Dimnames[[1]] == gene)
|
||||
# Determine the location of the actual values,
|
||||
xLocs = which(geneCellMatrix@i == iLocs-1) # -1 because of 0 and 1 based counting systems
|
||||
# Remove the contaminating RNA
|
||||
geneCellMatrix@x[xLocs] = geneCellMatrix@x[xLocs] - ambientRNAprofile[gene]
|
||||
geneCellMatrix@x[xLocs] = geneCellMatrix@x[xLocs] - ambientProfile[gene]
|
||||
}
|
||||
# correct for instances where the expression was corrected to below zero
|
||||
geneCellMatrix@x[geneCellMatrix@x < 0] = 0
|
||||
@ -49,16 +37,17 @@ remove.background = function(geneCellMatrix, ambientRNAprofile){
|
||||
}
|
||||
##############
|
||||
|
||||
determine.background.to.remove = function(fullCellMatrix, emptyDropletCutoff, contaminationChanceCutoff){
|
||||
determine.background.to.remove = function(fullCellMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff){
|
||||
|
||||
# determines the highest expression value found for every gene in the droplets that we're sure don't contain cells
|
||||
backGroundMax = as.vector(qlcMatrix::rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
|
||||
backGroundMax = as.vector(rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
|
||||
names(backGroundMax) = rownames(fullCellMatrix)
|
||||
nCell = ncol(cellMatrix)
|
||||
|
||||
# droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
|
||||
nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyDropletCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
|
||||
# rowSum on a logical statement returns the number of TRUE occurences
|
||||
occurences = Matrix::rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff] !=0)
|
||||
occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff] !=0)
|
||||
|
||||
#probability of a background read of a gene ending up in a cell
|
||||
probabiltyCellContaminationPerGene = occurences / nEmpty
|
||||
@ -82,127 +71,36 @@ read.full.matrix = function(fullFolderLocation){
|
||||
return(fullMatrix)
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
getExpressionThreshold = function(gene, expMat, percentile){
|
||||
orderedExpression = expMat[gene, order(expMat[gene,], decreasing = TRUE)]
|
||||
CS = cumsum(orderedExpression)
|
||||
return(orderedExpression[which(CS/max(CS) > percentile)[1]])
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
celltypeSpecificityScore = function(gene, expMat){
|
||||
CS = cumsum(expMat[gene, order(expMat[gene,], decreasing = TRUE)])
|
||||
return((sum(CS/max(CS))/ncol(expMat)) )
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
describe.correction.effect = function (allExpression, cellExpression, startPos, stopPos, byLength, contaminationChanceCutoff){
|
||||
|
||||
# Make somewhere to store all the data that needs to be returned to the user
|
||||
ambientScoreProfileOverview = data.frame(row.names = rownames(cellExpression))
|
||||
|
||||
# do a quick first run to see which genes get corrected at the highest setting
|
||||
ambientProfile = determine.background.to.remove(allExpression, cellExpression, stopPos, contaminationChanceCutoff)
|
||||
genelist = names(ambientProfile[ambientProfile > 0])
|
||||
|
||||
print(paste0("Calculating cell expression score for ", length(genelist), " genes"))
|
||||
ctsScores = vector(mode = "numeric", length = nrow(cellExpression))
|
||||
names(ctsScores) = rownames(cellExpression)
|
||||
|
||||
for(gene in genelist){
|
||||
ctsScores[gene] = celltypeSpecificityScore(gene, cellExpression)
|
||||
}
|
||||
|
||||
# loop over every threshold to test
|
||||
# Starts at the highest value so
|
||||
for(emptyDropletCutoff in seq(from = startPos, to = stopPos, by = byLength)){
|
||||
ambientProfile = determine.background.to.remove(allExpression, cellExpression, emptyDropletCutoff, contaminationChanceCutoff)
|
||||
|
||||
print(paste0("Profiling at cutoff ", emptyDropletCutoff))
|
||||
|
||||
ambientScoreProfile = data.frame(ambientProfile, ctsScores)
|
||||
#ambientScoreProfile = ambientScoreProfile[ambientScoreProfile$ctsScores > 0.85, ]
|
||||
ambientScoreProfile$stillOverAmbient = 0
|
||||
ambientScoreProfile$belowCellexpression = 0
|
||||
|
||||
genesToCheck = names(ambientProfile[ambientProfile > 0])
|
||||
if(exists("overAmbientGenes")){
|
||||
genesToCheck = overAmbientGenes
|
||||
}
|
||||
|
||||
print(paste0("Calculating profiles for ", length(genesToCheck), " genes"))
|
||||
|
||||
for(gene in genesToCheck){
|
||||
expThresh = getExpressionThreshold(gene, cellExpression, 0.95)
|
||||
|
||||
if(emptyDropletCutoff == startPos){
|
||||
ambientScoreProfile[gene, "belowCellexpression"] = table(cellExpression[gene,] > 0 & cellExpression[gene,] < expThresh)["TRUE"]
|
||||
}
|
||||
ambientScoreProfile[gene, "stillOverAmbient"] = table(cellExpression[gene,] > ambientScoreProfile[gene, "ambientProfile"] & cellExpression[gene,] < expThresh)["TRUE"]
|
||||
}
|
||||
|
||||
ambientScoreProfile[is.na(ambientScoreProfile)] = 0
|
||||
ambientScoreProfile$contaminationChance = ambientScoreProfile$stillOverAmbient / ambientScoreProfile$belowCellexpression
|
||||
ambientScoreProfile[is.na(ambientScoreProfile)] = 0
|
||||
|
||||
# Genes that have already been completely removed don't need to be checked at higher resolution
|
||||
overAmbientGenes = rownames(ambientScoreProfile[ambientScoreProfile$stillOverAmbient > 0,])
|
||||
|
||||
ambientScoreProfile[genelist,"AmbientCorrection"]
|
||||
|
||||
ambientScoreProfileOverview[names(ctsScores), "ctsScores"] = ctsScores
|
||||
if(emptyDropletCutoff == startPos){
|
||||
ambientScoreProfileOverview[rownames(ambientScoreProfile), "belowCellexpression"] = ambientScoreProfile$belowCellexpression
|
||||
}
|
||||
ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("stillOverAmbient", as.character(emptyDropletCutoff))] = ambientScoreProfile$stillOverAmbient
|
||||
ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("AmbientCorrection", as.character(emptyDropletCutoff))] = ambientScoreProfile$ambientProfile
|
||||
}
|
||||
ambientScoreProfileOverview = ambientScoreProfileOverview[!is.na(ambientScoreProfileOverview$ctsScores),]
|
||||
ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
|
||||
|
||||
ambientScoreProfileOverview[,paste0("Threshold_", seq(from = startPos, to = stopPos, by = byLength))] = ambientScoreProfileOverview[,paste0("stillOverAmbient", as.character(seq(from = startPos, to = stopPos, by = byLength)))] / ambientScoreProfileOverview$belowCellexpression
|
||||
ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
|
||||
|
||||
ambientScoreProfileOverview[,paste0("contaminationChance", as.character(seq(from = startPos, to = stopPos, by = byLength)))] = ambientScoreProfileOverview[,paste0("stillOverAmbient", as.character(seq(from = startPos, to = stopPos, by = byLength)))] / ambientScoreProfileOverview$belowCellexpression
|
||||
|
||||
return(ambientScoreProfileOverview)
|
||||
}
|
||||
|
||||
|
||||
##############
|
||||
# Turns out that cellranger output looks different from WriteMM output and Read10X can't read the latter
|
||||
# TODO
|
||||
# Commented out until I find a fix
|
||||
|
||||
# write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
|
||||
# dir.create(folderLocation)
|
||||
#
|
||||
# writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
|
||||
#
|
||||
# # save genes and cells names
|
||||
# write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
|
||||
# write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
|
||||
#
|
||||
# correctedSignal = correctedSignal[correctedSignal > 0]
|
||||
# write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
|
||||
#
|
||||
# }
|
||||
write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
|
||||
dir.create(folderLocation)
|
||||
|
||||
writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
|
||||
|
||||
# save genes and cells names
|
||||
write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
|
||||
write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
|
||||
|
||||
correctedSignal = correctedSignal[correctedSignal > 0]
|
||||
write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
|
||||
|
||||
}
|
||||
|
||||
# describe the number of genes identified in the background
|
||||
# and the number of genes failing the contamination chance threshold
|
||||
# and the number of genes failing the contaminiation chance threshold
|
||||
#
|
||||
describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){
|
||||
cutoffValue = seq(start, stop, by)
|
||||
genesInBackground = vector(mode = "numeric", length = length(seq(start, stop, by)))
|
||||
genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
|
||||
nEmptyDroplets = vector(mode = "numeric", length = length(seq(start, stop, by)))
|
||||
|
||||
ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating, cutoffValue)
|
||||
ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating)
|
||||
rownames(ambientDescriptions) = seq(start, stop, by)
|
||||
for(emptyCutoff in seq(start, stop, by)){
|
||||
nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
|
||||
|
||||
occurences = Matrix::rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)
|
||||
occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)
|
||||
|
||||
#probability of a background read of a gene ending up in a cell
|
||||
probabiltyCellContaminationPerGene = occurences / nEmpty
|
||||
@ -215,39 +113,23 @@ describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contam
|
||||
}
|
||||
|
||||
|
||||
plot.correction.effect.chance = function(correctionProfile){
|
||||
pheatmap(correctionProfile[correctionProfile[,3] > 0, colnames(correctionProfile)[grep("contaminationChance", colnames(correctionProfile))]],
|
||||
cluster_cols = FALSE,
|
||||
treeheight_row = 0,
|
||||
main = "Chance of affecting DE analyses")
|
||||
}
|
||||
|
||||
plot.correction.effect.removal = function(correctionProfile){
|
||||
pheatmap(correctionProfile[(correctionProfile[,3] > 0) ,colnames(correctionProfile)[grep("AmbientCorrection", colnames(correctionProfile))]],
|
||||
cluster_cols = FALSE, treeheight_row = 0,
|
||||
main = "Counts removed from each cell")
|
||||
}
|
||||
|
||||
|
||||
plot.ambient.profile = function(ambientProfile){
|
||||
par(mfrow = c(3,1))
|
||||
plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
|
||||
main = "Total number of empty droplets at cutoffs",
|
||||
xlab = "empty droplet UMI cutoff",
|
||||
ylab = "Number of empty droplets")
|
||||
|
||||
p1 = ggplot(ambientProfile, aes(x=cutoffValue, y=genesInBackground)) + geom_point()
|
||||
plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
|
||||
main = "Number of genes in ambient RNA",
|
||||
xlab = "empty droplet UMI cutoff",
|
||||
ylab = "Genes in empty droplets")
|
||||
|
||||
|
||||
p2= ggplot(ambientProfile, aes(x=cutoffValue, y=genesContaminating)) + geom_point()
|
||||
|
||||
p3 = ggplot(ambientProfile, aes(x=cutoffValue, y=nEmptyDroplets)) + geom_point()
|
||||
|
||||
grid.arrange(p1, p2, p3, nrow = 3)
|
||||
|
||||
}
|
||||
|
||||
# I noticed that the number of genes removed tends to even out over time
|
||||
# Test whether the point where this first happens is a good empty cutoff point
|
||||
recommend.empty.cutoff = function(ambientProfile){
|
||||
highestNumberOfGenes = max(ambientProfile[,3])
|
||||
firstOccurence = match(highestNumberOfGenes, ambientProfile[,3])
|
||||
return(as.numeric(rownames(ambientProfile[firstOccurence,])))
|
||||
plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3],
|
||||
main = "number of genes to correct",
|
||||
xlab = "empty droplet UMI cutoff",
|
||||
ylab = "Genes identified as contamination")
|
||||
}
|
||||
|
||||
|
||||
@ -259,3 +141,5 @@ recommend.empty.cutoff = function(ambientProfile){
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
132
README.md
132
README.md
@ -2,10 +2,21 @@
|
||||
|
||||
FastCAR is an R package to remove ambient RNA from cells in droplet based single cell RNA sequencing data.
|
||||
|
||||
## Getting Started
|
||||
|
||||
### Installation
|
||||
These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system.
|
||||
|
||||
FastCAR can be installed from git with the following command.
|
||||
### Prerequisites
|
||||
|
||||
What things you need to install the software and how to install them
|
||||
|
||||
```
|
||||
Give examples
|
||||
```
|
||||
|
||||
### Installing
|
||||
|
||||
FastCAR can be install from git with the following command.
|
||||
|
||||
```
|
||||
devtools::install_git("https://git.web.rug.nl/P278949/FastCAR")
|
||||
@ -18,123 +29,76 @@ First load the library and dependencies.
|
||||
library(Matrix)
|
||||
library(Seurat)
|
||||
library(qlcMatrix)
|
||||
library(pheatmap)
|
||||
library(ggplot2)
|
||||
library(gridExtra)
|
||||
```
|
||||
Specify the locations of the expression matrices
|
||||
library(FastCAR)
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
```
|
||||
|
||||
cellExpressionFolder = c("Cellranger_output/sample1/filtered_feature_bc_matrix/")
|
||||
fullMatrixFolder = c("Cellranger_output/sample1/raw_feature_bc_matrix/")
|
||||
|
||||
```
|
||||
Load both the cell matrix and the full matrix
|
||||
|
||||
|
||||
|
||||
```
|
||||
# This folder will contain the corrected cell matrix
|
||||
correctedMatrixFolder = c("Cellranger_output/sample1/corrected_feature_bc_matrix")
|
||||
|
||||
|
||||
cellMatrix = read.cell.matrix(cellExpressionFolder)
|
||||
fullMatrix = read.full.matrix(fullMatrixFolder)
|
||||
```
|
||||
The following functions give an idea of the effect that different settings have on the ambient RNA profile.
|
||||
These are optional as they do take a few minutes and the default settings work fine
|
||||
Plotting the number of empty droplets, the number of genes identified in the ambient RNA, and the number of genes that will be corrected for at different UMI cutoffs,
|
||||
|
||||
```
|
||||
ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix,
|
||||
start = 10,
|
||||
stop = 500,
|
||||
by = 10,
|
||||
contaminationChanceCutoff = 0.05)
|
||||
|
||||
|
||||
The following functions give an idea of the effect that different settings have on the ambient RNA profile
|
||||
|
||||
```
|
||||
ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix, start = 10, stop = 500, by = 10, contaminationChanceCutoff = 0.05)
|
||||
plot.ambient.profile(ambProfile)
|
||||
```
|
||||

|
||||
|
||||
|
||||
The actual effect on the chances of genes affecting your DE analyses can be determined and visualized with the following function
|
||||
|
||||
```
|
||||
|
||||
correctionEffectProfile = describe.correction.effect(allExpression, cellExpression, 50, 500, 10, 0.05)
|
||||
|
||||
plot.correction.effect.chance(correctionEffectProfile)
|
||||
|
||||
```
|
||||
|
||||

|
||||

|
||||
|
||||
|
||||
|
||||
|
||||
How many reads will be removed of these genes can be visualized from the same profile
|
||||
```
|
||||
|
||||
plot.correction.effect.removal(correctionEffectProfile)
|
||||
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
Set the empty droplet cutoff and the contamination chance cutoff
|
||||
|
||||
The empty droplet cutoff is the number of UMIs a droplet can contain at the most to be considered empty.
|
||||
100 works fine but we tested this method in only one tissue. For other tissues these settings may need to be changed.
|
||||
Increasing this number also increases the highest possible value of expression of a given gene.
|
||||
As the correction will remove this value from every cell it is adviced not to set this too high and thereby overcorrect the expression in lowly expressing cells.
|
||||
|
||||
The contamination chance cutoff is the allowed probability of a gene contaminating a cell.
|
||||
As we developed FastCAR specifically for differential expression analyses between groups we suggest setting this such that not enough cells could be contaminated to affect this.
|
||||
In a cluster of a thousand cells divided into two groups there would be 2-3 cells per group with ambient RNA contamination of any given gene.
|
||||
Such low cell numbers are disregarded for differential expression analyses.
|
||||
|
||||
There is an experimental function that gives a recommendation based on the ambient profiling results.
|
||||
This selects the first instance of the maximum number of genes being corrected for.
|
||||
I have no idea yet if this is actually a good idea.
|
||||
Set
|
||||
|
||||
```
|
||||
emptyDropletCutoff = recommend.empty.cutoff(ambProfile)
|
||||
```
|
||||
|
||||
emptyDropletCutoff = 100
|
||||
contaminationChanceCutoff = 0.05
|
||||
|
||||
```
|
||||
emptyDropletCutoff = 150
|
||||
contaminationChanceCutoff = 0.005
|
||||
```
|
||||
|
||||
Determine the ambient RNA profile and remove the ambient RNA from each cell
|
||||
```
|
||||
ambientProfile = determine.background.to.remove(fullMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff)
|
||||
cellMatrix = remove.background(cellMatrix, ambientProfile)
|
||||
```
|
||||
|
||||
This corrected matrix can be used to to make a Seurat object
|
||||
|
||||
```
|
||||
seuratObject = CreateSeuratObject(cellMatrix)
|
||||
|
||||
Finally write the corrected cell/gene matrix to a file, this matrix can be used in Seurat the same way as any other cell/gene matrix.
|
||||
|
||||
```
|
||||
|
||||
write.corrected.matrix(cellMatrix, correctedMatrixFolder, ambientProfile)
|
||||
|
||||
```
|
||||
|
||||
End with an example of getting some data out of the system or using it for a little demo
|
||||
|
||||
## Running the tests
|
||||
|
||||
|
||||
## Authors
|
||||
|
||||
* **Marijn Berg** - m.berg@umcg.nl
|
||||
* **Marijn Berg** - *Initial work*
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under the GPL-3 License - see the [LICENSE.md](LICENSE.md) file for details
|
||||
|
||||
## Changelog
|
||||
|
||||
### v0.1
|
||||
First fully working version of the R package
|
||||
|
||||
### v0.2
|
||||
Fixed function to write the corrected matrix to file.
|
||||
Added readout of which genes will be corrected for and how many reads will be removed per cell
|
||||
Added some input checks to functions
|
||||
|
||||
### v0.2
|
||||
Fixed a bug that caused FastCAR to be incompatible with biobase libraries
|
||||
Added better profiling to determine the effect of different settings on the corrections
|
||||
Swapped base R plots for ggplot2 plots
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user