Added changes to readme

Swapped base R plots for ggplot
Fixed bug that caused incompatibility with biobase due to having the same function name.
2021-11-01 16:41:25 +01:00 · 2021-11-01 14:42:49 +01:00 · 2021-11-01 14:25:43 +01:00 · 2021-10-05 15:00:42 +02:00 · 2020-06-08 10:15:27 +02:00 · 2020-04-27 15:50:48 +02:00
10 changed files with 320 additions and 41 deletions
--- a/.Rbuildignore
+++ b/.Rbuildignore
@ -0,0 +1,2 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
 .Rproj.user
 .Rhistory
 .RData
 .Ruserdata
--- a/1
+++ b/1
--- a/FastCAR.Rproj
+++ b/FastCAR.Rproj
@ -0,0 +1,20 @@
 Version: 1.0
 RestoreWorkspace: Default
 SaveWorkspace: Default
 AlwaysSaveHistory: Default
 EnableCodeIndexing: Yes
 UseSpacesForTab: Yes
 NumSpacesForTab: 2
 Encoding: UTF-8
 RnwWeave: Sweave
 LaTeX: pdfLaTeX
 AutoAppendNewline: Yes
 StripTrailingWhitespace: Yes
 BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
--- a/Images/Counts_removed.png
+++ b/Images/Counts_removed.png
--- a/Images/DE_affect_chance.png
+++ b/Images/DE_affect_chance.png
--- a/Images/Example_profile.png
+++ b/Images/Example_profile.png
--- a/1
+++ b/1
@ -0,0 +1 @@
 exportPattern("^[[:alpha:]]+")
--- a/R/FastCAR_Base.R
+++ b/R/FastCAR_Base.R
@ -7,11 +7,15 @@
 # empty droplets and removing per gene the highest value found in the ambient
 # RNA if it's found to contaminate more than a certain threshold of droplets
 ###############################################################################
-# This file contains base functions
+# TODO
 #
 ###############################################################################
 library(Matrix)
 library(Seurat)
 library(qlcMatrix)
 library(pheatmap)
 library(ggplot2)
 library(gridExtra)
 ###############################################################################
@ -22,13 +26,21 @@ library(qlcMatrix)
 # p: the where in 'i' and 'x' a new column starts
 # Dimnames: The names of the rows and columns
 remove.background = function(geneCellMatrix, ambientRNAprofile){
-  for(gene in names(ambientProfile[ambientProfile > 0])){
+  # do some input checks first
  if(!("dgCMatrix" %in% class(geneCellMatrix))){
    cat("geneCellMatrix should be a sparse matrix of the \"dgCMatrix\" class")
    stop()
  }
  # Here is the actual functionality
  for(gene in names(ambientRNAprofile[ambientRNAprofile > 0])){
    # Determine the locations where the gene is not zero, therefore referenced in i
    iLocs = which(geneCellMatrix@Dimnames[[1]] == gene)
    # Determine the location of the actual values,
    xLocs = which(geneCellMatrix@i == iLocs-1) # -1 because of 0 and 1 based counting systems
    # Remove the contaminating RNA
-    geneCellMatrix@x[xLocs] = geneCellMatrix@x[xLocs] - ambientProfile[gene]
+    geneCellMatrix@x[xLocs] = geneCellMatrix@x[xLocs] - ambientRNAprofile[gene]
  }
  # correct for instances where the expression was corrected to below zero
  geneCellMatrix@x[geneCellMatrix@x < 0] = 0
@ -37,17 +49,16 @@ remove.background = function(geneCellMatrix, ambientRNAprofile){
 }
 ##############
-determine.background.to.remove = function(fullCellMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff){
+determine.background.to.remove = function(fullCellMatrix, emptyDropletCutoff, contaminationChanceCutoff){
  # determines the highest expression value found for every gene in the droplets that we're sure don't contain cells
-  backGroundMax   = as.vector(rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
+  backGroundMax   = as.vector(qlcMatrix::rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
  names(backGroundMax) = rownames(fullCellMatrix)
  nCell = ncol(cellMatrix)
  # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
  nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyDropletCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
  # rowSum on a logical statement returns the number of TRUE occurences
-  occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff] !=0)
+  occurences = Matrix::rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff] !=0)
  #probability of a background read of a gene ending up in a cell
  probabiltyCellContaminationPerGene = occurences / nEmpty
@ -71,36 +82,127 @@ read.full.matrix = function(fullFolderLocation){
  return(fullMatrix)
 }
-##############
+###############################################################################
-
+getExpressionThreshold = function(gene, expMat, percentile){
-write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
+  orderedExpression = expMat[gene, order(expMat[gene,], decreasing = TRUE)]
-  dir.create(folderLocation)
+  CS = cumsum(orderedExpression)
-
+  return(orderedExpression[which(CS/max(CS) > percentile)[1]])
  writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
  # save genes and cells names
  write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
  write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
  correctedSignal = correctedSignal[correctedSignal > 0]
  write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
 }
 ###############################################################################
 celltypeSpecificityScore = function(gene, expMat){
  CS = cumsum(expMat[gene, order(expMat[gene,], decreasing = TRUE)])
  return((sum(CS/max(CS))/ncol(expMat))  )
 }
 ###############################################################################
 describe.correction.effect = function (allExpression, cellExpression, startPos, stopPos, byLength, contaminationChanceCutoff){
  # Make somewhere to store all the data that needs to be returned to the user
  ambientScoreProfileOverview = data.frame(row.names = rownames(cellExpression))
  # do a quick first run to see which genes get corrected at the highest setting
  ambientProfile = determine.background.to.remove(allExpression, cellExpression, stopPos, contaminationChanceCutoff)
  genelist = names(ambientProfile[ambientProfile > 0])
  print(paste0("Calculating cell expression score for ", length(genelist), " genes"))
  ctsScores = vector(mode = "numeric", length = nrow(cellExpression))
  names(ctsScores) = rownames(cellExpression)
  for(gene in genelist){
    ctsScores[gene] = celltypeSpecificityScore(gene, cellExpression)
  }
  # loop over every threshold to test
  # Starts at the highest value so
  for(emptyDropletCutoff in seq(from = startPos, to = stopPos, by = byLength)){
    ambientProfile = determine.background.to.remove(allExpression, cellExpression, emptyDropletCutoff, contaminationChanceCutoff)
    print(paste0("Profiling at cutoff ", emptyDropletCutoff))
    ambientScoreProfile = data.frame(ambientProfile, ctsScores)
    #ambientScoreProfile = ambientScoreProfile[ambientScoreProfile$ctsScores > 0.85, ]
    ambientScoreProfile$stillOverAmbient = 0
    ambientScoreProfile$belowCellexpression = 0
    genesToCheck = names(ambientProfile[ambientProfile > 0])
    if(exists("overAmbientGenes")){
      genesToCheck = overAmbientGenes
    }
    print(paste0("Calculating profiles for ", length(genesToCheck), " genes"))
    for(gene in genesToCheck){
      expThresh = getExpressionThreshold(gene, cellExpression, 0.95)
      if(emptyDropletCutoff == startPos){
        ambientScoreProfile[gene, "belowCellexpression"] = table(cellExpression[gene,] > 0  & cellExpression[gene,] < expThresh)["TRUE"]
      }
      ambientScoreProfile[gene, "stillOverAmbient"] =  table(cellExpression[gene,] > ambientScoreProfile[gene, "ambientProfile"]  & cellExpression[gene,] < expThresh)["TRUE"]
    }
    ambientScoreProfile[is.na(ambientScoreProfile)] = 0
    ambientScoreProfile$contaminationChance = ambientScoreProfile$stillOverAmbient / ambientScoreProfile$belowCellexpression
    ambientScoreProfile[is.na(ambientScoreProfile)] = 0
    # Genes that have already been completely removed don't need to be checked at higher resolution
    overAmbientGenes = rownames(ambientScoreProfile[ambientScoreProfile$stillOverAmbient > 0,])
    ambientScoreProfile[genelist,"AmbientCorrection"]
    ambientScoreProfileOverview[names(ctsScores), "ctsScores"] = ctsScores
    if(emptyDropletCutoff == startPos){
      ambientScoreProfileOverview[rownames(ambientScoreProfile), "belowCellexpression"] = ambientScoreProfile$belowCellexpression
    }
    ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("stillOverAmbient", as.character(emptyDropletCutoff))] = ambientScoreProfile$stillOverAmbient
    ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("AmbientCorrection", as.character(emptyDropletCutoff))] = ambientScoreProfile$ambientProfile
  }
  ambientScoreProfileOverview = ambientScoreProfileOverview[!is.na(ambientScoreProfileOverview$ctsScores),]
  ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
  ambientScoreProfileOverview[,paste0("Threshold_", seq(from = startPos, to = stopPos, by = byLength))] = ambientScoreProfileOverview[,paste0("stillOverAmbient", as.character(seq(from = startPos, to = stopPos, by = byLength)))] / ambientScoreProfileOverview$belowCellexpression
  ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
  ambientScoreProfileOverview[,paste0("contaminationChance", as.character(seq(from = startPos, to = stopPos, by = byLength)))] = ambientScoreProfileOverview[,paste0("stillOverAmbient", as.character(seq(from = startPos, to = stopPos, by = byLength)))] / ambientScoreProfileOverview$belowCellexpression
  return(ambientScoreProfileOverview)
 }
 ##############
 # Turns out that cellranger output looks different from WriteMM output and Read10X can't read the latter
 # TODO
 # Commented out until I find a fix
 # write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
 #   dir.create(folderLocation)
 #
 #   writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
 #
 #   # save genes and cells names
 #   write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
 #   write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
 #
 #   correctedSignal = correctedSignal[correctedSignal > 0]
 #   write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
 #
 # }
 # describe the number of genes identified in the background
-# and the number of genes failing the contaminiation chance threshold
+# and the number of genes failing the contamination chance threshold
 #
 describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){
  cutoffValue = seq(start, stop, by)
  genesInBackground  = vector(mode = "numeric", length = length(seq(start, stop, by)))
  genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
  nEmptyDroplets     = vector(mode = "numeric", length = length(seq(start, stop, by)))
-  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating)
+  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating, cutoffValue)
  rownames(ambientDescriptions) = seq(start, stop, by)
  for(emptyCutoff in seq(start, stop, by)){
    nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
-    occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)
+    occurences = Matrix::rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)
    #probability of a background read of a gene ending up in a cell
    probabiltyCellContaminationPerGene = occurences / nEmpty
@ -113,23 +215,39 @@ describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contam
 }
 plot.correction.effect.chance = function(correctionProfile){
  pheatmap(correctionProfile[correctionProfile[,3] > 0, colnames(correctionProfile)[grep("contaminationChance", colnames(correctionProfile))]],
           cluster_cols = FALSE,
           treeheight_row = 0,
           main = "Chance of affecting DE analyses")
 }
 plot.correction.effect.removal = function(correctionProfile){
  pheatmap(correctionProfile[(correctionProfile[,3] > 0) ,colnames(correctionProfile)[grep("AmbientCorrection", colnames(correctionProfile))]],
           cluster_cols = FALSE, treeheight_row = 0,
           main = "Counts removed from each cell")
 }
 plot.ambient.profile = function(ambientProfile){
  par(mfrow = c(3,1))
  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
       main = "Total number of empty droplets at cutoffs",
       xlab = "empty droplet UMI cutoff",
       ylab = "Number of empty droplets")
-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
+  p1 = ggplot(ambientProfile, aes(x=cutoffValue, y=genesInBackground)) + geom_point()
       main = "Number of genes in ambient RNA",
       xlab = "empty droplet UMI cutoff",
       ylab = "Genes in empty droplets")
-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3],
+
-       main = "number of genes to correct",
+  p2= ggplot(ambientProfile, aes(x=cutoffValue, y=genesContaminating)) + geom_point()
-       xlab = "empty droplet UMI cutoff",
+
-       ylab = "Genes identified as contamination")
+  p3 = ggplot(ambientProfile, aes(x=cutoffValue, y=nEmptyDroplets)) + geom_point()
  grid.arrange(p1, p2, p3, nrow = 3)
 }
 # I noticed that the number of genes removed tends to even out over time
 # Test whether the point where this first happens is a good empty cutoff point
 recommend.empty.cutoff = function(ambientProfile){
  highestNumberOfGenes = max(ambientProfile[,3])
  firstOccurence = match(highestNumberOfGenes, ambientProfile[,3])
  return(as.numeric(rownames(ambientProfile[firstOccurence,])))
 }
@ -141,5 +259,3 @@ plot.ambient.profile = function(ambientProfile){
--- a/README.md
+++ b/README.md
@ -1,3 +1,140 @@
 # FastCAR
-Repo for the FastCAR (Fast Correction of Ambient RNA) R package and maybe  eventual python library
+FastCAR is an R package to remove ambient RNA from cells in droplet based single cell RNA sequencing data.
 ### Installation
 FastCAR can be installed from git with the following command.
 ```
 devtools::install_git("https://git.web.rug.nl/P278949/FastCAR")
 ```
 Running FastCAR is quite simple.
 First load the library and dependencies.
 ```
 library(Matrix)
 library(Seurat)
 library(qlcMatrix)
 library(pheatmap)
 library(ggplot2)
 library(gridExtra)
 ```
 Specify the locations of the expression matrices
 ```
 cellExpressionFolder  = c("Cellranger_output/sample1/filtered_feature_bc_matrix/")
 fullMatrixFolder      = c("Cellranger_output/sample1/raw_feature_bc_matrix/")
 ```
 Load both the cell matrix and the full matrix
 ```
 cellMatrix     = read.cell.matrix(cellExpressionFolder)
 fullMatrix     = read.full.matrix(fullMatrixFolder)
 ```
 The following functions give an idea of the effect that different settings have on the ambient RNA profile. 
 These are optional as they do take a few minutes and the default settings work fine
 Plotting the number of empty droplets, the number of genes identified in the ambient RNA, and the number of genes that will be corrected for at different UMI cutoffs,
 ```
 ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix, 
                                           start = 10, 
                                           stop = 500, 
                                           by = 10, 
                                           contaminationChanceCutoff = 0.05)
 plot.ambient.profile(ambProfile)
 ``` 
 ![picture](Images/Example_profile.png)
 The actual effect on the chances of genes affecting your DE analyses can be determined and visualized with the following function
 ``` 
  correctionEffectProfile = describe.correction.effect(allExpression, cellExpression, 50, 500, 10, 0.05)
  plot.correction.effect.chance(correctionEffectProfile)
 ```
 ![picture](Images/DE_affect_chance.png)
 How many reads will be removed of these genes can be visualized from the same profile
 ```
  plot.correction.effect.removal(correctionEffectProfile)
 ``` 
 ![picture](Images/Counts_removed.png)
 Set the empty droplet cutoff and the contamination chance cutoff
 The empty droplet cutoff is the number of UMIs a droplet can contain at the most to be considered empty.
 100 works fine but we tested this method in only one tissue. For other tissues these settings may need to be changed.
 Increasing this number also increases the highest possible value of expression of a given gene.
 As the correction will remove this value from every cell it is adviced not to set this too high and thereby overcorrect the expression in lowly expressing cells.
 The contamination chance cutoff is the allowed probability of a gene contaminating a cell. 
 As we developed FastCAR specifically for differential expression analyses between groups we suggest setting this such that not enough cells could be contaminated to affect this.
 In a cluster of a thousand cells divided into two groups there would be 2-3 cells per group with ambient RNA contamination of any given gene.
 Such low cell numbers are disregarded for differential expression analyses.
 There is an experimental function that gives a recommendation based on the ambient profiling results.
 This selects the first instance of the maximum number of genes being corrected for.
 I have no idea yet if this is actually a good idea.
 ```
 emptyDropletCutoff = recommend.empty.cutoff(ambProfile)
 ```
 ```
 emptyDropletCutoff        = 150 
 contaminationChanceCutoff = 0.005
 ```
 Determine the ambient RNA profile and remove the ambient RNA from each cell
 ```
 ambientProfile = determine.background.to.remove(fullMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff)
 cellMatrix     = remove.background(cellMatrix, ambientProfile)
 ```
 This corrected matrix can be used to to make a Seurat object
 ```
 seuratObject = CreateSeuratObject(cellMatrix) 
 ```
 ## Authors
 * **Marijn Berg** - m.berg@umcg.nl
 ## License
 This project is licensed under the GPL-3 License - see the [LICENSE.md](LICENSE.md) file for details
 ## Changelog
 ### v0.1
 First fully working version of the R package
 ### v0.2
 Fixed function to write the corrected matrix to file.
 Added readout of which genes will be corrected for and how many reads will be removed per cell
 Added some input checks to functions
 ### v0.2
 Fixed a bug that caused FastCAR to be incompatible with biobase libraries
 Added better profiling to determine the effect of different settings on the corrections
 Swapped base R plots for ggplot2 plots
Author	SHA1	Message	Date
MarijnBerg	42b2d340c5	Added changes to readme	2021-11-01 16:41:25 +01:00
MarijnBerg	55b27c57a6	Swapped base R plots for ggplot	2021-11-01 14:42:49 +01:00
MarijnBerg	a620b4e5d2	Fixed bug that caused incompatibility with biobase due to having the same function name. Added new profiling functions.	2021-11-01 14:25:43 +01:00
MarijnBerg	0427dcc6ad	Removed uniused variable	2021-10-05 15:00:42 +02:00
Marijn	f4b6748210	Fixed variable name issue in remove.background function	2020-06-08 10:15:27 +02:00
Marijn	40765c3a42	updated readme and made the new function match the rest of the formatting	2020-04-27 15:50:48 +02:00
Marijn	c457e087be	Added recommended empty cutoff function	2020-04-27 13:40:04 +02:00
Marijn	48dc9d81e6	Fixed Readme	2020-03-26 15:19:45 +01:00
Marijn	92cb2a655b	Fixed a bug where base rowSums were used instead of Matrix rowSums	2020-03-26 15:10:24 +01:00
Marijn	c7a8c6a881	Updated Readme	2020-03-26 14:12:27 +01:00
Marijn	a47d8aa4c9	Updated Readme	2020-03-26 14:09:20 +01:00
Marijn	7b7fe13c75	Updated Readme	2020-03-26 12:08:24 +01:00
Marijn	e7ac549311	Updated Readme	2020-03-26 12:05:04 +01:00
Marijn	7a2607a091	Improved Readme	2020-03-26 11:51:08 +01:00