Added changes to readme

Swapped base R plots for ggplot
Fixed bug that caused incompatibility with biobase due to having the same function name.
2021-11-01 16:41:25 +01:00 · 2021-11-01 14:42:49 +01:00 · 2021-11-01 14:25:43 +01:00 · 2021-10-05 15:00:42 +02:00
6 changed files with 160 additions and 21 deletions
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
--- a/Images/Counts_removed.png
+++ b/Images/Counts_removed.png
--- a/Images/DE_affect_chance.png
+++ b/Images/DE_affect_chance.png
--- a/R/FastCAR_Base.R
+++ b/R/FastCAR_Base.R
@@ -13,6 +13,9 @@
 library(Matrix)
 library(Seurat)
 library(qlcMatrix)
+library(pheatmap)
+library(ggplot2)
+library(gridExtra)

 ###############################################################################

@@ -46,12 +49,11 @@ remove.background = function(geneCellMatrix, ambientRNAprofile){
 }
 ##############

-determine.background.to.remove = function(fullCellMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff){
+determine.background.to.remove = function(fullCellMatrix, emptyDropletCutoff, contaminationChanceCutoff){

  # determines the highest expression value found for every gene in the droplets that we're sure don't contain cells
-  backGroundMax   = as.vector(rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
+  backGroundMax   = as.vector(qlcMatrix::rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
  names(backGroundMax) = rownames(fullCellMatrix)
-  nCell = ncol(cellMatrix)

  # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
  nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyDropletCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
@@ -80,6 +82,93 @@ read.full.matrix = function(fullFolderLocation){
  return(fullMatrix)
 }

+###############################################################################
+# Finds the expression value of `gene` at which the running total of its
+# expression, accumulated from the highest expressing column downwards, first
+# exceeds the fraction `percentile` of the largest running total.
+#   gene:       row name (or row index) of the gene in expMat
+#   expMat:     expression matrix that is indexed by gene on its rows
+#   percentile: fraction of the cumulative expression to look for, e.g. 0.95
+# Returns NA when no position exceeds `percentile` (e.g. a gene without any expression).
+getExpressionThreshold = function(gene, expMat, percentile){
+  geneCounts   = expMat[gene, ]
+  rankedCounts = geneCounts[order(geneCounts, decreasing = TRUE)]
+  runningTotal = cumsum(rankedCounts)
+  firstOver    = which(runningTotal / max(runningTotal) > percentile)[1]
+  return(rankedCounts[firstOver])
+}
+
+###############################################################################
+# Scores how concentrated the expression of `gene` is over the columns of expMat.
+# The columns are ordered from highest to lowest expression, the running total is
+# scaled by its maximum, and the mean of those fractions over all columns is returned.
+# A gene expressed in a single column scores 1, evenly spread expression scores close to 0.5.
+#   gene:   row name (or row index) of the gene in expMat
+#   expMat: expression matrix that is indexed by gene on its rows
+celltypeSpecificityScore = function(gene, expMat){
+  geneCounts   = expMat[gene, ]
+  runningTotal = cumsum(geneCounts[order(geneCounts, decreasing = TRUE)])
+  cumulativeFraction = runningTotal / max(runningTotal)
+  return(sum(cumulativeFraction) / ncol(expMat))
+}
+
+###############################################################################
+# Profiles the effect of the ambient RNA correction over a range of empty droplet cutoffs.
+#
+# Arguments:
+#   allExpression:             expression matrix handed to determine.background.to.remove
+#   cellExpression:            expression matrix that is indexed by gene on its rows; its row names
+#                              define the genes that are profiled
+#   startPos, stopPos, byLength: the empty droplet cutoffs to profile, seq(startPos, stopPos, byLength)
+#   contaminationChanceCutoff: handed to determine.background.to.remove unchanged
+#
+# Returns a data.frame with one row per gene and the columns
+#   ctsScores                    celltypeSpecificityScore of the gene (0 if not corrected at stopPos)
+#   belowCellexpression          number of cells with expression > 0 and below the 0.95 expression threshold
+#   stillOverAmbient<cutoff>     number of those cells that are still above the ambient level at <cutoff>
+#   AmbientCorrection<cutoff>    value of the ambient profile for the gene at <cutoff>
+#   Threshold_<cutoff>           stillOverAmbient<cutoff> / belowCellexpression, with NA replaced by 0
+#   contaminationChance<cutoff>  the same ratio, without the NA replacement
+#
+# NOTE: assumes determine.background.to.remove returns a numeric vector named by gene with one
+# entry per row of cellExpression - confirm against its definition.
+describe.correction.effect = function (allExpression, cellExpression, startPos, stopPos, byLength, contaminationChanceCutoff){
+
+  # Every empty droplet cutoff that gets profiled, beginning at startPos
+  cutoffs = seq(from = startPos, to = stopPos, by = byLength)
+
+  # Make somewhere to store all the data that needs to be returned to the user
+  ambientScoreProfileOverview = data.frame(row.names = rownames(cellExpression))
+
+  # do a quick first run to see which genes get corrected at the highest setting
+  # determine.background.to.remove takes (fullCellMatrix, emptyDropletCutoff, contaminationChanceCutoff),
+  # the cell matrix is not one of its arguments
+  ambientProfile = determine.background.to.remove(allExpression, stopPos, contaminationChanceCutoff)
+  genelist = names(ambientProfile[ambientProfile > 0])
+
+  print(paste0("Calculating cell expression score for ", length(genelist), " genes"))
+  ctsScores = vector(mode = "numeric", length = nrow(cellExpression))
+  names(ctsScores) = rownames(cellExpression)
+
+  for(gene in genelist){
+    ctsScores[gene] = celltypeSpecificityScore(gene, cellExpression)
+  }
+
+  # The scores do not depend on the cutoff, so store them once.
+  # This has to stay the first column: the plot.correction.effect functions select rows on column 3.
+  ambientScoreProfileOverview[names(ctsScores), "ctsScores"] = ctsScores
+
+  # Genes that still had cells above the ambient level at the previous cutoff.
+  # NULL until the first cutoff has been profiled. Kept as a local variable instead of
+  # testing exists(), which would also find a variable of this name in the global environment.
+  overAmbientGenes = NULL
+
+  # loop over every threshold to test
+  # Starts at startPos so genes that are completely removed at one cutoff can be skipped at the next
+  for(emptyDropletCutoff in cutoffs){
+    isFirstCutoff = emptyDropletCutoff == startPos
+    ambientProfile = determine.background.to.remove(allExpression, emptyDropletCutoff, contaminationChanceCutoff)
+
+    print(paste0("Profiling at cutoff ", emptyDropletCutoff))
+
+    ambientScoreProfile = data.frame(ambientProfile, ctsScores)
+    #ambientScoreProfile = ambientScoreProfile[ambientScoreProfile$ctsScores > 0.85, ]
+    ambientScoreProfile$stillOverAmbient = 0
+    ambientScoreProfile$belowCellexpression = 0
+
+    genesToCheck = names(ambientProfile[ambientProfile > 0])
+    if(!is.null(overAmbientGenes)){
+      genesToCheck = overAmbientGenes
+    }
+
+    print(paste0("Calculating profiles for ", length(genesToCheck), " genes"))
+
+    for(gene in genesToCheck){
+      expThresh = getExpressionThreshold(gene, cellExpression, 0.95)
+
+      # table(...)["TRUE"] is NA when no cell matches, those NAs are set to 0 below
+      if(isFirstCutoff){
+        ambientScoreProfile[gene, "belowCellexpression"] = table(cellExpression[gene,] > 0  & cellExpression[gene,] < expThresh)["TRUE"]
+      }
+      ambientScoreProfile[gene, "stillOverAmbient"] =  table(cellExpression[gene,] > ambientScoreProfile[gene, "ambientProfile"]  & cellExpression[gene,] < expThresh)["TRUE"]
+    }
+
+    ambientScoreProfile[is.na(ambientScoreProfile)] = 0
+    ambientScoreProfile$contaminationChance = ambientScoreProfile$stillOverAmbient / ambientScoreProfile$belowCellexpression
+    ambientScoreProfile[is.na(ambientScoreProfile)] = 0
+
+    # Genes that have already been completely removed don't need to be checked at higher resolution
+    overAmbientGenes = rownames(ambientScoreProfile[ambientScoreProfile$stillOverAmbient > 0,])
+
+    # belowCellexpression is only counted at the first cutoff, so it is only stored then
+    if(isFirstCutoff){
+      ambientScoreProfileOverview[rownames(ambientScoreProfile), "belowCellexpression"] = ambientScoreProfile$belowCellexpression
+    }
+    ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("stillOverAmbient", as.character(emptyDropletCutoff))] = ambientScoreProfile$stillOverAmbient
+    ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("AmbientCorrection", as.character(emptyDropletCutoff))] = ambientScoreProfile$ambientProfile
+  }
+  ambientScoreProfileOverview = ambientScoreProfileOverview[!is.na(ambientScoreProfileOverview$ctsScores),]
+  ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
+
+  stillOverAmbientColumns = paste0("stillOverAmbient", as.character(cutoffs))
+
+  ambientScoreProfileOverview[,paste0("Threshold_", cutoffs)] = ambientScoreProfileOverview[,stillOverAmbientColumns] / ambientScoreProfileOverview$belowCellexpression
+  ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
+
+  # Same ratio as the Threshold_ columns, these are the columns plot.correction.effect.chance draws
+  ambientScoreProfileOverview[,paste0("contaminationChance", as.character(cutoffs))] = ambientScoreProfileOverview[,stillOverAmbientColumns] / ambientScoreProfileOverview$belowCellexpression
+
+  return(ambientScoreProfileOverview)
+}
+
+
 ##############
 # Turns out that cellranger output looks different from WriteMM output and Read10X can't read the latter
 # TODO
@@ -100,14 +189,15 @@ read.full.matrix = function(fullFolderLocation){
 # }

 # describe the number of genes identified in the background
-# and the number of genes failing the contaminiation chance threshold
+# and the number of genes failing the contamination chance threshold
 #
 describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){
+  cutoffValue = seq(start, stop, by)
  genesInBackground  = vector(mode = "numeric", length = length(seq(start, stop, by)))
  genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
  nEmptyDroplets     = vector(mode = "numeric", length = length(seq(start, stop, by)))

-  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating)
+  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating, cutoffValue)
  rownames(ambientDescriptions) = seq(start, stop, by)
  for(emptyCutoff in seq(start, stop, by)){
    nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
@@ -125,23 +215,31 @@ describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contam
 }


+# Draws a heatmap of every "contaminationChance" column of a correction profile.
+# Only the rows with a value above zero in the third column are drawn.
+#   correctionProfile: data.frame as returned by describe.correction.effect
+plot.correction.effect.chance = function(correctionProfile){
+  chanceColumns = grep("contaminationChance", colnames(correctionProfile), value = TRUE)
+  rowsToShow    = correctionProfile[,3] > 0
+
+  pheatmap(correctionProfile[rowsToShow, chanceColumns],
+           main = "Chance of affecting DE analyses",
+           cluster_cols = FALSE,
+           treeheight_row = 0)
+}
+
+# Draws a heatmap of every "AmbientCorrection" column of a correction profile.
+# Only the rows with a value above zero in the third column are drawn.
+#   correctionProfile: data.frame as returned by describe.correction.effect
+plot.correction.effect.removal = function(correctionProfile){
+  correctionColumns = grep("AmbientCorrection", colnames(correctionProfile), value = TRUE)
+  rowsToShow        = correctionProfile[,3] > 0
+
+  pheatmap(correctionProfile[rowsToShow, correctionColumns],
+           main = "Counts removed from each cell",
+           cluster_cols = FALSE,
+           treeheight_row = 0)
+}
+

 plot.ambient.profile = function(ambientProfile){
-  par(mfrow = c(3,1))
-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
-       main = "Total number of empty droplets at cutoffs",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Number of empty droplets")

-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
-       main = "Number of genes in ambient RNA",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Genes in empty droplets")
+  # Plots the three columns of an ambient profile against its cutoffValue column,
+  # one panel per column, stacked in a single figure.
+  # The titles and axis labels are those of the base R plots this function drew before.
+  cutoffLabel = "empty droplet UMI cutoff"
+
+  p1 = ggplot(ambientProfile, aes(x=cutoffValue, y=genesInBackground)) + geom_point() +
+    labs(title = "Number of genes in ambient RNA", x = cutoffLabel, y = "Genes in empty droplets")
+
+  p2 = ggplot(ambientProfile, aes(x=cutoffValue, y=genesContaminating)) + geom_point() +
+    labs(title = "number of genes to correct", x = cutoffLabel, y = "Genes identified as contamination")
+
+  p3 = ggplot(ambientProfile, aes(x=cutoffValue, y=nEmptyDroplets)) + geom_point() +
+    labs(title = "Total number of empty droplets at cutoffs", x = cutoffLabel, y = "Number of empty droplets")
+
+  grid.arrange(p1, p2, p3, nrow = 3)

-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3],
-       main = "number of genes to correct",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Genes identified as contamination")
 }

 # I noticed that the number of genes removed tends to even out over time
--- a/README.md
+++ b/README.md
@@ -18,7 +18,9 @@ First load the library and dependencies.
 library(Matrix)
 library(Seurat)
 library(qlcMatrix)
-library(FastCAR)
+library(pheatmap)
+library(ggplot2)
+library(gridExtra)
 ```
 Specify the locations of the expression matrices

@@ -47,6 +49,31 @@ plot.ambient.profile(ambProfile)
 ![picture](Images/Example_profile.png)


+The actual effect on the chance of genes affecting your DE analyses can be determined and visualized with the following functions:
+
+``` 
+  
+  correctionEffectProfile = describe.correction.effect(allExpression, cellExpression, 50, 500, 10, 0.05)
+  
+  plot.correction.effect.chance(correctionEffectProfile)
+  
+```
+
+![picture](Images/DE_affect_chance.png)
+
+
+
+
+The number of reads that will be removed for these genes can be visualized from the same profile:
+```
+
+  plot.correction.effect.removal(correctionEffectProfile)
+
+``` 
+
+![picture](Images/Counts_removed.png)
+
+
 Set the empty droplet cutoff and the contamination chance cutoff

 The empty droplet cutoff is the number of UMIs a droplet can contain at the most to be considered empty.
@@ -69,8 +96,8 @@ emptyDropletCutoff = recommend.empty.cutoff(ambProfile)


 ```
-emptyDropletCutoff        = 100 
-contaminationChanceCutoff = 0.05
+emptyDropletCutoff        = 150 
+contaminationChanceCutoff = 0.005
 ```

 Determine the ambient RNA profile and remove the ambient RNA from each cell
@@ -103,3 +130,11 @@ First fully working version of the R package
 Fixed function to write the corrected matrix to file.
 Added readout of which genes will be corrected for and how many reads will be removed per cell
 Added some input checks to functions
+
+### v0.2
+Fixed a bug that caused FastCAR to be incompatible with the Biobase library
+Added better profiling to determine the effect of different settings on the corrections
+Swapped base R plots for ggplot2 plots
+
+
+
Author	SHA1	Message	Date
MarijnBerg	42b2d340c5	Added changes to readme	2021-11-01 16:41:25 +01:00
MarijnBerg	55b27c57a6	Swapped base R plots for ggplot	2021-11-01 14:42:49 +01:00
MarijnBerg	a620b4e5d2	Fixed bug that caused incompatibility with biobase due to having the same function name. Added new profiling functions.	2021-11-01 14:25:43 +01:00
MarijnBerg	0427dcc6ad	Removed uniused variable	2021-10-05 15:00:42 +02:00