Fixed bug that caused incompatibility with biobase due to having the same function name.

Added new profiling functions.
2021-11-01 14:25:43 +01:00
parent 0427dcc6ad
commit a620b4e5d2
1 changed files with 108 additions and 6 deletions
--- a/R/FastCAR_Base.R
+++ b/R/FastCAR_Base.R
@@ -13,6 +13,8 @@
 library(Matrix)
 library(Seurat)
 library(qlcMatrix)
+library(pheatmap)
+library(ggplot2)

 ###############################################################################

@@ -46,10 +48,10 @@ remove.background = function(geneCellMatrix, ambientRNAprofile){
 }
 ##############

-determine.background.to.remove = function(fullCellMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff){
+determine.background.to.remove = function(fullCellMatrix, emptyDropletCutoff, contaminationChanceCutoff){

  # determines the highest expression value found for every gene in the droplets that we're sure don't contain cells
-  backGroundMax   = as.vector(rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
+  backGroundMax   = as.vector(qlcMatrix::rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
  names(backGroundMax) = rownames(fullCellMatrix)

  # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
@@ -79,6 +81,93 @@ read.full.matrix = function(fullFolderLocation){
  return(fullMatrix)
 }

+###############################################################################
+getExpressionThreshold = function(gene, expMat, percentile){
+  orderedExpression = expMat[gene, order(expMat[gene,], decreasing = TRUE)]
+  CS = cumsum(orderedExpression)
+  return(orderedExpression[which(CS/max(CS) > percentile)[1]])
+}
+
+###############################################################################
+celltypeSpecificityScore = function(gene, expMat){
+  CS = cumsum(expMat[gene, order(expMat[gene,], decreasing = TRUE)])
+  return((sum(CS/max(CS))/ncol(expMat))  )
+}
+
+###############################################################################
+describe.correction.effect = function (allExpression, cellExpression, startPos, stopPos, byLength, contaminationChanceCutoff){
+
+  # Make somewhere to store all the data that needs to be returned to the user
+  ambientScoreProfileOverview = data.frame(row.names = rownames(cellExpression))
+
+  # do a quick first run to see which genes get corrected at the highest setting
+  ambientProfile = determine.background.to.remove(allExpression, cellExpression, stopPos, contaminationChanceCutoff)
+  genelist = names(ambientProfile[ambientProfile > 0])
+
+  print(paste0("Calculating cell expression score for ", length(genelist), " genes"))
+  ctsScores = vector(mode = "numeric", length = nrow(cellExpression))
+  names(ctsScores) = rownames(cellExpression)
+
+  for(gene in genelist){
+    ctsScores[gene] = celltypeSpecificityScore(gene, cellExpression)
+  }
+
+  # loop over every threshold to test
+  # Starts at the highest value so
+  for(emptyDropletCutoff in seq(from = startPos, to = stopPos, by = byLength)){
+    ambientProfile = determine.background.to.remove(allExpression, cellExpression, emptyDropletCutoff, contaminationChanceCutoff)
+
+    print(paste0("Profiling at cutoff ", emptyDropletCutoff))
+
+    ambientScoreProfile = data.frame(ambientProfile, ctsScores)
+    #ambientScoreProfile = ambientScoreProfile[ambientScoreProfile$ctsScores > 0.85, ]
+    ambientScoreProfile$stillOverAmbient = 0
+    ambientScoreProfile$belowCellexpression = 0
+
+    genesToCheck = names(ambientProfile[ambientProfile > 0])
+    if(exists("overAmbientGenes")){
+      genesToCheck = overAmbientGenes
+    }
+
+    print(paste0("Calculating profiles for ", length(genesToCheck), " genes"))
+
+    for(gene in genesToCheck){
+      expThresh = getExpressionThreshold(gene, cellExpression, 0.95)
+
+      if(emptyDropletCutoff == startPos){
+        ambientScoreProfile[gene, "belowCellexpression"] = table(cellExpression[gene,] > 0  & cellExpression[gene,] < expThresh)["TRUE"]
+      }
+      ambientScoreProfile[gene, "stillOverAmbient"] =  table(cellExpression[gene,] > ambientScoreProfile[gene, "ambientProfile"]  & cellExpression[gene,] < expThresh)["TRUE"]
+    }
+
+    ambientScoreProfile[is.na(ambientScoreProfile)] = 0
+    ambientScoreProfile$contaminationChance = ambientScoreProfile$stillOverAmbient / ambientScoreProfile$belowCellexpression
+    ambientScoreProfile[is.na(ambientScoreProfile)] = 0
+
+    # Genes that have already been completely removed don't need to be checked at higher resolution
+    overAmbientGenes = rownames(ambientScoreProfile[ambientScoreProfile$stillOverAmbient > 0,])
+
+    ambientScoreProfile[genelist,"AmbientCorrection"]
+
+    ambientScoreProfileOverview[names(ctsScores), "ctsScores"] = ctsScores
+    if(emptyDropletCutoff == startPos){
+      ambientScoreProfileOverview[rownames(ambientScoreProfile), "belowCellexpression"] = ambientScoreProfile$belowCellexpression
+    }
+    ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("stillOverAmbient", as.character(emptyDropletCutoff))] = ambientScoreProfile$stillOverAmbient
+    ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("AmbientCorrection", as.character(emptyDropletCutoff))] = ambientScoreProfile$ambientProfile
+  }
+  ambientScoreProfileOverview = ambientScoreProfileOverview[!is.na(ambientScoreProfileOverview$ctsScores),]
+  ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
+
+  ambientScoreProfileOverview[,paste0("Threshold_", seq(from = startPos, to = stopPos, by = byLength))] = ambientScoreProfileOverview[,paste0("stillOverAmbient", as.character(seq(from = startPos, to = stopPos, by = byLength)))] / ambientScoreProfileOverview$belowCellexpression
+  ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
+
+  ambientScoreProfileOverview[,paste0("contaminationChance", as.character(seq(from = startPos, to = stopPos, by = byLength)))] = ambientScoreProfileOverview[,paste0("stillOverAmbient", as.character(seq(from = startPos, to = stopPos, by = byLength)))] / ambientScoreProfileOverview$belowCellexpression
+
+  return(ambientScoreProfileOverview)
+}
+
+
 ##############
 # Turns out that cellranger output looks different from WriteMM output and Read10X can't read the latter
 # TODO
@@ -123,14 +212,27 @@ describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contam
  return(ambientDescriptions)
 }

+plot.correction.effect.chance = function(correctionProfile){
+  pheatmap(correctionProfile[correctionProfile[,3] > 0, colnames(correctionProfile)[grep("contaminationChance", colnames(correctionProfile))]],
+           cluster_cols = FALSE,
+           treeheight_row = 0,
+           main = "Chance of affecting DE analyses")
+}
+
+plot.correction.effect.removal = function(correctionProfile){
+  pheatmap(correctionProfile[(correctionProfile[,3] > 0) ,colnames(correctionProfile)[grep("AmbientCorrection", colnames(correctionProfile))]],
+           cluster_cols = FALSE, treeheight_row = 0,
+           main = "Counts removed from each cell")
+}


 plot.ambient.profile = function(ambientProfile){
  par(mfrow = c(3,1))
-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
-       main = "Total number of empty droplets at cutoffs",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Number of empty droplets")
+  ggplot(ambientProfile, aes(x=wt, y=genesInBackground)) + geom_point()
+
+  genesInBackground  = vector(mode = "numeric", length = length(seq(start, stop, by)))
+  genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
+  nEmptyDroplets     = vector(mode = "numeric", length = length(seq(start, stop, by)))

  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
       main = "Number of genes in ambient RNA",