From c457e087be46139de0c6b6cbcd2a85d91542155d Mon Sep 17 00:00:00 2001
From: Marijn <mbergwork@gmail.com>
Date: Mon, 27 Apr 2020 13:40:04 +0200
Subject: [PATCH] Added recommended empty cutoff function

---
 R/FastCAR_Base.R | 48 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/R/FastCAR_Base.R b/R/FastCAR_Base.R
index 272b8e7..c380cb3 100644
--- a/R/FastCAR_Base.R
+++ b/R/FastCAR_Base.R
@@ -7,7 +7,8 @@
 # empty droplets and removing per gene the highest value found in the ambient
 # RNA if it's found to contaminate more than a certain threshold of droplets
 ###############################################################################
-# This file contains base functions
+# TODO
+#
 ###############################################################################
 library(Matrix)
 library(Seurat)
@@ -22,6 +23,14 @@ library(qlcMatrix)
 # p: the where in 'i' and 'x' a new column starts
 # Dimnames: The names of the rows and columns
 remove.background = function(geneCellMatrix, ambientRNAprofile){
+  # do some input checks first
+  if(!("dgCMatrix" %in% class(geneCellMatrix))){
+    cat("geneCellMatrix should be a sparse matrix of the \"dgCMatrix\" class")
+    stop()
+  }
+
+
+  # Here is the actual functionality
   for(gene in names(ambientProfile[ambientProfile > 0])){
     # Determine the locations where the gene is not zero, therefore referenced in i
     iLocs = which(geneCellMatrix@Dimnames[[1]] == gene)
@@ -72,20 +81,23 @@ read.full.matrix = function(fullFolderLocation){
 }
 
 ##############
+# Turns out that cellranger output looks different from WriteMM output and Read10X can't read the latter
+# TODO
+# Commented out until I find a fix
 
-write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
-  dir.create(folderLocation)
-
-  writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
-
-  # save genes and cells names
-  write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
-  write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
-
-  correctedSignal = correctedSignal[correctedSignal > 0]
-  write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
-
-}
+# write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
+#   dir.create(folderLocation)
+#
+#   writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
+#
+#   # save genes and cells names
+#   write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
+#   write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
+#
+#   correctedSignal = correctedSignal[correctedSignal > 0]
+#   write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
+#
+# }
 
 # describe the number of genes identified in the background
 # and the number of genes failing the contaminiation chance threshold
@@ -132,7 +144,13 @@ plot.ambient.profile = function(ambientProfile){
        ylab = "Genes identified as contamination")
 }
 
-
+# I noticed that the number of genes removed tends to even out over time
+# Test whether the point where this first happens is a good empty cutoff point
+recommendedRemoval = function(ambientProfile){
+  highestNumberOfGenes = max(ambientProfile[,3])
+  firstOccurence = match(highestNumberOfGenes, ambientProfile[,3])
+  return(as.numeric(rownames(ambientProfile[firstOccurence,])))
+}