Added changes to readme

Swapped base R plots for ggplot
Fixed bug that caused incompatibility with biobase due to having the same function name.
2021-11-01 16:41:25 +01:00 · 2021-11-01 14:42:49 +01:00 · 2021-11-01 14:25:43 +01:00 · 2021-10-05 15:00:42 +02:00 · 2020-06-08 10:15:27 +02:00 · 2020-04-27 15:50:48 +02:00
13 changed files with 333 additions and 254 deletions
--- a/.Rbuildignore
+++ b/.Rbuildignore
@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
--- a/17
+++ b/17
@ -1,11 +1,16 @@
 Package: FastCAR
 Type: Package
-Title: What the Package Does (Title Case)
+Title: Fast Correction of Ambient RNA
 Version: 0.1.0
-Author: Who wrote it
-Maintainer: The package maintainer <yourself@somewhere.net>
-Description: More about what it does (maybe more than one line)
-    Use four spaces when indenting paragraphs within the Description.
-License: What license is it under?
+Author: Marijn Berg
+Maintainer: Marijn Berg <m.berg@umcg.nl>
+Description: FastCAR is used to correct gene expression of cells in droplet based single cell RNA
+    sequencing data. It corrects for ambient RNA, RNA originating from lysed cells in the cell suspension.
+License: GPL-3
 Encoding: UTF-8
 LazyData: true
+RoxygenNote: 7.1.0.9000
+Imports:
+    Matrix,
+    Seurat,
+    qlcMatrix
--- a/FastCAR.Rproj
+++ b/FastCAR.Rproj
@ -0,0 +1,20 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
--- a/Images/Counts_removed.png
+++ b/Images/Counts_removed.png
--- a/Images/DE_affect_chance.png
+++ b/Images/DE_affect_chance.png
--- a/Images/Example_profile.png
+++ b/Images/Example_profile.png
--- a/1
+++ b/1
@ -0,0 +1 @@
+exportPattern("^[[:alpha:]]+")
--- a/R/FastCAR_Base.R
+++ b/R/FastCAR_Base.R
@ -7,10 +7,16 @@
 # empty droplets and removing per gene the highest value found in the ambient
 # RNA if it's found to contaminate more than a certain threshold of droplets
 ###############################################################################
-# This file contains base functions
+# TODO
+#
 ###############################################################################
 library(Matrix)
 library(Seurat)
+library(qlcMatrix)
+library(pheatmap)
+library(ggplot2)
+library(gridExtra)
+
 ###############################################################################

 # had to make this function to efficiently modify sparse matrices on a per gene basis
@ -20,13 +26,21 @@ library(Seurat)
 # p: the where in 'i' and 'x' a new column starts
 # Dimnames: The names of the rows and columns
 remove.background = function(geneCellMatrix, ambientRNAprofile){
-  for(gene in names(ambientProfile[ambientProfile > 0])){
+  # do some input checks first
+  if(!("dgCMatrix" %in% class(geneCellMatrix))){
+    cat("geneCellMatrix should be a sparse matrix of the \"dgCMatrix\" class")
+    stop()
+  }
+
+
+  # Here is the actual functionality
+  for(gene in names(ambientRNAprofile[ambientRNAprofile > 0])){
    # Determine the locations where the gene is not zero, therefore referenced in i
    iLocs = which(geneCellMatrix@Dimnames[[1]] == gene)
    # Determine the location of the actual values,
    xLocs = which(geneCellMatrix@i == iLocs-1) # -1 because of 0 and 1 based counting systems
    # Remove the contaminating RNA
-    geneCellMatrix@x[xLocs] = geneCellMatrix@x[xLocs] - ambientProfile[gene]
+    geneCellMatrix@x[xLocs] = geneCellMatrix@x[xLocs] - ambientRNAprofile[gene]
  }
  # correct for instances where the expression was corrected to below zero
  geneCellMatrix@x[geneCellMatrix@x < 0] = 0
@ -35,17 +49,16 @@ remove.background = function(geneCellMatrix, ambientRNAprofile){
 }
 ##############

-determine.background.to.remove = function(fullCellMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff){
+determine.background.to.remove = function(fullCellMatrix, emptyDropletCutoff, contaminationChanceCutoff){

  # determines the highest expression value found for every gene in the droplets that we're sure don't contain cells
-  backGroundMax   = as.vector(rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
+  backGroundMax   = as.vector(qlcMatrix::rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
  names(backGroundMax) = rownames(fullCellMatrix)
-  nCell = ncol(cellMatrix)

  # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
  nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyDropletCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
  # rowSum on a logical statement returns the number of TRUE occurences
-  occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff] !=0)
+  occurences = Matrix::rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff] !=0)

  #probability of a background read of a gene ending up in a cell
  probabiltyCellContaminationPerGene = occurences / nEmpty
@ -69,36 +82,127 @@ read.full.matrix = function(fullFolderLocation){
  return(fullMatrix)
 }

-##############
-
-write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
-  dir.create(folderLocation)
-
-  writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
-
-  # save genes and cells names
-  write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
-  write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
-
-  correctedSignal = correctedSignal[correctedSignal > 0]
-  write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
-
+###############################################################################
+getExpressionThreshold = function(gene, expMat, percentile){
+  orderedExpression = expMat[gene, order(expMat[gene,], decreasing = TRUE)]
+  CS = cumsum(orderedExpression)
+  return(orderedExpression[which(CS/max(CS) > percentile)[1]])
 }

+###############################################################################
+celltypeSpecificityScore = function(gene, expMat){
+  CS = cumsum(expMat[gene, order(expMat[gene,], decreasing = TRUE)])
+  return((sum(CS/max(CS))/ncol(expMat))  )
+}
+
+###############################################################################
+describe.correction.effect = function (allExpression, cellExpression, startPos, stopPos, byLength, contaminationChanceCutoff){
+
+  # Make somewhere to store all the data that needs to be returned to the user
+  ambientScoreProfileOverview = data.frame(row.names = rownames(cellExpression))
+
+  # do a quick first run to see which genes get corrected at the highest setting
+  ambientProfile = determine.background.to.remove(allExpression, cellExpression, stopPos, contaminationChanceCutoff)
+  genelist = names(ambientProfile[ambientProfile > 0])
+
+  print(paste0("Calculating cell expression score for ", length(genelist), " genes"))
+  ctsScores = vector(mode = "numeric", length = nrow(cellExpression))
+  names(ctsScores) = rownames(cellExpression)
+
+  for(gene in genelist){
+    ctsScores[gene] = celltypeSpecificityScore(gene, cellExpression)
+  }
+
+  # loop over every threshold to test
+  # Starts at the highest value so
+  for(emptyDropletCutoff in seq(from = startPos, to = stopPos, by = byLength)){
+    ambientProfile = determine.background.to.remove(allExpression, cellExpression, emptyDropletCutoff, contaminationChanceCutoff)
+
+    print(paste0("Profiling at cutoff ", emptyDropletCutoff))
+
+    ambientScoreProfile = data.frame(ambientProfile, ctsScores)
+    #ambientScoreProfile = ambientScoreProfile[ambientScoreProfile$ctsScores > 0.85, ]
+    ambientScoreProfile$stillOverAmbient = 0
+    ambientScoreProfile$belowCellexpression = 0
+
+    genesToCheck = names(ambientProfile[ambientProfile > 0])
+    if(exists("overAmbientGenes")){
+      genesToCheck = overAmbientGenes
+    }
+
+    print(paste0("Calculating profiles for ", length(genesToCheck), " genes"))
+
+    for(gene in genesToCheck){
+      expThresh = getExpressionThreshold(gene, cellExpression, 0.95)
+
+      if(emptyDropletCutoff == startPos){
+        ambientScoreProfile[gene, "belowCellexpression"] = table(cellExpression[gene,] > 0  & cellExpression[gene,] < expThresh)["TRUE"]
+      }
+      ambientScoreProfile[gene, "stillOverAmbient"] =  table(cellExpression[gene,] > ambientScoreProfile[gene, "ambientProfile"]  & cellExpression[gene,] < expThresh)["TRUE"]
+    }
+
+    ambientScoreProfile[is.na(ambientScoreProfile)] = 0
+    ambientScoreProfile$contaminationChance = ambientScoreProfile$stillOverAmbient / ambientScoreProfile$belowCellexpression
+    ambientScoreProfile[is.na(ambientScoreProfile)] = 0
+
+    # Genes that have already been completely removed don't need to be checked at higher resolution
+    overAmbientGenes = rownames(ambientScoreProfile[ambientScoreProfile$stillOverAmbient > 0,])
+
+    ambientScoreProfile[genelist,"AmbientCorrection"]
+
+    ambientScoreProfileOverview[names(ctsScores), "ctsScores"] = ctsScores
+    if(emptyDropletCutoff == startPos){
+      ambientScoreProfileOverview[rownames(ambientScoreProfile), "belowCellexpression"] = ambientScoreProfile$belowCellexpression
+    }
+    ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("stillOverAmbient", as.character(emptyDropletCutoff))] = ambientScoreProfile$stillOverAmbient
+    ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("AmbientCorrection", as.character(emptyDropletCutoff))] = ambientScoreProfile$ambientProfile
+  }
+  ambientScoreProfileOverview = ambientScoreProfileOverview[!is.na(ambientScoreProfileOverview$ctsScores),]
+  ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
+
+  ambientScoreProfileOverview[,paste0("Threshold_", seq(from = startPos, to = stopPos, by = byLength))] = ambientScoreProfileOverview[,paste0("stillOverAmbient", as.character(seq(from = startPos, to = stopPos, by = byLength)))] / ambientScoreProfileOverview$belowCellexpression
+  ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
+
+  ambientScoreProfileOverview[,paste0("contaminationChance", as.character(seq(from = startPos, to = stopPos, by = byLength)))] = ambientScoreProfileOverview[,paste0("stillOverAmbient", as.character(seq(from = startPos, to = stopPos, by = byLength)))] / ambientScoreProfileOverview$belowCellexpression
+
+  return(ambientScoreProfileOverview)
+}
+
+
+##############
+# Turns out that cellranger output looks different from WriteMM output and Read10X can't read the latter
+# TODO
+# Commented out until I find a fix
+
+# write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
+#   dir.create(folderLocation)
+#
+#   writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
+#
+#   # save genes and cells names
+#   write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
+#   write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
+#
+#   correctedSignal = correctedSignal[correctedSignal > 0]
+#   write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
+#
+# }
+
 # describe the number of genes identified in the background
-# and the number of genes failing the contaminiation chance threshold
+# and the number of genes failing the contamination chance threshold
 #
 describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){
+  cutoffValue = seq(start, stop, by)
  genesInBackground  = vector(mode = "numeric", length = length(seq(start, stop, by)))
  genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
  nEmptyDroplets     = vector(mode = "numeric", length = length(seq(start, stop, by)))

-  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating)
+  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating, cutoffValue)
  rownames(ambientDescriptions) = seq(start, stop, by)
  for(emptyCutoff in seq(start, stop, by)){
    nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]

-    occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)
+    occurences = Matrix::rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)

    #probability of a background read of a gene ending up in a cell
    probabiltyCellContaminationPerGene = occurences / nEmpty
@ -111,23 +215,39 @@ describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contam
 }


+plot.correction.effect.chance = function(correctionProfile){
+  pheatmap(correctionProfile[correctionProfile[,3] > 0, colnames(correctionProfile)[grep("contaminationChance", colnames(correctionProfile))]],
+           cluster_cols = FALSE,
+           treeheight_row = 0,
+           main = "Chance of affecting DE analyses")
+}
+
+plot.correction.effect.removal = function(correctionProfile){
+  pheatmap(correctionProfile[(correctionProfile[,3] > 0) ,colnames(correctionProfile)[grep("AmbientCorrection", colnames(correctionProfile))]],
+           cluster_cols = FALSE, treeheight_row = 0,
+           main = "Counts removed from each cell")
+}
+

 plot.ambient.profile = function(ambientProfile){
-  par(mfrow = c(3,1))
-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
-       main = "Total number of empty droplets at cutoffs",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Number of empty droplets")

-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
-       main = "Number of genes in ambient RNA",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Genes in empty droplets")
+  p1 = ggplot(ambientProfile, aes(x=cutoffValue, y=genesInBackground)) + geom_point()

-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3],
-       main = "number of genes to correct",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Genes identified as contamination")
+
+  p2= ggplot(ambientProfile, aes(x=cutoffValue, y=genesContaminating)) + geom_point()
+
+  p3 = ggplot(ambientProfile, aes(x=cutoffValue, y=nEmptyDroplets)) + geom_point()
+
+  grid.arrange(p1, p2, p3, nrow = 3)
+
+}
+
+# I noticed that the number of genes removed tends to even out over time
+# Test whether the point where this first happens is a good empty cutoff point
+recommend.empty.cutoff = function(ambientProfile){
+  highestNumberOfGenes = max(ambientProfile[,3])
+  firstOccurence = match(highestNumberOfGenes, ambientProfile[,3])
+  return(as.numeric(rownames(ambientProfile[firstOccurence,])))
 }


@ -139,5 +259,3 @@ plot.ambient.profile = function(ambientProfile){



-
-
--- a/R/FastCAR_Run_correction.R
+++ b/R/FastCAR_Run_correction.R
@ -1,41 +0,0 @@
-###############################################################################
-# FastCAR package
-# Marijn Berg
-# m.berg@umcg.nl
-###############################################################################
-# FastCAR removes ambient RNA from 10X data by looking at the profile of the
-# empty droplets and removing per gene the highest value found in the ambient
-# RNA if it's found to contaminate more than a certain threshold of droplets
-###############################################################################
-
-# This script runs a simple correction with standard settings
-library(Matrix)
-library(Seurat)
-library(qlcMatrix)
-source("R/FastCAR_Base.R")
-
-emptyDropletCutoff        = 100  # Droplets with fewer than this number of UMIs are considered empty
-contaminationChanceCutoff = 0.05 # This is the maximum fraction of droplets containing the contamination
-
-cellExpressionFolder  = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/filtered_feature_bc_matrix/")
-fullMatrixFolder      = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/raw_feature_bc_matrix/")
-
-# This folder will contain the corrected cell matrix
-correctedMatrixFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/corrected_feature_bc_matrix")
-
-# these are literally wrappers for the Seurat functions but it's good practice in case the function changes later
-cellMatrix     = read.cell.matrix(cellExpressionFolder)
-fullMatrix     = read.full.matrix(fullMatrixFolder)
-
-###############################################################################
-# This is an optional function that will show the effects of different cutoff values
-# start, stop, and by all refer to number of UMI, it determines the background ate steps of by, from start to stop
-ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix, start = 10, stop = 500, by = 10, contaminationChanceCutoff = 0.05)
-plot.ambient.profile(ambProfile)
-###############################################################################
-
-ambientProfile = determine.background.to.remove(fullMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff)
-
-cellMatrix     = remove.background(cellMatrix, ambientProfile)
-
-write.corrected.matrix(cellMatrix, correctedMatrixFolder, ambientProfile)
--- a/R/FastCAR_profiling_functions.R
+++ b/R/FastCAR_profiling_functions.R
@ -1,65 +0,0 @@
-###############################################################################
-# FastCAR package
-# Marijn Berg
-# m.berg@umcg.nl
-###############################################################################
-# FastCAR removes ambient RNA from 10X data by looking at the profile of the
-# empty droplets and removing per gene the highest value found in the ambient
-# RNA if it's found to contaminate more than a certain threshold of droplets
-###############################################################################
-# This script contains functions to profile the Ambient RNA to suggest
-# good settings to use to find genes to correct for
-
-###############################################################################
-library(Matrix)
-library(Seurat)
-library(qlcMatrix)
-###############################################################################
-
-# describe the number of genes identified in the background
-# and the number of genes failing the contaminiation chance threshold
-#
-describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){
-  genesInBackground  = vector(mode = "numeric", length = length(seq(start, stop, by)))
-  genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
-  nEmptyDroplets     = vector(mode = "numeric", length = length(seq(start, stop, by)))
-
-  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating)
-  rownames(ambientDescriptions) = seq(start, stop, by)
-  for(emptyCutoff in seq(start, stop, by)){
-    nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
-
-    occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)
-
-    #probability of a background read of a gene ending up in a cell
-    probabiltyCellContaminationPerGene = occurences / nEmpty
-    nFailingThreshold = sum(probabiltyCellContaminationPerGene > contaminationChanceCutoff)
-
-    nGenes = sum(occurences != 0)
-    ambientDescriptions[as.character(emptyCutoff), c(1,2,3)] = c(nEmpty ,nGenes, nFailingThreshold)
-  }
-  return(ambientDescriptions)
-}
-
-
-
-plot.ambient.profile = function(ambientProfile){
-  par(mfrow = c(3,1))
-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
-       main = "Total number of empty droplets at cutoffs",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Number of empty droplets")
-
-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
-       main = "Number of genes in ambient RNA",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Genes in empty droplets")
-
-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3],
-       main = "number of genes to correct",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Genes identified as contamination")
-}
-
-
-
--- a/R/tmp.R
+++ b/R/tmp.R
@ -1,102 +0,0 @@
-cutoffEmpty = 100   # set this to NA to determine dynamicallly
-
-###############################################################################
-
-# remove this or things will keep getting added to it
-rm(biopsySeurat, backGroundOverviews)
-
-for(folder in dataFolders[1]){
-  gc()
-  donor  = str_sub(folder, -7,-1)
-  sample = str_sub(folder, 19)
-  sample = str_sub(sample, 1, -9)
-
-  allExpression  = Read10X(paste(folder, "/raw_feature_bc_matrix", sep = ""))
-  colnames(allExpression) = paste(datasetInfo[sample, "prefix"], colnames(allExpression), sep = "")
-  # change barcode ids to match r object
-
-  if(is.na(cutoffEmpty)){
-    ##########################################
-    # determine the number of genes in the empty droplets at various cutoffs
-
-    foundNumberOfGenes = vector(mode = "numeric", length = length(seq(kneePointStart, kneePointStop, kneePointSteps)))
-    names(foundNumberOfGenes) = as.character(seq(kneePointStart, kneePointStop, kneePointSteps))
-
-    for(cutoffValue in seq(kneePointStart, kneePointStop, kneePointSteps)){
-      nExpressedCells = Matrix::rowSums(allExpression[, Matrix::colSums(allExpression) < cutoffValue])
-      nExpressedCells[nExpressedCells > 0] = 1
-      foundNumberOfGenes[as.character(cutoffValue)]= sum(nExpressedCells)
-    }
-    write.table(foundNumberOfGenes, file = paste("Foundgenes_cutoff_", donor, ".csv"))
-    ##########################################
-    # Find the point where increase in the number of genes decreases the most
-
-    cutoffEmpty = which(diff(foundNumberOfGenes) == max(diff(foundNumberOfGenes)[find_peaks(diff(foundNumberOfGenes))]))
-
-    # cutoffEmpty = as.numeric(names(which(diff(foundNumberOfGenes) == min(diff(foundNumberOfGenes)))))[1] # have to add this at the end, multiple changes can be the same value
-
-    ##########################################
-    png(paste("Empty_droplets_content_", donor, ".png", sep = ""), height = 400, width = 800)
-    plot(as.numeric(names(foundNumberOfGenes)), foundNumberOfGenes,   main = paste(donor, " unique genes per cutoff"), xlab = "cutoff <")
-    segments(cutoffEmpty, 0 ,cutoffEmpty, 25000)
-    dev.off()
-  }
-
-  # read this in as a full matrix, haven't figgered out how to do this on a sparse matrix
-  cellExpression = allExpression[,cellIDs[cellIDs %in% colnames(allExpression)]]
-
-  if(!exists("biopsySeurat")){
-    biopsySeurat <<- CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy")
-    biopsySeurat[["orig.ident"]] = donor
-    biopsySeurat[["percent.mt"]] <- PercentageFeatureSet(biopsySeurat, pattern = "^MT-")
-    # biopsySeurat <- subset(biopsySeurat, subset = percent.mt < quantile(biopsySeurat@meta.data$percent.mt, c(.9)) )
-  }else{
-    tmpbiopsySeurat = CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy")
-    tmpbiopsySeurat[["orig.ident"]] = donor
-
-    tmpbiopsySeurat[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeurat, pattern = "^MT-")
-    # tmpbiopsySeurat <- subset(tmpbiopsySeurat, subset = percent.mt < quantile(tmpbiopsySeurat@meta.data$percent.mt, c(.9), na.rm = TRUE))
-
-    biopsySeurat = merge(biopsySeurat, tmpbiopsySeurat)
-  }
-
-
-  backgroundTotal  = Matrix::rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty])
-  backGroundMax   = as.vector(rowMax(allExpression[names(backgroundTotal),Matrix::colSums(allExpression) < cutoffEmpty]))
-
-  nCell = ncol(cellExpression)
-  # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
-  nEmpty = table((Matrix::colSums(allExpression) < cutoffEmpty) &(Matrix::colSums(allExpression) > 0))[2]
-  occurences = rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty] !=0)
-
-  #probability of a background read of a gene ending up in a cell
-  pCell = occurences / nEmpty
-
-  write.csv(pCell, file = paste("pCell_", donor, ".csv", sep = ""))
-  write.csv(backGroundMax, file = paste("bgMax_", donor, ".csv", sep = ""))
-
-
-
-  # set the background value for those genes that are unlikely to be a significant contaminant to 0 (zero)
-  backGroundMax[pCell < acceptableFractionContaminated] = 0
-
-  # remove the background
-  cellExpression = apply(cellExpression , 2, '-', backGroundMax)
-  cellExpression[cellExpression < 0] = 0
-  # cellExpression = as.sparse(cellExpression)
-
-  if(!exists("biopsySeuratAC")){
-    biopsySeuratAC <<- CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy")
-    biopsySeuratAC[["orig.ident"]] = donor
-    biopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(biopsySeuratAC, pattern = "^MT-")
-    # biopsySeuratAC <- subset(biopsySeuratAC, subset = percent.mt < quantile(biopsySeuratAC@meta.data$percent.mt, c(.9)) )
-  }else{
-    tmpbiopsySeuratAC = CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy")
-    tmpbiopsySeuratAC[["orig.ident"]] = donor
-
-    tmpbiopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeuratAC, pattern = "^MT-")
-    # tmpbiopsySeuratAC <- subset(tmpbiopsySeuratAC, subset = percent.mt < quantile(tmpbiopsySeuratAC@meta.data$percent.mt, c(.9), na.rm = TRUE))
-
-    biopsySeuratAC = merge(biopsySeuratAC, tmpbiopsySeuratAC)
-  }
-}
--- a/README.md
+++ b/README.md
@ -1,3 +1,140 @@
 # FastCAR

-Repo for the FastCAR (Fast Correction of Ambient RNA) R package and maybe  eventual python library
+FastCAR is an R package to remove ambient RNA from cells in droplet based single cell RNA sequencing data.
+
+
+### Installation
+
+FastCAR can be installed from git with the following command.
+
+```
+devtools::install_git("https://git.web.rug.nl/P278949/FastCAR")
+```
+
+Running FastCAR is quite simple.
+First load the library and dependencies.
+
+```
+library(Matrix)
+library(Seurat)
+library(qlcMatrix)
+library(pheatmap)
+library(ggplot2)
+library(gridExtra)
+```
+Specify the locations of the expression matrices
+
+```
+cellExpressionFolder  = c("Cellranger_output/sample1/filtered_feature_bc_matrix/")
+fullMatrixFolder      = c("Cellranger_output/sample1/raw_feature_bc_matrix/")
+```
+Load both the cell matrix and the full matrix
+```
+cellMatrix     = read.cell.matrix(cellExpressionFolder)
+fullMatrix     = read.full.matrix(fullMatrixFolder)
+```
+The following functions give an idea of the effect that different settings have on the ambient RNA profile. 
+These are optional as they do take a few minutes and the default settings work fine
+Plotting the number of empty droplets, the number of genes identified in the ambient RNA, and the number of genes that will be corrected for at different UMI cutoffs,
+
+```
+ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix, 
+                                           start = 10, 
+                                           stop = 500, 
+                                           by = 10, 
+                                           contaminationChanceCutoff = 0.05)
+                                           
+plot.ambient.profile(ambProfile)
+``` 
+![picture](Images/Example_profile.png)
+
+
+The actual effect on the chances of genes affecting your DE analyses can be determined and visualized with the following function
+
+``` 
+  
+  correctionEffectProfile = describe.correction.effect(allExpression, cellExpression, 50, 500, 10, 0.05)
+  
+  plot.correction.effect.chance(correctionEffectProfile)
+  
+```
+
+![picture](Images/DE_affect_chance.png)
+
+
+
+
+How many reads will be removed of these genes can be visualized from the same profile
+```
+
+  plot.correction.effect.removal(correctionEffectProfile)
+
+``` 
+
+![picture](Images/Counts_removed.png)
+
+
+Set the empty droplet cutoff and the contamination chance cutoff
+
+The empty droplet cutoff is the number of UMIs a droplet can contain at the most to be considered empty.
+100 works fine but we tested this method in only one tissue. For other tissues these settings may need to be changed.
+Increasing this number also increases the highest possible value of expression of a given gene.
+As the correction will remove this value from every cell it is adviced not to set this too high and thereby overcorrect the expression in lowly expressing cells.
+
+The contamination chance cutoff is the allowed probability of a gene contaminating a cell. 
+As we developed FastCAR specifically for differential expression analyses between groups we suggest setting this such that not enough cells could be contaminated to affect this.
+In a cluster of a thousand cells divided into two groups there would be 2-3 cells per group with ambient RNA contamination of any given gene.
+Such low cell numbers are disregarded for differential expression analyses.
+
+There is an experimental function that gives a recommendation based on the ambient profiling results.
+This selects the first instance of the maximum number of genes being corrected for.
+I have no idea yet if this is actually a good idea.
+
+```
+emptyDropletCutoff = recommend.empty.cutoff(ambProfile)
+```
+
+
+```
+emptyDropletCutoff        = 150 
+contaminationChanceCutoff = 0.005
+```
+
+Determine the ambient RNA profile and remove the ambient RNA from each cell
+```
+ambientProfile = determine.background.to.remove(fullMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff)
+cellMatrix     = remove.background(cellMatrix, ambientProfile)
+```
+
+This corrected matrix can be used to to make a Seurat object
+
+```
+seuratObject = CreateSeuratObject(cellMatrix) 
+```
+
+
+## Authors
+
+* **Marijn Berg** - m.berg@umcg.nl
+
+## License
+
+This project is licensed under the GPL-3 License - see the [LICENSE.md](LICENSE.md) file for details
+
+## Changelog
+
+### v0.1
+First fully working version of the R package
+
+### v0.2
+Fixed function to write the corrected matrix to file.
+Added readout of which genes will be corrected for and how many reads will be removed per cell
+Added some input checks to functions
+
+### v0.2
+Fixed a bug that caused FastCAR to be incompatible with biobase libraries
+Added better profiling to determine the effect of different settings on the corrections
+Swapped base R plots for ggplot2 plots
+
+
+
Author	SHA1	Message	Date
MarijnBerg	42b2d340c5	Added changes to readme	2021-11-01 16:41:25 +01:00
MarijnBerg	55b27c57a6	Swapped base R plots for ggplot	2021-11-01 14:42:49 +01:00
MarijnBerg	a620b4e5d2	Fixed bug that caused incompatibility with biobase due to having the same function name. Added new profiling functions.	2021-11-01 14:25:43 +01:00
MarijnBerg	0427dcc6ad	Removed uniused variable	2021-10-05 15:00:42 +02:00
Marijn	f4b6748210	Fixed variable name issue in remove.background function	2020-06-08 10:15:27 +02:00
Marijn	40765c3a42	updated readme and made the new function match the rest of the formatting	2020-04-27 15:50:48 +02:00
Marijn	c457e087be	Added recommended empty cutoff function	2020-04-27 13:40:04 +02:00
Marijn	48dc9d81e6	Fixed Readme	2020-03-26 15:19:45 +01:00
Marijn	92cb2a655b	Fixed a bug where base rowSums were used instead of Matrix rowSums	2020-03-26 15:10:24 +01:00
Marijn	c7a8c6a881	Updated Readme	2020-03-26 14:12:27 +01:00
Marijn	a47d8aa4c9	Updated Readme	2020-03-26 14:09:20 +01:00
Marijn	7b7fe13c75	Updated Readme	2020-03-26 12:08:24 +01:00
Marijn	e7ac549311	Updated Readme	2020-03-26 12:05:04 +01:00
Marijn	7a2607a091	Improved Readme	2020-03-26 11:51:08 +01:00
Marijn	1233c8a3be	Changed the DESCRIPTION to include dependencies	2020-03-26 10:49:56 +01:00
Marijn	c09281b51b	Made FastCAR into an actual R package, removed testing and running scripts	2020-03-26 10:32:38 +01:00