Improved Readme

2020-03-26 11:50:21 +01:00
6 changed files with 86 additions and 244 deletions
--- a/.Rbuildignore
+++ b/.Rbuildignore
@ -1,2 +0,0 @@
-^.*\.Rproj$
-^\.Rproj\.user$
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +0,0 @@
-.Rproj.user
-.Rhistory
-.RData
-.Ruserdata
--- a/Images/Counts_removed.png
+++ b/Images/Counts_removed.png
--- a/Images/DE_affect_chance.png
+++ b/Images/DE_affect_chance.png
--- a/R/FastCAR_Base.R
+++ b/R/FastCAR_Base.R
@ -7,15 +7,11 @@
 # empty droplets and removing per gene the highest value found in the ambient
 # RNA if it's found to contaminate more than a certain threshold of droplets
 ###############################################################################
-# TODO
-#
+# This file contains base functions
 ###############################################################################
 library(Matrix)
 library(Seurat)
 library(qlcMatrix)
-library(pheatmap)
-library(ggplot2)
-library(gridExtra)

 ###############################################################################

@ -26,21 +22,13 @@ library(gridExtra)
 # p: the where in 'i' and 'x' a new column starts
 # Dimnames: The names of the rows and columns
 remove.background = function(geneCellMatrix, ambientRNAprofile){
-  # do some input checks first
-  if(!("dgCMatrix" %in% class(geneCellMatrix))){
-    cat("geneCellMatrix should be a sparse matrix of the \"dgCMatrix\" class")
-    stop()
-  }
-
-
-  # Here is the actual functionality
-  for(gene in names(ambientRNAprofile[ambientRNAprofile > 0])){
+  for(gene in names(ambientProfile[ambientProfile > 0])){
    # Determine the locations where the gene is not zero, therefore referenced in i
    iLocs = which(geneCellMatrix@Dimnames[[1]] == gene)
    # Determine the location of the actual values,
    xLocs = which(geneCellMatrix@i == iLocs-1) # -1 because of 0 and 1 based counting systems
    # Remove the contaminating RNA
-    geneCellMatrix@x[xLocs] = geneCellMatrix@x[xLocs] - ambientRNAprofile[gene]
+    geneCellMatrix@x[xLocs] = geneCellMatrix@x[xLocs] - ambientProfile[gene]
  }
  # correct for instances where the expression was corrected to below zero
  geneCellMatrix@x[geneCellMatrix@x < 0] = 0
@ -49,16 +37,17 @@ remove.background = function(geneCellMatrix, ambientRNAprofile){
 }
 ##############

-determine.background.to.remove = function(fullCellMatrix, emptyDropletCutoff, contaminationChanceCutoff){
+determine.background.to.remove = function(fullCellMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff){

  # determines the highest expression value found for every gene in the droplets that we're sure don't contain cells
-  backGroundMax   = as.vector(qlcMatrix::rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
+  backGroundMax   = as.vector(rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
  names(backGroundMax) = rownames(fullCellMatrix)
+  nCell = ncol(cellMatrix)

  # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
  nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyDropletCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
  # rowSum on a logical statement returns the number of TRUE occurences
-  occurences = Matrix::rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff] !=0)
+  occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff] !=0)

  #probability of a background read of a gene ending up in a cell
  probabiltyCellContaminationPerGene = occurences / nEmpty
@ -82,127 +71,36 @@ read.full.matrix = function(fullFolderLocation){
  return(fullMatrix)
 }

-###############################################################################
-getExpressionThreshold = function(gene, expMat, percentile){
-  orderedExpression = expMat[gene, order(expMat[gene,], decreasing = TRUE)]
-  CS = cumsum(orderedExpression)
-  return(orderedExpression[which(CS/max(CS) > percentile)[1]])
-}
-
-###############################################################################
-celltypeSpecificityScore = function(gene, expMat){
-  CS = cumsum(expMat[gene, order(expMat[gene,], decreasing = TRUE)])
-  return((sum(CS/max(CS))/ncol(expMat))  )
-}
-
-###############################################################################
-describe.correction.effect = function (allExpression, cellExpression, startPos, stopPos, byLength, contaminationChanceCutoff){
-
-  # Make somewhere to store all the data that needs to be returned to the user
-  ambientScoreProfileOverview = data.frame(row.names = rownames(cellExpression))
-
-  # do a quick first run to see which genes get corrected at the highest setting
-  ambientProfile = determine.background.to.remove(allExpression, cellExpression, stopPos, contaminationChanceCutoff)
-  genelist = names(ambientProfile[ambientProfile > 0])
-
-  print(paste0("Calculating cell expression score for ", length(genelist), " genes"))
-  ctsScores = vector(mode = "numeric", length = nrow(cellExpression))
-  names(ctsScores) = rownames(cellExpression)
-
-  for(gene in genelist){
-    ctsScores[gene] = celltypeSpecificityScore(gene, cellExpression)
-  }
-
-  # loop over every threshold to test
-  # Starts at the highest value so
-  for(emptyDropletCutoff in seq(from = startPos, to = stopPos, by = byLength)){
-    ambientProfile = determine.background.to.remove(allExpression, cellExpression, emptyDropletCutoff, contaminationChanceCutoff)
-
-    print(paste0("Profiling at cutoff ", emptyDropletCutoff))
-
-    ambientScoreProfile = data.frame(ambientProfile, ctsScores)
-    #ambientScoreProfile = ambientScoreProfile[ambientScoreProfile$ctsScores > 0.85, ]
-    ambientScoreProfile$stillOverAmbient = 0
-    ambientScoreProfile$belowCellexpression = 0
-
-    genesToCheck = names(ambientProfile[ambientProfile > 0])
-    if(exists("overAmbientGenes")){
-      genesToCheck = overAmbientGenes
-    }
-
-    print(paste0("Calculating profiles for ", length(genesToCheck), " genes"))
-
-    for(gene in genesToCheck){
-      expThresh = getExpressionThreshold(gene, cellExpression, 0.95)
-
-      if(emptyDropletCutoff == startPos){
-        ambientScoreProfile[gene, "belowCellexpression"] = table(cellExpression[gene,] > 0  & cellExpression[gene,] < expThresh)["TRUE"]
-      }
-      ambientScoreProfile[gene, "stillOverAmbient"] =  table(cellExpression[gene,] > ambientScoreProfile[gene, "ambientProfile"]  & cellExpression[gene,] < expThresh)["TRUE"]
-    }
-
-    ambientScoreProfile[is.na(ambientScoreProfile)] = 0
-    ambientScoreProfile$contaminationChance = ambientScoreProfile$stillOverAmbient / ambientScoreProfile$belowCellexpression
-    ambientScoreProfile[is.na(ambientScoreProfile)] = 0
-
-    # Genes that have already been completely removed don't need to be checked at higher resolution
-    overAmbientGenes = rownames(ambientScoreProfile[ambientScoreProfile$stillOverAmbient > 0,])
-
-    ambientScoreProfile[genelist,"AmbientCorrection"]
-
-    ambientScoreProfileOverview[names(ctsScores), "ctsScores"] = ctsScores
-    if(emptyDropletCutoff == startPos){
-      ambientScoreProfileOverview[rownames(ambientScoreProfile), "belowCellexpression"] = ambientScoreProfile$belowCellexpression
-    }
-    ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("stillOverAmbient", as.character(emptyDropletCutoff))] = ambientScoreProfile$stillOverAmbient
-    ambientScoreProfileOverview[rownames(ambientScoreProfile), paste0("AmbientCorrection", as.character(emptyDropletCutoff))] = ambientScoreProfile$ambientProfile
-  }
-  ambientScoreProfileOverview = ambientScoreProfileOverview[!is.na(ambientScoreProfileOverview$ctsScores),]
-  ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
-
-  ambientScoreProfileOverview[,paste0("Threshold_", seq(from = startPos, to = stopPos, by = byLength))] = ambientScoreProfileOverview[,paste0("stillOverAmbient", as.character(seq(from = startPos, to = stopPos, by = byLength)))] / ambientScoreProfileOverview$belowCellexpression
-  ambientScoreProfileOverview[is.na(ambientScoreProfileOverview)] = 0
-
-  ambientScoreProfileOverview[,paste0("contaminationChance", as.character(seq(from = startPos, to = stopPos, by = byLength)))] = ambientScoreProfileOverview[,paste0("stillOverAmbient", as.character(seq(from = startPos, to = stopPos, by = byLength)))] / ambientScoreProfileOverview$belowCellexpression
-
-  return(ambientScoreProfileOverview)
-}
-
-
 ##############
-# Turns out that cellranger output looks different from WriteMM output and Read10X can't read the latter
-# TODO
-# Commented out until I find a fix

-# write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
-#   dir.create(folderLocation)
-#
-#   writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
-#
-#   # save genes and cells names
-#   write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
-#   write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
-#
-#   correctedSignal = correctedSignal[correctedSignal > 0]
-#   write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
-#
-# }
+write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
+  dir.create(folderLocation)
+
+  writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
+
+  # save genes and cells names
+  write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
+  write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
+
+  correctedSignal = correctedSignal[correctedSignal > 0]
+  write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
+
+}

 # describe the number of genes identified in the background
-# and the number of genes failing the contamination chance threshold
+# and the number of genes failing the contaminiation chance threshold
 #
 describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){
-  cutoffValue = seq(start, stop, by)
  genesInBackground  = vector(mode = "numeric", length = length(seq(start, stop, by)))
  genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
  nEmptyDroplets     = vector(mode = "numeric", length = length(seq(start, stop, by)))

-  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating, cutoffValue)
+  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating)
  rownames(ambientDescriptions) = seq(start, stop, by)
  for(emptyCutoff in seq(start, stop, by)){
    nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]

-    occurences = Matrix::rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)
+    occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)

    #probability of a background read of a gene ending up in a cell
    probabiltyCellContaminationPerGene = occurences / nEmpty
@ -215,39 +113,23 @@ describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contam
 }


-plot.correction.effect.chance = function(correctionProfile){
-  pheatmap(correctionProfile[correctionProfile[,3] > 0, colnames(correctionProfile)[grep("contaminationChance", colnames(correctionProfile))]],
-           cluster_cols = FALSE,
-           treeheight_row = 0,
-           main = "Chance of affecting DE analyses")
-}
-
-plot.correction.effect.removal = function(correctionProfile){
-  pheatmap(correctionProfile[(correctionProfile[,3] > 0) ,colnames(correctionProfile)[grep("AmbientCorrection", colnames(correctionProfile))]],
-           cluster_cols = FALSE, treeheight_row = 0,
-           main = "Counts removed from each cell")
-}
-

 plot.ambient.profile = function(ambientProfile){
+  par(mfrow = c(3,1))
+  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
+       main = "Total number of empty droplets at cutoffs",
+       xlab = "empty droplet UMI cutoff",
+       ylab = "Number of empty droplets")

-  p1 = ggplot(ambientProfile, aes(x=cutoffValue, y=genesInBackground)) + geom_point()
+  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
+       main = "Number of genes in ambient RNA",
+       xlab = "empty droplet UMI cutoff",
+       ylab = "Genes in empty droplets")

-
-  p2= ggplot(ambientProfile, aes(x=cutoffValue, y=genesContaminating)) + geom_point()
-
-  p3 = ggplot(ambientProfile, aes(x=cutoffValue, y=nEmptyDroplets)) + geom_point()
-
-  grid.arrange(p1, p2, p3, nrow = 3)
-
-}
-
-# I noticed that the number of genes removed tends to even out over time
-# Test whether the point where this first happens is a good empty cutoff point
-recommend.empty.cutoff = function(ambientProfile){
-  highestNumberOfGenes = max(ambientProfile[,3])
-  firstOccurence = match(highestNumberOfGenes, ambientProfile[,3])
-  return(as.numeric(rownames(ambientProfile[firstOccurence,])))
+  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3],
+       main = "number of genes to correct",
+       xlab = "empty droplet UMI cutoff",
+       ylab = "Genes identified as contamination")
 }


@ -259,3 +141,5 @@ recommend.empty.cutoff = function(ambientProfile){



+
+
--- a/README.md
+++ b/README.md
@ -2,10 +2,21 @@

 FastCAR is an R package to remove ambient RNA from cells in droplet based single cell RNA sequencing data.

+## Getting Started

-### Installation
+These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system.

-FastCAR can be installed from git with the following command.
+### Prerequisites
+
+What things you need to install the software and how to install them
+
+```
+Give examples
+```
+
+### Installing
+
+FastCAR can be install from git with the following command.

 ```
 devtools::install_git("https://git.web.rug.nl/P278949/FastCAR")
@ -18,123 +29,76 @@ First load the library and dependencies.
 library(Matrix)
 library(Seurat)
 library(qlcMatrix)
-library(pheatmap)
-library(ggplot2)
-library(gridExtra)
-```
-Specify the locations of the expression matrices
+library(FastCAR)

 ```
+
+
+
+```
+
 cellExpressionFolder  = c("Cellranger_output/sample1/filtered_feature_bc_matrix/")
 fullMatrixFolder      = c("Cellranger_output/sample1/raw_feature_bc_matrix/")
+
 ```
-Load both the cell matrix and the full matrix
+
+
+
 ```
+# This folder will contain the corrected cell matrix
+correctedMatrixFolder = c("Cellranger_output/sample1/corrected_feature_bc_matrix")
+
+
 cellMatrix     = read.cell.matrix(cellExpressionFolder)
 fullMatrix     = read.full.matrix(fullMatrixFolder)
-```
-The following functions give an idea of the effect that different settings have on the ambient RNA profile. 
-These are optional as they do take a few minutes and the default settings work fine
-Plotting the number of empty droplets, the number of genes identified in the ambient RNA, and the number of genes that will be corrected for at different UMI cutoffs,

 ```
-ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix, 
-                                           start = 10, 
-                                           stop = 500, 
-                                           by = 10, 
-                                           contaminationChanceCutoff = 0.05)
-                                           
+
+The following functions give an idea of the effect that different settings have on the ambient RNA profile
+
+```
+ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix, start = 10, stop = 500, by = 10, contaminationChanceCutoff = 0.05)
 plot.ambient.profile(ambProfile)
-``` 
-![picture](Images/Example_profile.png)
-
-
-The actual effect on the chances of genes affecting your DE analyses can be determined and visualized with the following function

 ``` 
-  
-  correctionEffectProfile = describe.correction.effect(allExpression, cellExpression, 50, 500, 10, 0.05)
-  
-  plot.correction.effect.chance(correctionEffectProfile)
-  
-```
-
-![picture](Images/DE_affect_chance.png)
+![picture](https://git.web.rug.nl/P278949/FastCAR/src/branch/master/Images/Example_profile)



-
-How many reads will be removed of these genes can be visualized from the same profile
-```
-
-  plot.correction.effect.removal(correctionEffectProfile)
-
-``` 
-
-![picture](Images/Counts_removed.png)
-
-
-Set the empty droplet cutoff and the contamination chance cutoff
-
-The empty droplet cutoff is the number of UMIs a droplet can contain at the most to be considered empty.
-100 works fine but we tested this method in only one tissue. For other tissues these settings may need to be changed.
-Increasing this number also increases the highest possible value of expression of a given gene.
-As the correction will remove this value from every cell it is adviced not to set this too high and thereby overcorrect the expression in lowly expressing cells.
-
-The contamination chance cutoff is the allowed probability of a gene contaminating a cell. 
-As we developed FastCAR specifically for differential expression analyses between groups we suggest setting this such that not enough cells could be contaminated to affect this.
-In a cluster of a thousand cells divided into two groups there would be 2-3 cells per group with ambient RNA contamination of any given gene.
-Such low cell numbers are disregarded for differential expression analyses.
-
-There is an experimental function that gives a recommendation based on the ambient profiling results.
-This selects the first instance of the maximum number of genes being corrected for.
-I have no idea yet if this is actually a good idea.
+Set 

 ```
-emptyDropletCutoff = recommend.empty.cutoff(ambProfile)
-```

+emptyDropletCutoff        = 100 
+contaminationChanceCutoff = 0.05

-```
-emptyDropletCutoff        = 150 
-contaminationChanceCutoff = 0.005
 ```

-Determine the ambient RNA profile and remove the ambient RNA from each cell
 ```
 ambientProfile = determine.background.to.remove(fullMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff)
 cellMatrix     = remove.background(cellMatrix, ambientProfile)
-```

-This corrected matrix can be used to to make a Seurat object

 ```
-seuratObject = CreateSeuratObject(cellMatrix) 
+
+Finally write the corrected cell/gene matrix to a file, this matrix can be used in Seurat the same way as any other cell/gene matrix.
+
 ```

+write.corrected.matrix(cellMatrix, correctedMatrixFolder, ambientProfile)
+
+```
+
+End with an example of getting some data out of the system or using it for a little demo
+
+## Running the tests
+

 ## Authors

-* **Marijn Berg** - m.berg@umcg.nl
+* **Marijn Berg** - *Initial work* 

 ## License

 This project is licensed under the GPL-3 License - see the [LICENSE.md](LICENSE.md) file for details

-## Changelog
-
-### v0.1
-First fully working version of the R package
-
-### v0.2
-Fixed function to write the corrected matrix to file.
-Added readout of which genes will be corrected for and how many reads will be removed per cell
-Added some input checks to functions
-
-### v0.2
-Fixed a bug that caused FastCAR to be incompatible with biobase libraries
-Added better profiling to determine the effect of different settings on the corrections
-Swapped base R plots for ggplot2 plots
-
-
-