Made FastCAR into an actual R package, removed testing and running scripts

2020-03-26 10:32:38 +01:00
parent c3d24a4302
commit c09281b51b
5 changed files with 2 additions and 208 deletions
--- a/1
+++ b/1
--- a/R/FastCAR_Base.R
+++ b/R/FastCAR_Base.R
@@ -11,6 +11,7 @@
 ###############################################################################
 library(Matrix)
 library(Seurat)
+library(qlcMatrix)
 ###############################################################################

 # had to make this function to efficiently modify sparse matrices on a per gene basis
--- a/R/FastCAR_Run_correction.R
+++ b/R/FastCAR_Run_correction.R
@@ -1,41 +0,0 @@
-###############################################################################
-# FastCAR package
-# Marijn Berg
-# m.berg@umcg.nl
-###############################################################################
-# FastCAR removes ambient RNA from 10X data by looking at the profile of the
-# empty droplets and removing per gene the highest value found in the ambient
-# RNA if it's found to contaminate more than a certain threshold of droplets
-###############################################################################
-
-# This script runs a simple correction with standard settings
-library(Matrix)
-library(Seurat)
-library(qlcMatrix)
-source("R/FastCAR_Base.R")
-
-emptyDropletCutoff        = 100  # Droplets with fewer than this number of UMIs are considered empty
-contaminationChanceCutoff = 0.05 # This is the maximum fraction of droplets containing the contamination
-
-cellExpressionFolder  = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/filtered_feature_bc_matrix/")
-fullMatrixFolder      = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/raw_feature_bc_matrix/")
-
-# This folder will contain the corrected cell matrix
-correctedMatrixFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/corrected_feature_bc_matrix")
-
-# these are literally wrappers for the Seurat functions but it's good practice in case the function changes later
-cellMatrix     = read.cell.matrix(cellExpressionFolder)
-fullMatrix     = read.full.matrix(fullMatrixFolder)
-
-###############################################################################
-# This is an optional function that will show the effects of different cutoff values
-# start, stop, and by all refer to number of UMI, it determines the background ate steps of by, from start to stop
-ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix, start = 10, stop = 500, by = 10, contaminationChanceCutoff = 0.05)
-plot.ambient.profile(ambProfile)
-###############################################################################
-
-ambientProfile = determine.background.to.remove(fullMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff)
-
-cellMatrix     = remove.background(cellMatrix, ambientProfile)
-
-write.corrected.matrix(cellMatrix, correctedMatrixFolder, ambientProfile)
--- a/R/FastCAR_profiling_functions.R
+++ b/R/FastCAR_profiling_functions.R
@@ -1,65 +0,0 @@
-###############################################################################
-# FastCAR package
-# Marijn Berg
-# m.berg@umcg.nl
-###############################################################################
-# FastCAR removes ambient RNA from 10X data by looking at the profile of the
-# empty droplets and removing per gene the highest value found in the ambient
-# RNA if it's found to contaminate more than a certain threshold of droplets
-###############################################################################
-# This script contains functions to profile the Ambient RNA to suggest
-# good settings to use to find genes to correct for
-
-###############################################################################
-library(Matrix)
-library(Seurat)
-library(qlcMatrix)
-###############################################################################
-
-# describe the number of genes identified in the background
-# and the number of genes failing the contaminiation chance threshold
-#
-describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){
-  genesInBackground  = vector(mode = "numeric", length = length(seq(start, stop, by)))
-  genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
-  nEmptyDroplets     = vector(mode = "numeric", length = length(seq(start, stop, by)))
-
-  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating)
-  rownames(ambientDescriptions) = seq(start, stop, by)
-  for(emptyCutoff in seq(start, stop, by)){
-    nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
-
-    occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)
-
-    #probability of a background read of a gene ending up in a cell
-    probabiltyCellContaminationPerGene = occurences / nEmpty
-    nFailingThreshold = sum(probabiltyCellContaminationPerGene > contaminationChanceCutoff)
-
-    nGenes = sum(occurences != 0)
-    ambientDescriptions[as.character(emptyCutoff), c(1,2,3)] = c(nEmpty ,nGenes, nFailingThreshold)
-  }
-  return(ambientDescriptions)
-}
-
-
-
-plot.ambient.profile = function(ambientProfile){
-  par(mfrow = c(3,1))
-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
-       main = "Total number of empty droplets at cutoffs",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Number of empty droplets")
-
-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
-       main = "Number of genes in ambient RNA",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Genes in empty droplets")
-
-  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3],
-       main = "number of genes to correct",
-       xlab = "empty droplet UMI cutoff",
-       ylab = "Genes identified as contamination")
-}
-
-
-
--- a/R/tmp.R
+++ b/R/tmp.R
@@ -1,102 +0,0 @@
-cutoffEmpty = 100   # set this to NA to determine dynamicallly
-
-###############################################################################
-
-# remove this or things will keep getting added to it
-rm(biopsySeurat, backGroundOverviews)
-
-for(folder in dataFolders[1]){
-  gc()
-  donor  = str_sub(folder, -7,-1)
-  sample = str_sub(folder, 19)
-  sample = str_sub(sample, 1, -9)
-
-  allExpression  = Read10X(paste(folder, "/raw_feature_bc_matrix", sep = ""))
-  colnames(allExpression) = paste(datasetInfo[sample, "prefix"], colnames(allExpression), sep = "")
-  # change barcode ids to match r object
-
-  if(is.na(cutoffEmpty)){
-    ##########################################
-    # determine the number of genes in the empty droplets at various cutoffs
-
-    foundNumberOfGenes = vector(mode = "numeric", length = length(seq(kneePointStart, kneePointStop, kneePointSteps)))
-    names(foundNumberOfGenes) = as.character(seq(kneePointStart, kneePointStop, kneePointSteps))
-
-    for(cutoffValue in seq(kneePointStart, kneePointStop, kneePointSteps)){
-      nExpressedCells = Matrix::rowSums(allExpression[, Matrix::colSums(allExpression) < cutoffValue])
-      nExpressedCells[nExpressedCells > 0] = 1
-      foundNumberOfGenes[as.character(cutoffValue)]= sum(nExpressedCells)
-    }
-    write.table(foundNumberOfGenes, file = paste("Foundgenes_cutoff_", donor, ".csv"))
-    ##########################################
-    # Find the point where increase in the number of genes decreases the most
-
-    cutoffEmpty = which(diff(foundNumberOfGenes) == max(diff(foundNumberOfGenes)[find_peaks(diff(foundNumberOfGenes))]))
-
-    # cutoffEmpty = as.numeric(names(which(diff(foundNumberOfGenes) == min(diff(foundNumberOfGenes)))))[1] # have to add this at the end, multiple changes can be the same value
-
-    ##########################################
-    png(paste("Empty_droplets_content_", donor, ".png", sep = ""), height = 400, width = 800)
-    plot(as.numeric(names(foundNumberOfGenes)), foundNumberOfGenes,   main = paste(donor, " unique genes per cutoff"), xlab = "cutoff <")
-    segments(cutoffEmpty, 0 ,cutoffEmpty, 25000)
-    dev.off()
-  }
-
-  # read this in as a full matrix, haven't figgered out how to do this on a sparse matrix
-  cellExpression = allExpression[,cellIDs[cellIDs %in% colnames(allExpression)]]
-
-  if(!exists("biopsySeurat")){
-    biopsySeurat <<- CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy")
-    biopsySeurat[["orig.ident"]] = donor
-    biopsySeurat[["percent.mt"]] <- PercentageFeatureSet(biopsySeurat, pattern = "^MT-")
-    # biopsySeurat <- subset(biopsySeurat, subset = percent.mt < quantile(biopsySeurat@meta.data$percent.mt, c(.9)) )
-  }else{
-    tmpbiopsySeurat = CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy")
-    tmpbiopsySeurat[["orig.ident"]] = donor
-
-    tmpbiopsySeurat[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeurat, pattern = "^MT-")
-    # tmpbiopsySeurat <- subset(tmpbiopsySeurat, subset = percent.mt < quantile(tmpbiopsySeurat@meta.data$percent.mt, c(.9), na.rm = TRUE))
-
-    biopsySeurat = merge(biopsySeurat, tmpbiopsySeurat)
-  }
-
-
-  backgroundTotal  = Matrix::rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty])
-  backGroundMax   = as.vector(rowMax(allExpression[names(backgroundTotal),Matrix::colSums(allExpression) < cutoffEmpty]))
-
-  nCell = ncol(cellExpression)
-  # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
-  nEmpty = table((Matrix::colSums(allExpression) < cutoffEmpty) &(Matrix::colSums(allExpression) > 0))[2]
-  occurences = rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty] !=0)
-
-  #probability of a background read of a gene ending up in a cell
-  pCell = occurences / nEmpty
-
-  write.csv(pCell, file = paste("pCell_", donor, ".csv", sep = ""))
-  write.csv(backGroundMax, file = paste("bgMax_", donor, ".csv", sep = ""))
-
-
-
-  # set the background value for those genes that are unlikely to be a significant contaminant to 0 (zero)
-  backGroundMax[pCell < acceptableFractionContaminated] = 0
-
-  # remove the background
-  cellExpression = apply(cellExpression , 2, '-', backGroundMax)
-  cellExpression[cellExpression < 0] = 0
-  # cellExpression = as.sparse(cellExpression)
-
-  if(!exists("biopsySeuratAC")){
-    biopsySeuratAC <<- CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy")
-    biopsySeuratAC[["orig.ident"]] = donor
-    biopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(biopsySeuratAC, pattern = "^MT-")
-    # biopsySeuratAC <- subset(biopsySeuratAC, subset = percent.mt < quantile(biopsySeuratAC@meta.data$percent.mt, c(.9)) )
-  }else{
-    tmpbiopsySeuratAC = CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy")
-    tmpbiopsySeuratAC[["orig.ident"]] = donor
-
-    tmpbiopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeuratAC, pattern = "^MT-")
-    # tmpbiopsySeuratAC <- subset(tmpbiopsySeuratAC, subset = percent.mt < quantile(tmpbiopsySeuratAC@meta.data$percent.mt, c(.9), na.rm = TRUE))
-
-    biopsySeuratAC = merge(biopsySeuratAC, tmpbiopsySeuratAC)
-  }
-}