From c09281b51bae6bc7358d343320eadedf942940c9 Mon Sep 17 00:00:00 2001 From: Marijn Date: Thu, 26 Mar 2020 10:32:38 +0100 Subject: [PATCH] Made FastCAR into an actual R package, removed testing and running scripts --- FastCAR | 1 + R/FastCAR_Base.R | 1 + R/FastCAR_Run_correction.R | 41 ------------- R/FastCAR_profiling_functions.R | 65 -------------------- R/tmp.R | 102 -------------------------------- 5 files changed, 2 insertions(+), 208 deletions(-) create mode 160000 FastCAR delete mode 100644 R/FastCAR_Run_correction.R delete mode 100644 R/FastCAR_profiling_functions.R delete mode 100644 R/tmp.R diff --git a/FastCAR b/FastCAR new file mode 160000 index 0000000..2121cbd --- /dev/null +++ b/FastCAR @@ -0,0 +1 @@ +Subproject commit 2121cbd359630af53c002dd3e082621010111346 diff --git a/R/FastCAR_Base.R b/R/FastCAR_Base.R index 52e80d6..dda6b74 100644 --- a/R/FastCAR_Base.R +++ b/R/FastCAR_Base.R @@ -11,6 +11,7 @@ ############################################################################### library(Matrix) library(Seurat) +library(qlcMatrix) ############################################################################### # had to make this function to efficiently modify sparse matrices on a per gene basis diff --git a/R/FastCAR_Run_correction.R b/R/FastCAR_Run_correction.R deleted file mode 100644 index 11d4e3f..0000000 --- a/R/FastCAR_Run_correction.R +++ /dev/null @@ -1,41 +0,0 @@ -############################################################################### -# FastCAR package -# Marijn Berg -# m.berg@umcg.nl -############################################################################### -# FastCAR removes ambient RNA from 10X data by looking at the profile of the -# empty droplets and removing per gene the highest value found in the ambient -# RNA if it's found to contaminate more than a certain threshold of droplets -############################################################################### - -# This script runs a simple correction with standard settings -library(Matrix) -library(Seurat) -library(qlcMatrix) -source("R/FastCAR_Base.R") - -emptyDropletCutoff = 100 # Droplets with fewer than this number of UMIs are considered empty -contaminationChanceCutoff = 0.05 # This is the maximum fraction of droplets containing the contamination - -cellExpressionFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/filtered_feature_bc_matrix/") -fullMatrixFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/raw_feature_bc_matrix/") - -# This folder will contain the corrected cell matrix -correctedMatrixFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/corrected_feature_bc_matrix") - -# these are literally wrappers for the Seurat functions but it's good practice in case the function changes later -cellMatrix = read.cell.matrix(cellExpressionFolder) -fullMatrix = read.full.matrix(fullMatrixFolder) - -############################################################################### -# This is an optional function that will show the effects of different cutoff values -# start, stop, and by all refer to number of UMI, it determines the background ate steps of by, from start to stop -ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix, start = 10, stop = 500, by = 10, contaminationChanceCutoff = 0.05) -plot.ambient.profile(ambProfile) -############################################################################### - -ambientProfile = determine.background.to.remove(fullMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff) - -cellMatrix = remove.background(cellMatrix, ambientProfile) - -write.corrected.matrix(cellMatrix, correctedMatrixFolder, ambientProfile) diff --git a/R/FastCAR_profiling_functions.R b/R/FastCAR_profiling_functions.R deleted file mode 100644 index e725ede..0000000 --- a/R/FastCAR_profiling_functions.R +++ /dev/null @@ -1,65 +0,0 @@ -############################################################################### -# FastCAR package -# Marijn Berg -# m.berg@umcg.nl -############################################################################### -# FastCAR removes ambient RNA from 10X data by looking at the profile of the -# empty droplets and removing per gene the highest value found in the ambient -# RNA if it's found to contaminate more than a certain threshold of droplets -############################################################################### -# This script contains functions to profile the Ambient RNA to suggest -# good settings to use to find genes to correct for - -############################################################################### -library(Matrix) -library(Seurat) -library(qlcMatrix) -############################################################################### - -# describe the number of genes identified in the background -# and the number of genes failing the contaminiation chance threshold -# -describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){ - genesInBackground = vector(mode = "numeric", length = length(seq(start, stop, by))) - genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by))) - nEmptyDroplets = vector(mode = "numeric", length = length(seq(start, stop, by))) - - ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating) - rownames(ambientDescriptions) = seq(start, stop, by) - for(emptyCutoff in seq(start, stop, by)){ - nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2] - - occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0) - - #probability of a background read of a gene ending up in a cell - probabiltyCellContaminationPerGene = occurences / nEmpty - nFailingThreshold = sum(probabiltyCellContaminationPerGene > contaminationChanceCutoff) - - nGenes = sum(occurences != 0) - ambientDescriptions[as.character(emptyCutoff), c(1,2,3)] = c(nEmpty ,nGenes, nFailingThreshold) - } - return(ambientDescriptions) -} - - - -plot.ambient.profile = function(ambientProfile){ - par(mfrow = c(3,1)) - plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1], - main = "Total number of empty droplets at cutoffs", - xlab = "empty droplet UMI cutoff", - ylab = "Number of empty droplets") - - plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2], - main = "Number of genes in ambient RNA", - xlab = "empty droplet UMI cutoff", - ylab = "Genes in empty droplets") - - plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3], - main = "number of genes to correct", - xlab = "empty droplet UMI cutoff", - ylab = "Genes identified as contamination") -} - - - diff --git a/R/tmp.R b/R/tmp.R deleted file mode 100644 index a7551b8..0000000 --- a/R/tmp.R +++ /dev/null @@ -1,102 +0,0 @@ -cutoffEmpty = 100 # set this to NA to determine dynamicallly - -############################################################################### - -# remove this or things will keep getting added to it -rm(biopsySeurat, backGroundOverviews) - -for(folder in dataFolders[1]){ - gc() - donor = str_sub(folder, -7,-1) - sample = str_sub(folder, 19) - sample = str_sub(sample, 1, -9) - - allExpression = Read10X(paste(folder, "/raw_feature_bc_matrix", sep = "")) - colnames(allExpression) = paste(datasetInfo[sample, "prefix"], colnames(allExpression), sep = "") - # change barcode ids to match r object - - if(is.na(cutoffEmpty)){ - ########################################## - # determine the number of genes in the empty droplets at various cutoffs - - foundNumberOfGenes = vector(mode = "numeric", length = length(seq(kneePointStart, kneePointStop, kneePointSteps))) - names(foundNumberOfGenes) = as.character(seq(kneePointStart, kneePointStop, kneePointSteps)) - - for(cutoffValue in seq(kneePointStart, kneePointStop, kneePointSteps)){ - nExpressedCells = Matrix::rowSums(allExpression[, Matrix::colSums(allExpression) < cutoffValue]) - nExpressedCells[nExpressedCells > 0] = 1 - foundNumberOfGenes[as.character(cutoffValue)]= sum(nExpressedCells) - } - write.table(foundNumberOfGenes, file = paste("Foundgenes_cutoff_", donor, ".csv")) - ########################################## - # Find the point where increase in the number of genes decreases the most - - cutoffEmpty = which(diff(foundNumberOfGenes) == max(diff(foundNumberOfGenes)[find_peaks(diff(foundNumberOfGenes))])) - - # cutoffEmpty = as.numeric(names(which(diff(foundNumberOfGenes) == min(diff(foundNumberOfGenes)))))[1] # have to add this at the end, multiple changes can be the same value - - ########################################## - png(paste("Empty_droplets_content_", donor, ".png", sep = ""), height = 400, width = 800) - plot(as.numeric(names(foundNumberOfGenes)), foundNumberOfGenes, main = paste(donor, " unique genes per cutoff"), xlab = "cutoff <") - segments(cutoffEmpty, 0 ,cutoffEmpty, 25000) - dev.off() - } - - # read this in as a full matrix, haven't figgered out how to do this on a sparse matrix - cellExpression = allExpression[,cellIDs[cellIDs %in% colnames(allExpression)]] - - if(!exists("biopsySeurat")){ - biopsySeurat <<- CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy") - biopsySeurat[["orig.ident"]] = donor - biopsySeurat[["percent.mt"]] <- PercentageFeatureSet(biopsySeurat, pattern = "^MT-") - # biopsySeurat <- subset(biopsySeurat, subset = percent.mt < quantile(biopsySeurat@meta.data$percent.mt, c(.9)) ) - }else{ - tmpbiopsySeurat = CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy") - tmpbiopsySeurat[["orig.ident"]] = donor - - tmpbiopsySeurat[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeurat, pattern = "^MT-") - # tmpbiopsySeurat <- subset(tmpbiopsySeurat, subset = percent.mt < quantile(tmpbiopsySeurat@meta.data$percent.mt, c(.9), na.rm = TRUE)) - - biopsySeurat = merge(biopsySeurat, tmpbiopsySeurat) - } - - - backgroundTotal = Matrix::rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty]) - backGroundMax = as.vector(rowMax(allExpression[names(backgroundTotal),Matrix::colSums(allExpression) < cutoffEmpty])) - - nCell = ncol(cellExpression) - # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them. - nEmpty = table((Matrix::colSums(allExpression) < cutoffEmpty) &(Matrix::colSums(allExpression) > 0))[2] - occurences = rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty] !=0) - - #probability of a background read of a gene ending up in a cell - pCell = occurences / nEmpty - - write.csv(pCell, file = paste("pCell_", donor, ".csv", sep = "")) - write.csv(backGroundMax, file = paste("bgMax_", donor, ".csv", sep = "")) - - - - # set the background value for those genes that are unlikely to be a significant contaminant to 0 (zero) - backGroundMax[pCell < acceptableFractionContaminated] = 0 - - # remove the background - cellExpression = apply(cellExpression , 2, '-', backGroundMax) - cellExpression[cellExpression < 0] = 0 - # cellExpression = as.sparse(cellExpression) - - if(!exists("biopsySeuratAC")){ - biopsySeuratAC <<- CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy") - biopsySeuratAC[["orig.ident"]] = donor - biopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(biopsySeuratAC, pattern = "^MT-") - # biopsySeuratAC <- subset(biopsySeuratAC, subset = percent.mt < quantile(biopsySeuratAC@meta.data$percent.mt, c(.9)) ) - }else{ - tmpbiopsySeuratAC = CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy") - tmpbiopsySeuratAC[["orig.ident"]] = donor - - tmpbiopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeuratAC, pattern = "^MT-") - # tmpbiopsySeuratAC <- subset(tmpbiopsySeuratAC, subset = percent.mt < quantile(tmpbiopsySeuratAC@meta.data$percent.mt, c(.9), na.rm = TRUE)) - - biopsySeuratAC = merge(biopsySeuratAC, tmpbiopsySeuratAC) - } -}