commit cd967fa8211b20b59804f60a8c6947c87dc1b402 Author: Marijn Date: Wed Mar 25 15:18:06 2020 +0100 Commiting just the actual package code diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..e9a733f --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,11 @@ +Package: FastCAR +Type: Package +Title: What the Package Does (Title Case) +Version: 0.1.0 +Author: Who wrote it +Maintainer: The package maintainer +Description: More about what it does (maybe more than one line) + Use four spaces when indenting paragraphs within the Description. +License: What license is it under? +Encoding: UTF-8 +LazyData: true diff --git a/R/FastCAR_Base.R b/R/FastCAR_Base.R new file mode 100644 index 0000000..52e80d6 --- /dev/null +++ b/R/FastCAR_Base.R @@ -0,0 +1,143 @@ +############################################################################### +# FastCAR package +# Marijn Berg +# m.berg@umcg.nl +############################################################################### +# FastCAR removes ambient RNA from 10X data by looking at the profile of the +# empty droplets and removing per gene the highest value found in the ambient +# RNA if it's found to contaminate more than a certain threshold of droplets +############################################################################### +# This file contains base functions +############################################################################### +library(Matrix) +library(Seurat) +############################################################################### + +# had to make this function to efficiently modify sparse matrices on a per gene basis +# A dgCMatrix object has the following elements that matter for this function +# i: a sequence of the row locations of each non-zero element +# x: the non-zero values in the matrix +# p: the where in 'i' and 'x' a new column starts +# Dimnames: The names of the rows and columns +remove.background = function(geneCellMatrix, ambientRNAprofile){ + for(gene in names(ambientProfile[ambientProfile > 0])){ + # Determine the locations where the gene is not zero, therefore referenced in i + iLocs = which(geneCellMatrix@Dimnames[[1]] == gene) + # Determine the location of the actual values, + xLocs = which(geneCellMatrix@i == iLocs-1) # -1 because of 0 and 1 based counting systems + # Remove the contaminating RNA + geneCellMatrix@x[xLocs] = geneCellMatrix@x[xLocs] - ambientProfile[gene] + } + # correct for instances where the expression was corrected to below zero + geneCellMatrix@x[geneCellMatrix@x < 0] = 0 + # remove the zeroes and return the corrected matrix + return(drop0(geneCellMatrix, is.Csparse = TRUE)) +} +############## + +determine.background.to.remove = function(fullCellMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff){ + + # determines the highest expression value found for every gene in the droplets that we're sure don't contain cells + backGroundMax = as.vector(rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff])) + names(backGroundMax) = rownames(fullCellMatrix) + nCell = ncol(cellMatrix) + + # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them. + nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyDropletCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2] + # rowSum on a logical statement returns the number of TRUE occurences + occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff] !=0) + + #probability of a background read of a gene ending up in a cell + probabiltyCellContaminationPerGene = occurences / nEmpty + + # if the probablity of a gene contaminating a cell is too low we set the value to zero so it doesn't get corrected + backGroundMax[probabiltyCellContaminationPerGene < contaminationChanceCutoff] = 0 + return(backGroundMax) +} + +############## + +read.cell.matrix = function(cellFolderLocation){ + cellMatrix = Read10X(cellFolderLocation) + return(cellMatrix) +} + +############## + +read.full.matrix = function(fullFolderLocation){ + fullMatrix = Read10X(fullFolderLocation) + return(fullMatrix) +} + +############## + +write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){ + dir.create(folderLocation) + + writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = "")) + + # save genes and cells names + write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = "")) + write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = "")) + + correctedSignal = correctedSignal[correctedSignal > 0] + write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE) + +} + +# describe the number of genes identified in the background +# and the number of genes failing the contaminiation chance threshold +# +describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){ + genesInBackground = vector(mode = "numeric", length = length(seq(start, stop, by))) + genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by))) + nEmptyDroplets = vector(mode = "numeric", length = length(seq(start, stop, by))) + + ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating) + rownames(ambientDescriptions) = seq(start, stop, by) + for(emptyCutoff in seq(start, stop, by)){ + nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2] + + occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0) + + #probability of a background read of a gene ending up in a cell + probabiltyCellContaminationPerGene = occurences / nEmpty + nFailingThreshold = sum(probabiltyCellContaminationPerGene > contaminationChanceCutoff) + + nGenes = sum(occurences != 0) + ambientDescriptions[as.character(emptyCutoff), c(1,2,3)] = c(nEmpty ,nGenes, nFailingThreshold) + } + return(ambientDescriptions) +} + + + +plot.ambient.profile = function(ambientProfile){ + par(mfrow = c(3,1)) + plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1], + main = "Total number of empty droplets at cutoffs", + xlab = "empty droplet UMI cutoff", + ylab = "Number of empty droplets") + + plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2], + main = "Number of genes in ambient RNA", + xlab = "empty droplet UMI cutoff", + ylab = "Genes in empty droplets") + + plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3], + main = "number of genes to correct", + xlab = "empty droplet UMI cutoff", + ylab = "Genes identified as contamination") +} + + + + + + + + + + + + diff --git a/R/FastCAR_Run_correction.R b/R/FastCAR_Run_correction.R new file mode 100644 index 0000000..11d4e3f --- /dev/null +++ b/R/FastCAR_Run_correction.R @@ -0,0 +1,41 @@ +############################################################################### +# FastCAR package +# Marijn Berg +# m.berg@umcg.nl +############################################################################### +# FastCAR removes ambient RNA from 10X data by looking at the profile of the +# empty droplets and removing per gene the highest value found in the ambient +# RNA if it's found to contaminate more than a certain threshold of droplets +############################################################################### + +# This script runs a simple correction with standard settings +library(Matrix) +library(Seurat) +library(qlcMatrix) +source("R/FastCAR_Base.R") + +emptyDropletCutoff = 100 # Droplets with fewer than this number of UMIs are considered empty +contaminationChanceCutoff = 0.05 # This is the maximum fraction of droplets containing the contamination + +cellExpressionFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/filtered_feature_bc_matrix/") +fullMatrixFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/raw_feature_bc_matrix/") + +# This folder will contain the corrected cell matrix +correctedMatrixFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/corrected_feature_bc_matrix") + +# these are literally wrappers for the Seurat functions but it's good practice in case the function changes later +cellMatrix = read.cell.matrix(cellExpressionFolder) +fullMatrix = read.full.matrix(fullMatrixFolder) + +############################################################################### +# This is an optional function that will show the effects of different cutoff values +# start, stop, and by all refer to number of UMI, it determines the background ate steps of by, from start to stop +ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix, start = 10, stop = 500, by = 10, contaminationChanceCutoff = 0.05) +plot.ambient.profile(ambProfile) +############################################################################### + +ambientProfile = determine.background.to.remove(fullMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff) + +cellMatrix = remove.background(cellMatrix, ambientProfile) + +write.corrected.matrix(cellMatrix, correctedMatrixFolder, ambientProfile) diff --git a/R/FastCAR_profiling_functions.R b/R/FastCAR_profiling_functions.R new file mode 100644 index 0000000..e725ede --- /dev/null +++ b/R/FastCAR_profiling_functions.R @@ -0,0 +1,65 @@ +############################################################################### +# FastCAR package +# Marijn Berg +# m.berg@umcg.nl +############################################################################### +# FastCAR removes ambient RNA from 10X data by looking at the profile of the +# empty droplets and removing per gene the highest value found in the ambient +# RNA if it's found to contaminate more than a certain threshold of droplets +############################################################################### +# This script contains functions to profile the Ambient RNA to suggest +# good settings to use to find genes to correct for + +############################################################################### +library(Matrix) +library(Seurat) +library(qlcMatrix) +############################################################################### + +# describe the number of genes identified in the background +# and the number of genes failing the contaminiation chance threshold +# +describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){ + genesInBackground = vector(mode = "numeric", length = length(seq(start, stop, by))) + genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by))) + nEmptyDroplets = vector(mode = "numeric", length = length(seq(start, stop, by))) + + ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating) + rownames(ambientDescriptions) = seq(start, stop, by) + for(emptyCutoff in seq(start, stop, by)){ + nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2] + + occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0) + + #probability of a background read of a gene ending up in a cell + probabiltyCellContaminationPerGene = occurences / nEmpty + nFailingThreshold = sum(probabiltyCellContaminationPerGene > contaminationChanceCutoff) + + nGenes = sum(occurences != 0) + ambientDescriptions[as.character(emptyCutoff), c(1,2,3)] = c(nEmpty ,nGenes, nFailingThreshold) + } + return(ambientDescriptions) +} + + + +plot.ambient.profile = function(ambientProfile){ + par(mfrow = c(3,1)) + plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1], + main = "Total number of empty droplets at cutoffs", + xlab = "empty droplet UMI cutoff", + ylab = "Number of empty droplets") + + plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2], + main = "Number of genes in ambient RNA", + xlab = "empty droplet UMI cutoff", + ylab = "Genes in empty droplets") + + plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3], + main = "number of genes to correct", + xlab = "empty droplet UMI cutoff", + ylab = "Genes identified as contamination") +} + + + diff --git a/R/tmp.R b/R/tmp.R new file mode 100644 index 0000000..a7551b8 --- /dev/null +++ b/R/tmp.R @@ -0,0 +1,102 @@ +cutoffEmpty = 100 # set this to NA to determine dynamicallly + +############################################################################### + +# remove this or things will keep getting added to it +rm(biopsySeurat, backGroundOverviews) + +for(folder in dataFolders[1]){ + gc() + donor = str_sub(folder, -7,-1) + sample = str_sub(folder, 19) + sample = str_sub(sample, 1, -9) + + allExpression = Read10X(paste(folder, "/raw_feature_bc_matrix", sep = "")) + colnames(allExpression) = paste(datasetInfo[sample, "prefix"], colnames(allExpression), sep = "") + # change barcode ids to match r object + + if(is.na(cutoffEmpty)){ + ########################################## + # determine the number of genes in the empty droplets at various cutoffs + + foundNumberOfGenes = vector(mode = "numeric", length = length(seq(kneePointStart, kneePointStop, kneePointSteps))) + names(foundNumberOfGenes) = as.character(seq(kneePointStart, kneePointStop, kneePointSteps)) + + for(cutoffValue in seq(kneePointStart, kneePointStop, kneePointSteps)){ + nExpressedCells = Matrix::rowSums(allExpression[, Matrix::colSums(allExpression) < cutoffValue]) + nExpressedCells[nExpressedCells > 0] = 1 + foundNumberOfGenes[as.character(cutoffValue)]= sum(nExpressedCells) + } + write.table(foundNumberOfGenes, file = paste("Foundgenes_cutoff_", donor, ".csv")) + ########################################## + # Find the point where increase in the number of genes decreases the most + + cutoffEmpty = which(diff(foundNumberOfGenes) == max(diff(foundNumberOfGenes)[find_peaks(diff(foundNumberOfGenes))])) + + # cutoffEmpty = as.numeric(names(which(diff(foundNumberOfGenes) == min(diff(foundNumberOfGenes)))))[1] # have to add this at the end, multiple changes can be the same value + + ########################################## + png(paste("Empty_droplets_content_", donor, ".png", sep = ""), height = 400, width = 800) + plot(as.numeric(names(foundNumberOfGenes)), foundNumberOfGenes, main = paste(donor, " unique genes per cutoff"), xlab = "cutoff <") + segments(cutoffEmpty, 0 ,cutoffEmpty, 25000) + dev.off() + } + + # read this in as a full matrix, haven't figgered out how to do this on a sparse matrix + cellExpression = allExpression[,cellIDs[cellIDs %in% colnames(allExpression)]] + + if(!exists("biopsySeurat")){ + biopsySeurat <<- CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy") + biopsySeurat[["orig.ident"]] = donor + biopsySeurat[["percent.mt"]] <- PercentageFeatureSet(biopsySeurat, pattern = "^MT-") + # biopsySeurat <- subset(biopsySeurat, subset = percent.mt < quantile(biopsySeurat@meta.data$percent.mt, c(.9)) ) + }else{ + tmpbiopsySeurat = CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy") + tmpbiopsySeurat[["orig.ident"]] = donor + + tmpbiopsySeurat[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeurat, pattern = "^MT-") + # tmpbiopsySeurat <- subset(tmpbiopsySeurat, subset = percent.mt < quantile(tmpbiopsySeurat@meta.data$percent.mt, c(.9), na.rm = TRUE)) + + biopsySeurat = merge(biopsySeurat, tmpbiopsySeurat) + } + + + backgroundTotal = Matrix::rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty]) + backGroundMax = as.vector(rowMax(allExpression[names(backgroundTotal),Matrix::colSums(allExpression) < cutoffEmpty])) + + nCell = ncol(cellExpression) + # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them. + nEmpty = table((Matrix::colSums(allExpression) < cutoffEmpty) &(Matrix::colSums(allExpression) > 0))[2] + occurences = rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty] !=0) + + #probability of a background read of a gene ending up in a cell + pCell = occurences / nEmpty + + write.csv(pCell, file = paste("pCell_", donor, ".csv", sep = "")) + write.csv(backGroundMax, file = paste("bgMax_", donor, ".csv", sep = "")) + + + + # set the background value for those genes that are unlikely to be a significant contaminant to 0 (zero) + backGroundMax[pCell < acceptableFractionContaminated] = 0 + + # remove the background + cellExpression = apply(cellExpression , 2, '-', backGroundMax) + cellExpression[cellExpression < 0] = 0 + # cellExpression = as.sparse(cellExpression) + + if(!exists("biopsySeuratAC")){ + biopsySeuratAC <<- CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy") + biopsySeuratAC[["orig.ident"]] = donor + biopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(biopsySeuratAC, pattern = "^MT-") + # biopsySeuratAC <- subset(biopsySeuratAC, subset = percent.mt < quantile(biopsySeuratAC@meta.data$percent.mt, c(.9)) ) + }else{ + tmpbiopsySeuratAC = CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy") + tmpbiopsySeuratAC[["orig.ident"]] = donor + + tmpbiopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeuratAC, pattern = "^MT-") + # tmpbiopsySeuratAC <- subset(tmpbiopsySeuratAC, subset = percent.mt < quantile(tmpbiopsySeuratAC@meta.data$percent.mt, c(.9), na.rm = TRUE)) + + biopsySeuratAC = merge(biopsySeuratAC, tmpbiopsySeuratAC) + } +} diff --git a/man/hello.Rd b/man/hello.Rd new file mode 100644 index 0000000..0fa7c4b --- /dev/null +++ b/man/hello.Rd @@ -0,0 +1,12 @@ +\name{hello} +\alias{hello} +\title{Hello, World!} +\usage{ +hello() +} +\description{ +Prints 'Hello, world!'. +} +\examples{ +hello() +}