Merge https://git.web.rug.nl/P278949/FastCAR

First commit of FastCAR
Commiting just the actual package code
2020-03-25 15:27:48 +01:00 · 2020-03-25 15:18:06 +01:00
6 changed files with 374 additions and 0 deletions
--- a/11
+++ b/11
@@ -0,0 +1,11 @@
+Package: FastCAR
+Type: Package
+Title: What the Package Does (Title Case)
+Version: 0.1.0
+Author: Who wrote it
+Maintainer: The package maintainer <yourself@somewhere.net>
+Description: More about what it does (maybe more than one line)
+    Use four spaces when indenting paragraphs within the Description.
+License: What license is it under?
+Encoding: UTF-8
+LazyData: true
--- a/R/FastCAR_Base.R
+++ b/R/FastCAR_Base.R
@@ -0,0 +1,143 @@
+###############################################################################
+# FastCAR package
+# Marijn Berg
+# m.berg@umcg.nl
+###############################################################################
+# FastCAR removes ambient RNA from 10X data by looking at the profile of the
+# empty droplets and removing per gene the highest value found in the ambient
+# RNA if it's found to contaminate more than a certain threshold of droplets
+###############################################################################
+# This file contains base functions
+###############################################################################
+library(Matrix)
+library(Seurat)
+###############################################################################
+
+# had to make this function to efficiently modify sparse matrices on a per gene basis
+# A dgCMatrix object has the following elements that matter for this function
+# i: a sequence of the row locations of each non-zero element
+# x: the non-zero values in the matrix
+# p: the where in 'i' and 'x' a new column starts
+# Dimnames: The names of the rows and columns
+remove.background = function(geneCellMatrix, ambientRNAprofile){
+  for(gene in names(ambientProfile[ambientProfile > 0])){
+    # Determine the locations where the gene is not zero, therefore referenced in i
+    iLocs = which(geneCellMatrix@Dimnames[[1]] == gene)
+    # Determine the location of the actual values,
+    xLocs = which(geneCellMatrix@i == iLocs-1) # -1 because of 0 and 1 based counting systems
+    # Remove the contaminating RNA
+    geneCellMatrix@x[xLocs] = geneCellMatrix@x[xLocs] - ambientProfile[gene]
+  }
+  # correct for instances where the expression was corrected to below zero
+  geneCellMatrix@x[geneCellMatrix@x < 0] = 0
+  # remove the zeroes and return the corrected matrix
+  return(drop0(geneCellMatrix, is.Csparse = TRUE))
+}
+##############
+
+determine.background.to.remove = function(fullCellMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff){
+
+  # determines the highest expression value found for every gene in the droplets that we're sure don't contain cells
+  backGroundMax   = as.vector(rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
+  names(backGroundMax) = rownames(fullCellMatrix)
+  nCell = ncol(cellMatrix)
+
+  # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
+  nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyDropletCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
+  # rowSum on a logical statement returns the number of TRUE occurences
+  occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff] !=0)
+
+  #probability of a background read of a gene ending up in a cell
+  probabiltyCellContaminationPerGene = occurences / nEmpty
+
+  # if the probablity of a gene contaminating a cell is too low we set the value to zero so it doesn't get corrected
+  backGroundMax[probabiltyCellContaminationPerGene < contaminationChanceCutoff] = 0
+  return(backGroundMax)
+}
+
+##############
+
+read.cell.matrix = function(cellFolderLocation){
+  cellMatrix = Read10X(cellFolderLocation)
+  return(cellMatrix)
+}
+
+##############
+
+read.full.matrix = function(fullFolderLocation){
+  fullMatrix = Read10X(fullFolderLocation)
+  return(fullMatrix)
+}
+
+##############
+
+write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
+  dir.create(folderLocation)
+
+  writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
+
+  # save genes and cells names
+  write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
+  write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
+
+  correctedSignal = correctedSignal[correctedSignal > 0]
+  write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
+
+}
+
+# describe the number of genes identified in the background
+# and the number of genes failing the contaminiation chance threshold
+#
+describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){
+  genesInBackground  = vector(mode = "numeric", length = length(seq(start, stop, by)))
+  genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
+  nEmptyDroplets     = vector(mode = "numeric", length = length(seq(start, stop, by)))
+
+  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating)
+  rownames(ambientDescriptions) = seq(start, stop, by)
+  for(emptyCutoff in seq(start, stop, by)){
+    nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
+
+    occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)
+
+    #probability of a background read of a gene ending up in a cell
+    probabiltyCellContaminationPerGene = occurences / nEmpty
+    nFailingThreshold = sum(probabiltyCellContaminationPerGene > contaminationChanceCutoff)
+
+    nGenes = sum(occurences != 0)
+    ambientDescriptions[as.character(emptyCutoff), c(1,2,3)] = c(nEmpty ,nGenes, nFailingThreshold)
+  }
+  return(ambientDescriptions)
+}
+
+
+
+plot.ambient.profile = function(ambientProfile){
+  par(mfrow = c(3,1))
+  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
+       main = "Total number of empty droplets at cutoffs",
+       xlab = "empty droplet UMI cutoff",
+       ylab = "Number of empty droplets")
+
+  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
+       main = "Number of genes in ambient RNA",
+       xlab = "empty droplet UMI cutoff",
+       ylab = "Genes in empty droplets")
+
+  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3],
+       main = "number of genes to correct",
+       xlab = "empty droplet UMI cutoff",
+       ylab = "Genes identified as contamination")
+}
+
+
+
+
+
+
+
+
+
+
+
+
--- a/R/FastCAR_Run_correction.R
+++ b/R/FastCAR_Run_correction.R
@@ -0,0 +1,41 @@
+###############################################################################
+# FastCAR package
+# Marijn Berg
+# m.berg@umcg.nl
+###############################################################################
+# FastCAR removes ambient RNA from 10X data by looking at the profile of the
+# empty droplets and removing per gene the highest value found in the ambient
+# RNA if it's found to contaminate more than a certain threshold of droplets
+###############################################################################
+
+# This script runs a simple correction with standard settings
+library(Matrix)
+library(Seurat)
+library(qlcMatrix)
+source("R/FastCAR_Base.R")
+
+emptyDropletCutoff        = 100  # Droplets with fewer than this number of UMIs are considered empty
+contaminationChanceCutoff = 0.05 # This is the maximum fraction of droplets containing the contamination
+
+cellExpressionFolder  = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/filtered_feature_bc_matrix/")
+fullMatrixFolder      = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/raw_feature_bc_matrix/")
+
+# This folder will contain the corrected cell matrix
+correctedMatrixFolder = c("/home/marijn/Analysis_Drive/R_Projects/scRNA_Background_correction/Cellranger_output/4951STDY7487591_ARMS054/corrected_feature_bc_matrix")
+
+# these are literally wrappers for the Seurat functions but it's good practice in case the function changes later
+cellMatrix     = read.cell.matrix(cellExpressionFolder)
+fullMatrix     = read.full.matrix(fullMatrixFolder)
+
+###############################################################################
+# This is an optional function that will show the effects of different cutoff values
+# start, stop, and by all refer to number of UMI, it determines the background ate steps of by, from start to stop
+ambProfile = describe.ambient.RNA.sequence(fullCellMatrix = fullMatrix, start = 10, stop = 500, by = 10, contaminationChanceCutoff = 0.05)
+plot.ambient.profile(ambProfile)
+###############################################################################
+
+ambientProfile = determine.background.to.remove(fullMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff)
+
+cellMatrix     = remove.background(cellMatrix, ambientProfile)
+
+write.corrected.matrix(cellMatrix, correctedMatrixFolder, ambientProfile)
--- a/R/FastCAR_profiling_functions.R
+++ b/R/FastCAR_profiling_functions.R
@@ -0,0 +1,65 @@
+###############################################################################
+# FastCAR package
+# Marijn Berg
+# m.berg@umcg.nl
+###############################################################################
+# FastCAR removes ambient RNA from 10X data by looking at the profile of the
+# empty droplets and removing per gene the highest value found in the ambient
+# RNA if it's found to contaminate more than a certain threshold of droplets
+###############################################################################
+# This script contains functions to profile the Ambient RNA to suggest
+# good settings to use to find genes to correct for
+
+###############################################################################
+library(Matrix)
+library(Seurat)
+library(qlcMatrix)
+###############################################################################
+
+# describe the number of genes identified in the background
+# and the number of genes failing the contaminiation chance threshold
+#
+describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){
+  genesInBackground  = vector(mode = "numeric", length = length(seq(start, stop, by)))
+  genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
+  nEmptyDroplets     = vector(mode = "numeric", length = length(seq(start, stop, by)))
+
+  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating)
+  rownames(ambientDescriptions) = seq(start, stop, by)
+  for(emptyCutoff in seq(start, stop, by)){
+    nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
+
+    occurences = rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)
+
+    #probability of a background read of a gene ending up in a cell
+    probabiltyCellContaminationPerGene = occurences / nEmpty
+    nFailingThreshold = sum(probabiltyCellContaminationPerGene > contaminationChanceCutoff)
+
+    nGenes = sum(occurences != 0)
+    ambientDescriptions[as.character(emptyCutoff), c(1,2,3)] = c(nEmpty ,nGenes, nFailingThreshold)
+  }
+  return(ambientDescriptions)
+}
+
+
+
+plot.ambient.profile = function(ambientProfile){
+  par(mfrow = c(3,1))
+  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
+       main = "Total number of empty droplets at cutoffs",
+       xlab = "empty droplet UMI cutoff",
+       ylab = "Number of empty droplets")
+
+  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
+       main = "Number of genes in ambient RNA",
+       xlab = "empty droplet UMI cutoff",
+       ylab = "Genes in empty droplets")
+
+  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3],
+       main = "number of genes to correct",
+       xlab = "empty droplet UMI cutoff",
+       ylab = "Genes identified as contamination")
+}
+
+
+
--- a/R/tmp.R
+++ b/R/tmp.R
@@ -0,0 +1,102 @@
+cutoffEmpty = 100   # set this to NA to determine dynamicallly
+
+###############################################################################
+
+# remove this or things will keep getting added to it
+rm(biopsySeurat, backGroundOverviews)
+
+for(folder in dataFolders[1]){
+  gc()
+  donor  = str_sub(folder, -7,-1)
+  sample = str_sub(folder, 19)
+  sample = str_sub(sample, 1, -9)
+
+  allExpression  = Read10X(paste(folder, "/raw_feature_bc_matrix", sep = ""))
+  colnames(allExpression) = paste(datasetInfo[sample, "prefix"], colnames(allExpression), sep = "")
+  # change barcode ids to match r object
+
+  if(is.na(cutoffEmpty)){
+    ##########################################
+    # determine the number of genes in the empty droplets at various cutoffs
+
+    foundNumberOfGenes = vector(mode = "numeric", length = length(seq(kneePointStart, kneePointStop, kneePointSteps)))
+    names(foundNumberOfGenes) = as.character(seq(kneePointStart, kneePointStop, kneePointSteps))
+
+    for(cutoffValue in seq(kneePointStart, kneePointStop, kneePointSteps)){
+      nExpressedCells = Matrix::rowSums(allExpression[, Matrix::colSums(allExpression) < cutoffValue])
+      nExpressedCells[nExpressedCells > 0] = 1
+      foundNumberOfGenes[as.character(cutoffValue)]= sum(nExpressedCells)
+    }
+    write.table(foundNumberOfGenes, file = paste("Foundgenes_cutoff_", donor, ".csv"))
+    ##########################################
+    # Find the point where increase in the number of genes decreases the most
+
+    cutoffEmpty = which(diff(foundNumberOfGenes) == max(diff(foundNumberOfGenes)[find_peaks(diff(foundNumberOfGenes))]))
+
+    # cutoffEmpty = as.numeric(names(which(diff(foundNumberOfGenes) == min(diff(foundNumberOfGenes)))))[1] # have to add this at the end, multiple changes can be the same value
+
+    ##########################################
+    png(paste("Empty_droplets_content_", donor, ".png", sep = ""), height = 400, width = 800)
+    plot(as.numeric(names(foundNumberOfGenes)), foundNumberOfGenes,   main = paste(donor, " unique genes per cutoff"), xlab = "cutoff <")
+    segments(cutoffEmpty, 0 ,cutoffEmpty, 25000)
+    dev.off()
+  }
+
+  # read this in as a full matrix, haven't figgered out how to do this on a sparse matrix
+  cellExpression = allExpression[,cellIDs[cellIDs %in% colnames(allExpression)]]
+
+  if(!exists("biopsySeurat")){
+    biopsySeurat <<- CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy")
+    biopsySeurat[["orig.ident"]] = donor
+    biopsySeurat[["percent.mt"]] <- PercentageFeatureSet(biopsySeurat, pattern = "^MT-")
+    # biopsySeurat <- subset(biopsySeurat, subset = percent.mt < quantile(biopsySeurat@meta.data$percent.mt, c(.9)) )
+  }else{
+    tmpbiopsySeurat = CreateSeuratObject(cellExpression, project = "Bronchial_Biopsy")
+    tmpbiopsySeurat[["orig.ident"]] = donor
+
+    tmpbiopsySeurat[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeurat, pattern = "^MT-")
+    # tmpbiopsySeurat <- subset(tmpbiopsySeurat, subset = percent.mt < quantile(tmpbiopsySeurat@meta.data$percent.mt, c(.9), na.rm = TRUE))
+
+    biopsySeurat = merge(biopsySeurat, tmpbiopsySeurat)
+  }
+
+
+  backgroundTotal  = Matrix::rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty])
+  backGroundMax   = as.vector(rowMax(allExpression[names(backgroundTotal),Matrix::colSums(allExpression) < cutoffEmpty]))
+
+  nCell = ncol(cellExpression)
+  # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
+  nEmpty = table((Matrix::colSums(allExpression) < cutoffEmpty) &(Matrix::colSums(allExpression) > 0))[2]
+  occurences = rowSums(allExpression[,Matrix::colSums(allExpression) < cutoffEmpty] !=0)
+
+  #probability of a background read of a gene ending up in a cell
+  pCell = occurences / nEmpty
+
+  write.csv(pCell, file = paste("pCell_", donor, ".csv", sep = ""))
+  write.csv(backGroundMax, file = paste("bgMax_", donor, ".csv", sep = ""))
+
+
+
+  # set the background value for those genes that are unlikely to be a significant contaminant to 0 (zero)
+  backGroundMax[pCell < acceptableFractionContaminated] = 0
+
+  # remove the background
+  cellExpression = apply(cellExpression , 2, '-', backGroundMax)
+  cellExpression[cellExpression < 0] = 0
+  # cellExpression = as.sparse(cellExpression)
+
+  if(!exists("biopsySeuratAC")){
+    biopsySeuratAC <<- CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy")
+    biopsySeuratAC[["orig.ident"]] = donor
+    biopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(biopsySeuratAC, pattern = "^MT-")
+    # biopsySeuratAC <- subset(biopsySeuratAC, subset = percent.mt < quantile(biopsySeuratAC@meta.data$percent.mt, c(.9)) )
+  }else{
+    tmpbiopsySeuratAC = CreateSeuratObject(cellExpression, project = "AC_Bronchial_Biopsy")
+    tmpbiopsySeuratAC[["orig.ident"]] = donor
+
+    tmpbiopsySeuratAC[["percent.mt"]] <- PercentageFeatureSet(tmpbiopsySeuratAC, pattern = "^MT-")
+    # tmpbiopsySeuratAC <- subset(tmpbiopsySeuratAC, subset = percent.mt < quantile(tmpbiopsySeuratAC@meta.data$percent.mt, c(.9), na.rm = TRUE))
+
+    biopsySeuratAC = merge(biopsySeuratAC, tmpbiopsySeuratAC)
+  }
+}
--- a/man/hello.Rd
+++ b/man/hello.Rd
@@ -0,0 +1,12 @@
+\name{hello}
+\alias{hello}
+\title{Hello, World!}
+\usage{
+hello()
+}
+\description{
+Prints 'Hello, world!'.
+}
+\examples{
+hello()
+}
Author	SHA1	Message	Date
Marijn	c3d24a4302	Merge https://git.web.rug.nl/P278949/FastCAR First commit of FastCAR	2020-03-25 15:27:48 +01:00
Marijn	cd967fa821	Commiting just the actual package code	2020-03-25 15:18:06 +01:00