# system_genetics/rnaseq/step6_overall_QC/__ - Preloader.R
#
# 190 lines
# 4.5 KiB
# R

library(tidyverse)
library(ggfortify)
library(ggprism)
library(limma)
library(biomaRt)
library(PCAtools)
library(gtools)
library(edgeR)
library(ggprism)
library(foreign)
# Global variables
# Directory layout used throughout the script. All paths are relative to the
# current working directory.
results.dir <- file.path("results.nosync", "RNA-Seq QC")
data.dir <- "Data"
patient.dir <- file.path(data.dir, "Patients")
sample.dir <- file.path(data.dir, "Samples")
expression.dir <- file.path(data.dir, "mRNA - RNA-Seq")

# Only create the output directory when it is missing: dir.create() emits an
# "already exists" warning on every re-run otherwise. Genuine failures
# (e.g. permissions) still surface as warnings.
if (!dir.exists(results.dir)) {
  dir.create(results.dir, recursive = TRUE)
}
####
# Helper functions
####
#' Pick columns from a table in the given order.
#'
#' @param dataframe A data.frame (or tibble) to subset.
#' @param columns Column selector passed straight to `[`; the result follows
#'   the order given here.
#' @return `dataframe[, columns]`. Default `drop` semantics of `[` apply, so a
#'   single column selected from a base data.frame comes back as a vector.
select.columns.in.order <- function(dataframe, columns) {
  reordered <- dataframe[, columns]
  return(reordered)
}
#' Pick rows from a table in the given order.
#'
#' @param dataframe A data.frame (or tibble) to subset.
#' @param rows Row selector passed straight to `[`; the result follows the
#'   order given here.
#' @return `dataframe[rows, ]`, with the default `drop` semantics of `[`.
select.rows.in.order <- function(dataframe, rows) {
  reordered <- dataframe[rows, ]
  return(reordered)
}
#' Load gene annotation for a set of Ensembl gene ids, with an on-disk cache.
#'
#' On the first call the annotation is queried from Ensembl BioMart
#' (`hsapiens_gene_ensembl` dataset) and written to
#' `<file.location>/genes_info_hg38.csv`. Every call then reads that CSV back
#' and returns it.
#'
#' Caveat: when the cache file already exists it is returned as is and
#' `ensemblIds` is not consulted, so delete the file to refresh the annotation
#' for a different set of ids.
#'
#' @param ensemblIds Ensembl gene ids to query; coerced with `as.character()`.
#' @param file.location Directory holding (or receiving) the cache file.
#' @return The contents of the cache file as read by `readr::read_csv()`.
getGenedataByEnsemblId38 <- function(ensemblIds, file.location) {
  file.name <- file.path(file.location, "genes_info_hg38.csv")

  if (!file.exists(file.name)) {
    # A BioMart connection is only needed on a cache miss, so it is opened
    # here and kept local to this call.
    mart <- biomaRt::useEnsembl(
      biomart = "ENSEMBL_MART_ENSEMBL",
      dataset = "hsapiens_gene_ensembl"
    )

    gene.list <- biomaRt::getBM(
      filters = "ensembl_gene_id",
      attributes = c(
        "hgnc_symbol",
        "ensembl_gene_id",
        "ensembl_transcript_id",
        "chromosome_name",
        "start_position",
        "end_position",
        "strand",
        "transcription_start_site",
        "transcript_start",
        "transcript_end",
        "external_gene_name"
      ),
      values = as.character(ensemblIds),
      mart = mart
    )

    # The target is passed positionally: readr renamed this argument from
    # `path` to `file` and deprecated `path`, positional use works with both.
    readr::write_csv(gene.list, file.name)
  }

  readr::read_csv(
    file.name,
    col_types = readr::cols()
  )
}
#remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) {
# dataframe %>%
# dplyr::filter(
# rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount
# )
#}
#' Run limma::voom() on a table that carries its row identifiers in a column.
#'
#' The identifier column is moved into the row names, the remaining columns
#' are handed to `limma::voom()`, the result is coerced back with
#' `as.data.frame()`, and the row names are restored as a column of the same
#' name.
#'
#' @param dataframe Table with one identifier column and otherwise the values
#'   to transform (presumably raw counts, one column per sample; verify
#'   against callers).
#' @param columnname Name of the identifier column.
#' @return A data.frame with `columnname` as first column followed by the
#'   voom output columns.
limma.voom.convert.column <- function(dataframe, columnname) {
  value.table <- tibble::column_to_rownames(dataframe, columnname)
  voom.result <- limma::voom(value.table)
  voom.table <- as.data.frame(voom.result)
  tibble::rownames_to_column(voom.table, columnname)
}
#' Pick columns from a table in the given order.
#'
#' NOTE(review): this redefines a helper with the same name and the same body
#' that already appears earlier in this file; one of the two can be removed.
#'
#' @param dataframe A data.frame (or tibble) to subset.
#' @param columns Column selector passed straight to `[`; the result follows
#'   the order given here.
#' @return `dataframe[, columns]`. Default `drop` semantics of `[` apply, so a
#'   single column selected from a base data.frame comes back as a vector.
select.columns.in.order <- function(dataframe, columns) {
  reordered <- dataframe[, columns]
  return(reordered)
}
#' Remove columns that hold at most one distinct value.
#'
#' A column is dropped when `unique()` of its values has length <= 1. `NA`
#' counts as a value of its own, so an all-`NA` column is dropped while a
#' column mixing `NA` with one other value is kept. A table with zero rows
#' loses all of its columns.
#'
#' Columns are extracted with `[[` so the check sees the column vector for
#' both base data.frames and tibbles. `tbl[, name]` on a tibble yields a
#' one-column tibble whose `length()` is always 1, which would mark every
#' column as constant.
#'
#' @param dataframe A data.frame or tibble.
#' @return `dataframe` without its constant columns; class and row count are
#'   preserved.
drop.columns.if.all.same.value <- function(dataframe) {
  is.constant <- vapply(
    colnames(dataframe),
    function(name) length(unique(dataframe[[name]])) <= 1,
    logical(1),
    USE.NAMES = FALSE
  )
  # drop = FALSE keeps a table even when a single column survives.
  dataframe[, !is.constant, drop = FALSE]
}
# Load data
# Patient/sample master table: read from the SPSS export in patient.dir and
# converted to a plain data.frame. Columns added or rewritten here:
#   - GenomeScan_ID:  surrounding whitespace trimmed (overwritten in place)
#   - gender:         Gender with levels "f" / "m" / "" renamed to
#                     female / male / other
#   - age:            Age coerced with as.numeric()
#   - smoking.status: Smoking_status with levels "ES " / "CS " / " " renamed
#                     to Ex-smoker / Current smoker / other
# The old level strings must match the file exactly, including the trailing
# spaces.
# NOTE(review): as.numeric() on a factor returns the internal level codes,
# not the labels - confirm Age is imported as a numeric column.
master.Table <- foreign::read.spss(
file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav")
) %>%
as.data.frame() %>%
dplyr::mutate(
GenomeScan_ID = stringr::str_trim(GenomeScan_ID),
gender = forcats::fct_recode(
Gender,
female = "f",
male = "m",
other = ""
),
age = as.numeric(Age),
smoking.status = forcats::fct_recode(
Smoking_status,
`Ex-smoker` = "ES ",
`Current smoker` = "CS ",
other = " "
)
)
# Expression table read from the tab-separated file in expression.dir;
# column types are guessed by readr (empty cols() specification).
expression.data <- file.path(
  expression.dir,
  "20200427_103972-001_rawcounts.txt"
) %>%
  readr::read_tsv(col_types = readr::cols())
# Gene annotation for the ids in the Gene column of the expression table.
# Only the first annotation row per HGNC symbol is kept, and only when that
# symbol is neither missing nor empty; the result is then narrowed to the
# identifier and position columns listed in the final select.
gene.data <- expression.data$Gene %>%
  getGenedataByEnsemblId38(file.location = expression.dir) %>%
  dplyr::group_by(hgnc_symbol) %>%
  dplyr::filter(
    dplyr::row_number() == 1 & !is.na(hgnc_symbol) & hgnc_symbol != ""
  ) %>%
  dplyr::ungroup() %>%
  dplyr::select(
    hgnc_symbol,
    ensembl_gene_id,
    chromosome_name,
    transcript_start,
    transcript_end
  )
# Clean up the master table and keep only rows usable for the RNA-Seq QC:
#   - Group_simple2 / Group_simple: surrounding whitespace trimmed
#   - T_number: coerced to character
#   - sample.id: new column, the whitespace-trimmed PRESTO_ID
# Rows are kept only when their GenomeScan_ID is a column of expression.data
# and their sample.id is neither NA nor empty.
master.Table <- master.Table %>%
dplyr::mutate(
Group_simple2 = stringr::str_trim(Group_simple2),
Group_simple = stringr::str_trim(Group_simple),
T_number = as.character(T_number),
sample.id = stringr::str_trim(PRESTO_ID)
) %>%
dplyr::filter(
# The exclusion filters below are intentionally disabled ("According to
# Niek" they should not be applied); they are kept for reference only.
#
# # Exclusion of specific GenomeScan ids (reason not recorded)
#!(GenomeScan_ID %in% c(
# "T02-01796",
# "T02-03095",
# "T02-10683",
# "T10-18671",
# "T12-12036"
# )
# ),
#
# # Water controls
#!stringr::str_detect(T_number, pattern="Water"),
#
# # Never-smoker controls
#!(Group_simple2 == "NS_Ctrl"),
#
# # ALFA1 patients
#!(Group_simple == "ALFA"),
#
# # Samples that passed the RNA-Seq library prep QC
#stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y",
GenomeScan_ID %in% colnames(expression.data),
!is.na(sample.id),
sample.id != ""
)
# Align the expression table with the filtered master table: keep the Gene
# column first, followed by one column per remaining GenomeScan_ID, in
# master-table row order.
expression.data <- select.columns.in.order(
  expression.data,
  c("Gene", master.Table$GenomeScan_ID)
)