2021-02-09 12:52:44 +01:00
|
|
|
library(tidyverse)
|
|
|
|
library(ggfortify)
|
|
|
|
library(ggprism)
|
|
|
|
library(limma)
|
|
|
|
library(biomaRt)
|
|
|
|
library(PCAtools)
|
|
|
|
library(gtools)
|
|
|
|
library(edgeR)
|
|
|
|
library(ggprism)
|
|
|
|
library(foreign)
|
2021-02-09 15:39:35 +01:00
|
|
|
|
2021-02-09 12:52:44 +01:00
|
|
|
|
|
|
|
# Global variables
|
|
|
|
# Directory that this script creates (below) for its results.
results.dir <- file.path("results.nosync", "RNA-Seq QC")

# Input directories, all relative to the current working directory.
data.dir <- "Data"
patient.dir <- file.path(data.dir, "Patients")
sample.dir <- file.path(data.dir, "Samples")
expression.dir <- file.path(data.dir, "mRNA - RNA-Seq")

# Only create the results directory when it is missing: dir.create() emits a
# warning if the target already exists, which would otherwise happen on every
# re-run of the script.
if (!dir.exists(results.dir)) {
  dir.create(results.dir, recursive = TRUE)
}
|
|
|
|
|
|
|
|
|
|
|
|
####
|
|
|
|
# Helper functions
|
|
|
|
####
|
|
|
|
# Return `dataframe` restricted to `columns`, in the order given.
#
# Args:
#   dataframe: a data.frame or tibble.
#   columns:   column names (or indices) in the desired output order.
#
# Returns:
#   A data frame with exactly the requested columns. The result stays a data
#   frame even when a single column is requested.
select.columns.in.order <- function(dataframe, columns) {
  # drop = FALSE: without it a base data.frame collapses to a bare vector
  # when `columns` has length one.
  dataframe[, columns, drop = FALSE]
}
|
|
|
|
|
|
|
|
# Return the rows of `dataframe` selected by `rows`, in the order given.
#
# Args:
#   dataframe: a data.frame or tibble.
#   rows:      row indices, row names or a logical mask.
#
# Returns:
#   A data frame with the requested rows. The result stays a data frame even
#   when `dataframe` has a single column.
select.rows.in.order <- function(dataframe, rows) {
  # drop = FALSE: without it a one-column base data.frame collapses to a
  # bare vector.
  dataframe[rows, , drop = FALSE]
}
|
|
|
|
|
|
|
|
# Fetch gene annotation for a set of Ensembl gene ids, cached on disk.
#
# The annotation is read from "genes_info_hg38.csv" inside `file.location`.
# When that file does not exist yet, it is first built by querying Ensembl
# BioMart (dataset "hsapiens_gene_ensembl") for `ensemblIds` and written to
# disk. An existing cache file is used as-is; it is NOT checked against
# `ensemblIds`.
#
# Args:
#   ensemblIds:    Ensembl gene ids; coerced with as.character() for the query.
#   file.location: directory that holds (or will hold) the cache file.
#
# Returns:
#   The content of the cache file as returned by readr::read_csv().
getGenedataByEnsemblId38 <- function(ensemblIds, file.location) {
  file.name <- file.path(file.location, "genes_info_hg38.csv")

  if (!file.exists(file.name)) {
    # The former `"mart" %in% ls()` guard looked at this function's own frame,
    # where `mart` can never exist yet, so a connection was always opened.
    # A plain local variable states the same behaviour directly.
    mart <- biomaRt::useEnsembl(
      biomart = "ENSEMBL_MART_ENSEMBL",
      dataset = "hsapiens_gene_ensembl"
    )

    gene.list <- biomaRt::getBM(
      filters = "ensembl_gene_id",
      attributes = c(
        "hgnc_symbol",
        "ensembl_gene_id",
        "ensembl_transcript_id",
        "chromosome_name",
        "start_position",
        "end_position",
        "strand",
        "transcription_start_site",
        "transcript_start",
        "transcript_end",
        "external_gene_name"
      ),
      values = as.character(ensemblIds),
      mart = mart
    )

    # The target file is passed positionally: the `path` argument name is
    # deprecated in newer readr releases (renamed to `file`), while the
    # positional form works with both.
    readr::write_csv(gene.list, file.name)
  }

  readr::read_csv(file.name, col_types = readr::cols())
}
|
|
|
|
|
|
|
|
#remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) {
|
|
|
|
# dataframe %>%
|
|
|
|
# dplyr::filter(
|
|
|
|
# rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount
|
|
|
|
# )
|
|
|
|
#}
|
|
|
|
|
|
|
|
# Run limma::voom() on a data frame that carries its row ids in a column.
#
# Args:
#   dataframe:  data frame containing the id column `columnname`; all other
#               columns are handed to limma::voom().
#   columnname: name of the column that is moved into the row names before
#               the call and restored as a column afterwards.
#
# Returns:
#   The voom result converted with as.data.frame(), with the row names moved
#   back into a column called `columnname`.
limma.voom.convert.column <- function(dataframe, columnname) {
  with.rownames <- tibble::column_to_rownames(dataframe, columnname)
  voom.result <- limma::voom(with.rownames)
  voom.frame <- as.data.frame(voom.result)
  tibble::rownames_to_column(voom.frame, columnname)
}
|
|
|
|
|
|
|
|
# Return `dataframe` restricted to `columns`, in the order given.
#
# NOTE(review): a function with this same name is also defined earlier in
# this file; being the later definition, this one is what is in effect for
# the code that follows. Consider removing one of the two.
#
# Args:
#   dataframe: a data.frame or tibble.
#   columns:   column names (or indices) in the desired output order.
#
# Returns:
#   A data frame with exactly the requested columns. The result stays a data
#   frame even when a single column is requested.
select.columns.in.order <- function(dataframe, columns) {
  # drop = FALSE: without it a base data.frame collapses to a bare vector
  # when `columns` has length one.
  dataframe[, columns, drop = FALSE]
}
|
|
|
|
|
|
|
|
# Remove every column that holds at most one distinct value.
#
# Args:
#   dataframe: a data.frame or tibble.
#
# Returns:
#   `dataframe` without its constant columns (columns whose unique() has
#   length <= 1, which includes all columns of a zero-row input). The result
#   stays a data frame even when only one column survives.
drop.columns.if.all.same.value <- function(dataframe) {
  # Inspect each column as a vector. Subsetting with `dataframe[, name]`
  # keeps a tibble as a one-column tibble, whose length() is always 1, which
  # made every tibble column look constant.
  varies <- vapply(
    dataframe,
    function(column) length(unique(column)) > 1,
    logical(1),
    USE.NAMES = FALSE
  )

  dataframe[, varies, drop = FALSE]
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load data
|
|
|
|
# Read the patient master table from SPSS and derive tidy covariates:
#   GenomeScan_ID  - whitespace-trimmed in place
#   gender         - Gender with levels "f"/"m"/"" renamed
#   age            - Age converted with as.numeric()
#   smoking.status - Smoking_status with its levels renamed
# NOTE(review): if read.spss() delivers Age as a factor, as.numeric() returns
# level codes rather than the ages themselves - confirm Age is numeric.
master.Table <- dplyr::mutate(
  as.data.frame(
    foreign::read.spss(
      file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav")
    )
  ),
  GenomeScan_ID = stringr::str_trim(GenomeScan_ID),
  gender = forcats::fct_recode(Gender, female = "f", male = "m", other = ""),
  age = as.numeric(Age),
  # The old level names below are matched literally, including their
  # trailing spaces.
  smoking.status = forcats::fct_recode(
    Smoking_status,
    `Ex-smoker` = "ES ",
    `Current smoker` = "CS ",
    other = " "
  )
)
|
|
|
|
|
|
|
|
# Raw count table (tab-separated); column types are guessed by readr and the
# column-specification message is silenced via an empty cols() spec.
expression.data <- expression.dir %>%
  file.path("20200427_103972-001_rawcounts.txt") %>%
  readr::read_tsv(col_types = readr::cols())
|
|
|
|
|
|
|
|
# Gene annotation for the genes in the count table, reduced to a single row
# per HGNC symbol: within each symbol only the first row is kept, and rows
# with a missing or empty symbol are discarded.
# NOTE(review): the annotation is requested per transcript, so "first row"
# presumably picks an arbitrary transcript's start/end - confirm this is
# acceptable downstream.
gene.data <- getGenedataByEnsemblId38(expression.data$Gene, expression.dir) %>%
  dplyr::group_by(hgnc_symbol) %>%
  dplyr::filter(
    dplyr::row_number() == 1 & !is.na(hgnc_symbol) & hgnc_symbol != ""
  ) %>%
  dplyr::ungroup() %>%
  dplyr::select(
    hgnc_symbol, ensembl_gene_id, chromosome_name,
    transcript_start, transcript_end
  )
|
|
|
|
|
|
|
|
|
|
|
|
# Step 1: normalise the text columns used for grouping and matching.
# sample.id is the whitespace-trimmed PRESTO_ID.
master.Table <- dplyr::mutate(
  master.Table,
  Group_simple2 = stringr::str_trim(Group_simple2),
  Group_simple = stringr::str_trim(Group_simple),
  T_number = as.character(T_number),
  sample.id = stringr::str_trim(PRESTO_ID)
)

# Step 2: keep only patients that have a column in the expression data and a
# usable (non-missing, non-empty) sample id.
#
# The exclusion filters below are disabled and kept for reference only.
# Original notes, translated to English:
#
#   # According to Niek, I should not include this, for whatever reason
#   !(GenomeScan_ID %in% c(
#     "T02-01796",
#     "T02-03095",
#     "T02-10683",
#     "T10-18671",
#     "T12-12036"
#   )),
#
#   # (According to Niek, don't include) water controls
#   !stringr::str_detect(T_number, pattern="Water"),
#
#   # (According to Niek, don't include) never-smoker controls
#   !(Group_simple2 == "NS_Ctrl"),
#
#   # (According to Niek, don't include) ALFA1 patients
#   !(Group_simple == "ALFA"),
#
#   stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y",
master.Table <- dplyr::filter(
  master.Table,
  GenomeScan_ID %in% colnames(expression.data),
  !is.na(sample.id),
  sample.id != ""
)
|
|
|
|
|
|
|
|
# Restrict the count table to the retained patients and put its columns in
# the row order of master.Table: "Gene" first, then one column per
# GenomeScan_ID.
expression.data <- select.columns.in.order(
  expression.data,
  c("Gene", master.Table$GenomeScan_ID)
)
|