library(tidyverse)
library(ggfortify)
library(ggprism)
library(limma)
library(biomaRt)
library(PCAtools)
library(gtools)
library(edgeR)
library(foreign)

# Global variables
results.dir <- file.path("results.nosync", "RNA-Seq QC")
data.dir <- "Data"
patient.dir <- file.path(data.dir, "Patients")
sample.dir <- file.path(data.dir, "Samples")
expression.dir <- file.path(data.dir, "mRNA - RNA-Seq")

# Only create the results directory when it is missing, so that re-running the
# script does not emit an "already exists" warning.
if (!dir.exists(results.dir)) {
  dir.create(results.dir, recursive = TRUE)
}

####
# Helper functions
####

#' Select columns of a data frame in the given order.
#'
#' @param dataframe A data frame (or tibble).
#' @param columns Column names or indices, in the desired order.
#' @return The result of `dataframe[, columns]`.
select.columns.in.order <- function(dataframe, columns) {
  dataframe[, columns]
}

#' Select rows of a data frame in the given order.
#'
#' @param dataframe A data frame (or tibble).
#' @param rows Row names or indices, in the desired order.
#' @return The result of `dataframe[rows, ]`.
select.rows.in.order <- function(dataframe, rows) {
  dataframe[rows, ]
}

#' Fetch gene annotation for Ensembl gene ids, cached on disk as CSV.
#'
#' If `genes_info_hg38.csv` does not exist in `file.location`, the annotation
#' is queried from the Ensembl BioMart (`hsapiens_gene_ensembl` dataset) and
#' written to that file. The function then always returns the content of the
#' CSV file, so an existing cache is used as-is regardless of `ensemblIds`.
#'
#' @param ensemblIds Ensembl gene ids used as the BioMart filter values.
#' @param file.location Directory holding (or receiving) the cache file.
#' @return A tibble read from the cache file.
getGenedataByEnsemblId38 <- function(ensemblIds, file.location) {
  file.name <- file.path(file.location, "genes_info_hg38.csv")

  if (!file.exists(file.name)) {
    # The connection is only needed for this single query, so it is created
    # locally (the previous `ls()` guard inspected the function's own
    # environment and therefore never found an existing `mart`).
    mart <- useEnsembl(
      biomart = "ENSEMBL_MART_ENSEMBL",
      dataset = "hsapiens_gene_ensembl"
    )

    gene.list <- getBM(
      filters = "ensembl_gene_id",
      attributes = c(
        "hgnc_symbol",
        "ensembl_gene_id",
        "ensembl_transcript_id",
        "chromosome_name",
        "start_position",
        "end_position",
        "strand",
        "transcription_start_site",
        "transcript_start",
        "transcript_end",
        "external_gene_name"
      ),
      values = as.character(ensemblIds),
      mart = mart
    )

    # The target file is passed positionally: the `path` argument name is
    # deprecated in readr >= 1.4.0 in favour of `file`, and the positional
    # form works with both.
    readr::write_csv(gene.list, file.name)
  }

  readr::read_csv(
    file.name,
    col_types = readr::cols()
  )
}

#remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) {
#  dataframe %>%
#    dplyr::filter(
#      rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount
#    )
#}

#' Apply `limma::voom()` to a data frame that carries its row ids in a column.
#'
#' The id column is moved to the row names, the remaining columns are passed
#' to `limma::voom()`, the result is converted back with `as.data.frame()` and
#' the row names are restored as a column named `columnname`.
#'
#' @param dataframe A data frame with one id column and count columns.
#' @param columnname Name of the id column.
#' @return A data frame with the id column restored under `columnname`.
limma.voom.convert.column <- function(dataframe, columnname) {
  dataframe %>%
    tibble::column_to_rownames(columnname) %>%
    limma::voom() %>%
    as.data.frame() %>%
    tibble::rownames_to_column(columnname)
}

#' Drop every column that holds at most one distinct value.
#'
#' Columns are inspected as vectors, so the function behaves the same for base
#' data frames and tibbles (subsetting a tibble with `[, name]` returns a
#' one-column tibble, whose length is always 1).
#'
#' @param dataframe A data frame (or tibble).
#' @return `dataframe` without its constant (or empty) columns.
drop.columns.if.all.same.value <- function(dataframe) {
  is.all.same <- vapply(
    dataframe,
    function(column) length(unique(column)) <= 1,
    logical(1)
  )
  dataframe[, !is.all.same, drop = FALSE]
}

# Load data

# Patient / sample master table from the SPSS export, with cleaned-up ids and
# recoded gender and smoking status factors.
master.Table <- foreign::read.spss(
  file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav")
) %>%
  as.data.frame() %>%
  dplyr::mutate(
    GenomeScan_ID = stringr::str_trim(GenomeScan_ID),
    gender = forcats::fct_recode(
      Gender,
      female = "f",
      male = "m",
      other = ""
    ),
    age = as.numeric(Age),
    smoking.status = forcats::fct_recode(
      Smoking_status,
      `Ex-smoker` = "ES ",
      `Current smoker` = "CS ",
      other = " "
    )
  )

# Raw RNA-Seq counts; the `Gene` column holds the ids passed to the annotation
# lookup below.
expression.data <- readr::read_tsv(
  file.path(expression.dir, "20200427_103972-001_rawcounts.txt"),
  col_types = readr::cols()
)

# Gene annotation: keep the first row per HGNC symbol and discard rows without
# a symbol.
gene.data <- getGenedataByEnsemblId38(
  ensemblIds = expression.data$Gene,
  file.location = expression.dir
) %>%
  dplyr::group_by(hgnc_symbol) %>%
  dplyr::filter(
    dplyr::row_number() == 1,
    !is.na(hgnc_symbol),
    hgnc_symbol != ""
  ) %>%
  dplyr::ungroup() %>%
  dplyr::select(
    hgnc_symbol,
    ensembl_gene_id,
    chromosome_name,
    transcript_start,
    transcript_end
  )

# Restrict the master table to samples that have expression data and a
# non-empty sample id.
master.Table <- master.Table %>%
  dplyr::mutate(
    Group_simple2 = stringr::str_trim(Group_simple2),
    Group_simple = stringr::str_trim(Group_simple),
    T_number = as.character(T_number),
    sample.id = stringr::str_trim(PRESTO_ID)
  ) %>%
  dplyr::filter(
    #
    # According to Niek, I should not include this, for whatever reason
    #!(GenomeScan_ID %in% c(
    #    "T02-01796",
    #    "T02-03095",
    #    "T02-10683",
    #    "T10-18671",
    #    "T12-12036"
    #  )
    #),
    #
    # # (According to Niek, don't include) Water controls
    #!stringr::str_detect(T_number, pattern="Water"),
    #
    # # (According to Niek, don't include) never-smoker controls
    #!(Group_simple2 == "NS_Ctrl"),
    #
    # # (According to Niek, don't include) ALFA1 patients
    #!(Group_simple == "ALFA"),
    #
    #stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y",
    GenomeScan_ID %in% colnames(expression.data),
    !is.na(sample.id),
    sample.id != ""
  )

# Order the expression columns to match the row order of the master table.
expression.data <- expression.data %>%
  select.columns.in.order(
    c("Gene", master.Table$GenomeScan_ID)
  )