diff --git a/rnaseq/step1_fastqc/snippet.sh b/rnaseq/step1_fastqc/snippet.sh index 0eb5213..b0d7dcb 100644 --- a/rnaseq/step1_fastqc/snippet.sh +++ b/rnaseq/step1_fastqc/snippet.sh @@ -1,34 +1,5 @@ -#!/bin/bash -#SBATCH --job-name=fastqc -#SBATCH --time=0-12:00:00 -#SBATCH --ntasks=1 -#SBATCH --mem=15G -#SBATCH --qos=regular +# The command to do a FastQC on a fastq file is +file="file_to_analyse.fq.gz" +fastqc_out="./path/to/fastqc/output/dir" - -set -e -set -u -set -x -set -o pipefail - - -module purge -module load Java -module load FastQC - - -dir_raw_fastq="$(pwd)/fastq/raw" -dir_fastqc="$(pwd)/fastqc" - - -[ -d "${dir_fastqc}" ] || mkdir -p "${dir_fastqc}" - - -# Run FastQC -files=$(find -L "$dir_raw_fastq" -type f -iname "*.fastq.gz") -for file in $files; do - filename=$(basename "$file") - fastqc -o "$dir_fastqc" "$file" & -done - -wait +fastqc -o "$fastqc_out" "$file" diff --git a/rnaseq/step2_trim/snippet.sh b/rnaseq/step2_trim/snippet.sh index 6c4eb2e..e69de29 100644 --- a/rnaseq/step2_trim/snippet.sh +++ b/rnaseq/step2_trim/snippet.sh @@ -1,64 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=trimming -#SBATCH --time=2-00:00:00 -#SBATCH --ntasks=1 -#SBATCH --mem=15G -#SBATCH --qos=regular - - -set -e -set -u -set -x -set -o pipefail - - -module purge -module load TrimGalore - - -dir_raw_fastq="$(pwd)/fastq/raw" -dir_trimmed_fastq="$(pwd)/fastq/trimmed" -dir_trimmed_fastq_reports="$(pwd)/fastq/trimmed/reports" - -adapter_3p="TGGAATTCTCGG" # is _R1 -adapter_5p="GATCGTCGGACT" # is _R2 - - -[ -d "${dir_trimmed_fastq_reports}" ] || mkdir -p "${dir_trimmed_fastq_reports}" - - -# Trim all adapters from the sequences -while IFS=, read -r GSID Sample -do - echo ">> Executing trimming of $GSID (${#GSID})" - if [ "${#GSID}" == "18" ]; then # check length - don't include sample pools - for fqFile in $(find -L "$dir_raw_fastq" -maxdepth 1 -type f -iname "*${GSID}*.fastq.gz"); do - newFilename=${fqFile/.fastq.gz/_trimmed.fq.gz} - if [ -f "${newFilename}" ]; then - echo ">>File exists ${newFilename}. Skipping" - else - echo ">>File: ${fqFile}" - if grep -q "_R1" <<< $(basename "$fqFile"); then - adapter_seq=$adapter_3p - elif grep -q "_R2" <<< $(basename "$fqFile"); then - adapter_seq=$adapter_5p - fi - echo ">>Trimming with sequence ${adapter_seq}" - trim_galore \ - --adapter "$adapter_seq" \ - --length $(printf "$adapter_seq" | wc -m) \ - --output_dir "${dir_trimmed_fastq}" \ - --fastqc_args "--noextract" \ - "${fqFile}" & - fi - done - else - echo "Skipped." - fi -done < samples.csv - - -wait -mv ${dir_trimmed_fastq}/*.zip "${dir_trimmed_fastq_reports}" -mv ${dir_trimmed_fastq}/*.txt "${dir_trimmed_fastq_reports}" -mv ${dir_trimmed_fastq}/*.html "${dir_trimmed_fastq_reports}" \ No newline at end of file diff --git a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R index 09e4dff..0c42844 100644 --- a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R +++ b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R @@ -1,7 +1,15 @@ # Principle Component Analysis # Normalized with limma::voom +library(limma) +library(tidyverse) -source("__ - Preloader.R", verbose=T) + +# expression.data. Has columns: +# - Gene (gene identifier) +# - [Sample identifiers] +expression.data <- "mrna_counts_table.csv" +# results.dir. The directory of where to put the resulting tables (and later your plots.) +results.dir <- "results" # PCA variables @@ -10,10 +18,7 @@ do.scale = FALSE # The analysis -master.Table %>% readr::write_csv( - file.path(results.dir, "patient.table.csv") -) - +# We ise prcomp to calculate the PCAs. Afterwards you should plot the results. norm.expr.data <- expression.data %>% tibble::column_to_rownames("Gene") norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% @@ -69,132 +74,8 @@ data.frame( # Not saved: pcs.summery$sdev, - -# Plot PCAs -# https://github.com/kevinblighe/PCAtools -results.dir.pca.plot <- file.path(results.dir.pca, "img") -dir.create(results.dir.pca.plot) - -metadata <- master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - tibble::column_to_rownames("GenomeScan_ID") %>% - select.rows.in.order( - colnames(norm.expr.data) - ) - - -p <- pca(norm.expr.data, - metadata = metadata, - center = do.center, - scale = do.scale -) -elbow <- findElbowPoint(p$variance) -metavars <- c('Age','Gender','Smoking_status','COPD_Y_or_N','SEO_COPD_Y_or_N','GOLD_stage') - - -png(filename = file.path(results.dir.pca.plot,"scree_plot.png"), - width = 800, height = 800, - units = "px", pointsize = 12, - type = "Xlib") -print(screeplot(p, - axisLabSize = 12, - titleLabSize = 12, - components = getComponents(p, 1:(elbow+5)), - vline = c(elbow) - ) + - geom_label( - aes( - x = elbow + 1, - y = 50, - label = 'Elbow method', - vjust = -1, - size = 8 - ) - )) -dev.off() - - -png(filename = file.path(results.dir.pca.plot, "eigen_corr_plot.png"), - width = 1200, height = 1200, - units = "px", pointsize = 12, - type = "Xlib") -print(eigencorplot(p, - metavars = metavars -)) -dev.off() - - -dir.create(file.path(results.dir.pca.plot, "pairsplot")) -for (var in metavars) { - png( - filename = file.path(results.dir.pca.plot, "pairsplot", paste0(var,".png")), - width = 1200, height = 1200, - units = "px", pointsize = 12, - type = "Xlib" - ) - print(pairsplot(p, - components = getComponents(p, c(1:(elbow+1))), - triangle = TRUE, - trianglelabSize = 12, - hline = 0, vline = 0, - pointSize = 0.4, - gridlines.major = FALSE, - gridlines.minor = FALSE, - colby = var, - title = paste0('Pairs plot: ',var), - plotaxes = TRUE - )) - dev.off() -} - - - -# Plot PCAs - old failure -pca.combinations <- combinations( - n = (elbow+1), - r = 2, - v = 1:(elbow+1), - repeats.allowed = FALSE - ) - -dir.create(file.path(results.dir.pca.plot, "biplots")) -for (var in metavars) { - for (i in 1:nrow(pca.combinations)) { - pca.combi <- pca.combinations[i,] - pca.title <- paste(paste0("PC", pca.combi), collapse="_") - png( - filename = file.path(results.dir.pca.plot, "biplots", paste0(var, "-", pca.title, ".png")), - width = 800, height = 800, - units = "px", pointsize = 12, - type = "Xlib" - ) - print( - autoplot( - norm.expr.data.pcs, - data = master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - tibble::column_to_rownames("GenomeScan_ID") %>% - select.rows.in.order( - rownames(norm.expr.data.pcs$x) - ), - x = pca.combi[1], - y = pca.combi[2], - colour = var, - loadings = FALSE, - loadings.label = FALSE, - #label = FALSE, - label.size = 3 - ) + - ggprism::theme_prism() + - #ggprism::scale_colour_prism() + - ggprism::scale_shape_prism() + - ggplot2::labs(subtitle = paste0(str_to_title(var), " (", pca.title,")")) - ) - dev.off() - } -} - +# Next thing to do: +# - (Optional) scree plot - to determine the optimal cutoff for PCA inclusion based on explaination of variance +# - (Optional) eigencorplot - to correlate PCAs to clinical variables so that you know which PCA to include for which analysis +# - (Optional) pairsplot - plot multiple PCAs against each other in a single figure +# - Plot the first couple of PCAs against each other \ No newline at end of file diff --git a/rnaseq/step6_overall_QC/02 - Gender Check.R b/rnaseq/step6_overall_QC/02 - Gender Check.R index af7536d..f02f19d 100644 --- a/rnaseq/step6_overall_QC/02 - Gender Check.R +++ b/rnaseq/step6_overall_QC/02 - Gender Check.R @@ -1,16 +1,28 @@ # Gender QC # Normalized with limma::voom +library(GSVA) +library(limma) +library(edgeR) +library(tidyverse) -source("__ - Preloader.R", verbose=T) + +# expression.data. Has columns: +# - Gene (gene identifier) +# - [Sample identifiers] +expression.data <- "mrna_counts_table.csv" +# master.Table. Has columns: +# - GenomeScan_ID +# - gender, levels = c("male", "female") +# - age +# - factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) +master.Table <- "patient_table.csv" +# results.dir. The directory of where to put the resulting tables (and later your plots.) +results.dir <- "results" # The analysis -norm.expr.data <- expression.data %>% - tibble::column_to_rownames("Gene") -norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% - limma::voom() %>% - as.matrix() - +# We first do a differential expression analysis on gender using EdgeR. +# Afterwards you should plot these results. x.genes <- gene.data %>% dplyr::filter(chromosome_name == "X") %>% dplyr::pull(ensembl_gene_id) @@ -87,248 +99,12 @@ gender.qc.genes.to.plot <- gender.qc.results %>% chromosome_name %in% c("X", "Y") & FDR < 0.05 & dplyr::row_number() <= 5 - ) | - hgnc_symbol %in% c( - "XIST", - "TSIX", - "KDM6A", - "ZFX", - "KDM5C", - "ZFY-AS1", - "ARSDP1", - "GYG2P1", - "RBMY2JP", - "ARSLP1" ) ) -gender.qc.data <- as.data.frame(norm.expr.data) %>% - rownames_to_column("ensembl.id") %>% - tidyr::gather( - key = "rna.seq.sample.id", - value = "expr.value", - -ensembl.id - ) %>% - dplyr::filter( - ensembl.id %in% gender.qc.genes.to.plot$ensembl.id - ) %>% - dplyr::left_join( - y = gender.qc.patients, - by = c("rna.seq.sample.id" = "GenomeScan_ID") - ) %>% - #dplyr::filter( - # !is.na(gender) - #) %>% - readr::write_csv( - file.path(results.dir.gender.plot, "plot.data.voom.csv") - ) -for (chr in gender.qc.genes.to.plot$chromosome_name) { - current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>% - dplyr::filter(chromosome_name == chr) - chromosome_name <- chr - i <- 0 - for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) { - i <- i + 1 - - hgnc_symbol <- gene.data %>% - dplyr::filter( - ensembl_gene_id == current.ensembl.id - ) %>% - dplyr::pull(hgnc_symbol) - - # calculate outliers, kinda - plot.data <- gender.qc.data %>% - dplyr::filter( - ensembl.id == current.ensembl.id - ) %>% - dplyr::mutate( - gender = dplyr::case_when( - is.na(gender) | (stringr::str_trim(gender) == "") ~ "other", - TRUE ~ gender - ) - ) - - if (nrow(plot.data) <= 0) { - next - } - - outliers <- boxplot( - formula = expr.value ~ gender, - data = plot.data, - plot = FALSE - )$out - - result.to.annotate <- plot.data %>% - dplyr::filter( - expr.value %in% outliers - ) - - # Visual: plot range (for t-test p-value) - plot.y.range <- c( - "min" = as.integer(min(plot.data$expr.value) - 1) , - "max" = as.integer(max(plot.data$expr.value) + 1) - ) - plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05) - plot.y.range["min"] <- plot.y.range["min"] - plot.margin - plot.y.range["max"] <- plot.y.range["max"] + plot.margin - - # Plot the damn thing as if it is Graphpad Prism - stat.table <- rstatix::t_test(plot.data, expr.value ~ gender) - plt <- plot.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = gender, - y = expr.value - ) - ) + - ggplot2::geom_jitter( - mapping = ggplot2::aes( - colour = gender, - shape = gender - ), - width = 0.1 - ) + - ggrepel::geom_text_repel( - data = result.to.annotate, - mapping = ggplot2::aes( - label = sample.id - ), - size = 2, - box.padding = unit(0.35, "lines"), - point.padding = unit(0.3, "lines") - ) + - ggplot2::stat_summary( - fun = "mean", - geom = "crossbar", - width = 0.3, - size = 0.3 - ) + - ggplot2::scale_y_continuous( - limits = plot.y.range, - guide = "prism_offset" - ) + - #ggprism::add_pvalue( - # stat.table, - # y.position = plot.y.range["max"] - #) + - ggprism::theme_prism() + - ggprism::scale_colour_prism() + - ggprism::scale_shape_prism() + - ggplot2::theme( - legend.position = "none" - ) + - ggplot2::labs( - subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"), - x = "Gender", - y = "Normalised Expression Values" - ) - - ggplot2::ggsave( - filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")), - plot = plt, - width = 12.5, - height = 12.5, - unit = "cm" - ) - } -} - -# Let's try a GSVA -gsva.groups <- list( - X = gender.qc.genes.to.plot %>% - dplyr::filter(chromosome_name == "X") %>% - dplyr::pull(ensembl.id), - Y = gender.qc.genes.to.plot %>% - dplyr::filter(chromosome_name == "Y") %>% - dplyr::pull(ensembl.id) -) - -gsva_res = GSVA::gsva( - norm.expr.data, - gsva.groups, - mx.diff = TRUE, - verbose = FALSE, - parallel.sz = 1 -) - - -gender.qc.gsva.data <- as.data.frame(gsva_res) %>% - rownames_to_column("gsva.group") %>% - tidyr::gather( - key = "rna.seq.sample.id", - value = "gsva.value", - -gsva.group - ) %>% - dplyr::left_join( - y = gender.qc.patients %>% - dplyr::select( - GenomeScan_ID, - sample.id, - gender - ), - by = c("rna.seq.sample.id" = "GenomeScan_ID") - ) %>% - readr::write_csv( - file.path(results.dir.gender.plot, "plot.data.gsva.csv") - ) - - -for (c.gender in unique(gender.qc.gsva.data$gender)) { - if (is.na(c.gender)) { - next - } - - c.plot.data <- gender.qc.gsva.data %>% - dplyr::filter( - gender == c.gender - ) - - outliers <- boxplot( - formula = gsva.value ~ gsva.group, - data = c.plot.data, - plot = FALSE - )$out - - result.to.annotate <- c.plot.data %>% - dplyr::filter( - gsva.value %in% outliers - ) - - plt <- c.plot.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = gsva.group, - y = gsva.value - ) - ) + - ggplot2::geom_boxplot() + - ggrepel::geom_text_repel( - data = result.to.annotate, - mapping = ggplot2::aes( - label = sample.id - ), - size = 2, - box.padding = unit(0.35, "lines"), - point.padding = unit(0.3, "lines") - ) + - ggprism::theme_prism() + - ggprism::scale_colour_prism() + - ggprism::scale_shape_prism() + - ggplot2::theme( - legend.position = "none" - ) + - ggplot2::labs( - subtitle = paste0("", toupper(c.gender)), - x = "Chromosome", - y = "GSVA Values" - ) - - ggplot2::ggsave( - filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")), - plot = plt, - width = 12.5, - height = 12.5, - unit = "cm" - ) -} +# Next thing to do: +# - Plot the normalized expressino values for the genes in gender.qc.genes.to.plot in a boxplot, split and colored by gender. +# - (Optional) Do a GSVA with as genesets the genes found in gender.qc.genes.to.plot. Plot the boxplots as per the previous point. +# - (Optional) Plot the number of Y-chromosome reads devided by the number of X chromosome reads in a boxplot as per the first point. +# - (Optional) Plot the number of Y-chromosome SNPs devided by the number of X chromosome SNPs in a boxplot as per the first point. diff --git a/rnaseq/step6_overall_QC/03 - Sample Counts.R b/rnaseq/step6_overall_QC/03 - Sample Counts.R index 90e805a..3f4c5c2 100644 --- a/rnaseq/step6_overall_QC/03 - Sample Counts.R +++ b/rnaseq/step6_overall_QC/03 - Sample Counts.R @@ -1,17 +1,25 @@ # Total counts per sample # Normalized with limma::voom +library(limma) +library(tidyverse) -source("__ - Preloader.R", verbose=T) + +# expression.data. Has columns: +# - Gene (gene identifier) +# - [Sample identifiers] +expression.data <- "mrna_counts_table.csv" +# master.Table. Has columns: +# - GenomeScan_ID +# - gender, levels = c("male", "female") +# - age +# - factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) +master.Table <- "patient_table.csv" +# results.dir. The directory of where to put the resulting tables (and later your plots.) +results.dir <- "results" # The analysis -norm.expr.data <- expression.data %>% - tibble::column_to_rownames("Gene") -norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% - limma::voom() %>% - as.matrix() - -# Total counts per sample +# We calculate the number of mapped reads per sample. total.count.per.sample <- expression.data %>% tibble::column_to_rownames("Gene") %>% colSums() @@ -23,141 +31,9 @@ data.frame( readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv")) -norm.data <- norm.expr.data %>% - as.data.frame() %>% - tibble::rownames_to_column( - "Gene" - ) %>% - tidyr::gather( - key = "sample.id", - value = "expr.value", - -Gene - ) %>% - dplyr::left_join( - y = master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - dplyr::mutate( - id = dplyr::case_when( - stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), - TRUE ~ sample.id - ), - gender = dplyr::case_when( - stringr::str_trim(gender) == "" ~ "water", - !is.na(gender) ~ as.character(gender) - ) - ) %>% - dplyr::select( - GenomeScan_ID, - gender, - id - ), - by = c("sample.id" = "GenomeScan_ID") - ) - -norm.plot <- norm.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = id, - y = expr.value, - fill = gender - ) - ) + - ggplot2::geom_boxplot() + - ggplot2::scale_fill_manual( - values = c( - "male" = "blue", - "female" = "red", - "water" = "green" - ) - ) + - ggplot2::labs( - title = "Normalized expression values distribution", - y = "Normalized expression values (limma::voom)", - x = "Sample", - gender = "Gender" - ) + - ggprism::theme_prism() + - ggplot2::theme( - axis.text.x = ggplot2::element_text(angle = 90) - ) - -ggplot2::ggsave( - filename = file.path(results.dir, "counts.per.sample.normalised.png"), - plot = norm.plot, - width = 40, - height = 20, - units = "cm" -) - - - -expr.data <- expression.data %>% - tidyr::gather( - key = "sample.id", - value = "expr.value", - -Gene - ) %>% - dplyr::filter( - expr.value != 0 - ) %>% - dplyr::left_join( - y = master.Table %>% - dplyr::filter( - !is.na(GenomeScan_ID) - ) %>% - dplyr::mutate( - id = dplyr::case_when( - stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), - TRUE ~ sample.id - ), - gender = dplyr::case_when( - stringr::str_trim(gender) == "" ~ "water", - !is.na(gender) ~ as.character(gender) - ) - ) %>% - dplyr::select( - GenomeScan_ID, - gender, - id - ), - by = c("sample.id" = "GenomeScan_ID") - ) - -expr.plot <- expr.data %>% - ggplot2::ggplot( - mapping = ggplot2::aes( - x = id, - y = expr.value, - fill = gender - ) - ) + - ggplot2::geom_boxplot() + - ggplot2::scale_fill_manual( - values = c( - "male" = "blue", - "female" = "red", - "water" = "green" - ) - ) + - ggplot2::scale_y_continuous(trans='log2') + - ggplot2::labs( - title = "Raw expression values distribution, without zero's", - y = "Expression values", - x = "Sample", - gender = "Gender" - ) + - ggprism::theme_prism() + - ggplot2::theme( - axis.text.x = ggplot2::element_text(angle = 90) - ) - -ggplot2::ggsave( - filename = file.path(results.dir, "counts.per.sample.raw.zeros.removed.png"), - plot = expr.plot, - width = 40, - height = 20, - units = "cm" -) +# Next thing to do: +# - Check the number of reads per sample in total.counts.per.sample.csv +# - Plot the reads distribution (all reads) per sample in a boxplot. +# - (Optional) Calculate the number of unmapped, multimapped, unique mapped to +# feature and unique mapped to no feature and plot these in a stacked bar graph. diff --git a/rnaseq/step6_overall_QC/__ - Preloader.R b/rnaseq/step6_overall_QC/__ - Preloader.R deleted file mode 100644 index ce96a0f..0000000 --- a/rnaseq/step6_overall_QC/__ - Preloader.R +++ /dev/null @@ -1,189 +0,0 @@ -library(tidyverse) -library(ggfortify) -library(ggprism) -library(limma) -library(biomaRt) -library(PCAtools) -library(gtools) -library(edgeR) -library(ggprism) -library(foreign) - - -# Global variables -results.dir <- file.path("results.nosync", "RNA-Seq QC") -data.dir <- "Data" -patient.dir <- file.path(data.dir, "Patients") -sample.dir <- file.path(data.dir, "Samples") -expression.dir <- file.path(data.dir, "mRNA - RNA-Seq") - - -dir.create(results.dir, recursive = TRUE) - - -#### -# Helper functions -#### -select.columns.in.order <- function(dataframe, columns) { - dataframe[, columns] -} - -select.rows.in.order <- function(dataframe, rows) { - dataframe[rows,] -} - -getGenedataByEnsemblId38 <- function(ensemblIds, file.location) { - file.name <- file.path(file.location, "genes_info_hg38.csv") - if (!file.exists(file.name)) { - if (!("mart" %in% ls())) { - assign("mart", useEnsembl( - biomart = "ENSEMBL_MART_ENSEMBL", - dataset = "hsapiens_gene_ensembl" - )) - } - gene.list <- getBM( - filters = "ensembl_gene_id", - attributes = c( - "hgnc_symbol", - "ensembl_gene_id", - "ensembl_transcript_id", - "chromosome_name", - "start_position", - "end_position", - "strand", - "transcription_start_site", - "transcript_start", - "transcript_end", - "external_gene_name" - ), - values = as.character(ensemblIds), - mart = mart - ) - readr::write_csv(gene.list, path = file.name) - } - return( - readr::read_csv( - file.name, - col_types = readr::cols() - ) - ) -} - -#remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) { -# dataframe %>% -# dplyr::filter( -# rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount -# ) -#} - -limma.voom.convert.column <- function(dataframe, columnname) { - dataframe %>% - tibble::column_to_rownames(columnname) %>% - limma::voom() %>% - as.data.frame() %>% - tibble::rownames_to_column(columnname) -} - -select.columns.in.order <- function(dataframe, columns) { - dataframe[, columns] -} - -drop.columns.if.all.same.value <- function(dataframe) { - for (name in colnames(dataframe)) { - is.all.same <- (dataframe[, name] %>% unique() %>% length()) <= 1 - if (is.all.same) { - dataframe <- dataframe %>% - dplyr::select( - -tidyselect::one_of(name) - ) - } - } - dataframe -} - - - -# Load data -master.Table <- foreign::read.spss( - file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav") - ) %>% - as.data.frame() %>% - dplyr::mutate( - GenomeScan_ID = stringr::str_trim(GenomeScan_ID), - gender = forcats::fct_recode( - Gender, - female = "f", - male = "m", - other = "" - ), - age = as.numeric(Age), - smoking.status = forcats::fct_recode( - Smoking_status, - `Ex-smoker` = "ES ", - `Current smoker` = "CS ", - other = " " - ) - ) - -expression.data <- readr::read_tsv( - file.path(expression.dir, "20200427_103972-001_rawcounts.txt"), - col_types = readr::cols() - ) - -gene.data <- getGenedataByEnsemblId38( - ensemblIds = expression.data$Gene, - file.location = expression.dir - ) %>% - dplyr::group_by(hgnc_symbol) %>% - dplyr::filter( - dplyr::row_number() == 1, - !is.na(hgnc_symbol), - hgnc_symbol != "" - ) %>% - dplyr::ungroup() %>% - dplyr::select( - hgnc_symbol, - ensembl_gene_id, - chromosome_name, - transcript_start, - transcript_end - ) - - -master.Table <- master.Table %>% - dplyr::mutate( - Group_simple2 = stringr::str_trim(Group_simple2), - Group_simple = stringr::str_trim(Group_simple), - T_number = as.character(T_number), - sample.id = stringr::str_trim(PRESTO_ID) - ) %>% - dplyr::filter( - # # According to Niek, I should not include this, for whatever reason - #!(GenomeScan_ID %in% c( - # "T02-01796", - # "T02-03095", - # "T02-10683", - # "T10-18671", - # "T12-12036" - # ) - # ), - # - # # (According to Niek, don't include) Water Controls - #!stringr::str_detect(T_number, pattern="Water"), - # - # # (According to Niek, don't include)never smoker controles - #!(Group_simple2 == "NS_Ctrl"), - # - # # (According to Niek, don't include)ALFA1 patiƫnten - #!(Group_simple == "ALFA"), - # - #stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y", - GenomeScan_ID %in% colnames(expression.data), - !is.na(sample.id), - sample.id != "" - ) - -expression.data <- expression.data %>% - select.columns.in.order( - c("Gene", master.Table$GenomeScan_ID) - )