fastqc: simple comment. trimming: reverted back. overall qc: simplified the scripts and made sure to add instructions.

2021-02-10 13:42:51 +01:00
parent 135f231c86
commit 648cd09a0e
6 changed files with 64 additions and 813 deletions
--- a/rnaseq/step1_fastqc/snippet.sh
+++ b/rnaseq/step1_fastqc/snippet.sh
@@ -1,34 +1,5 @@
-#!/bin/bash
+# The command to do a FastQC on a fastq file is
-#SBATCH --job-name=fastqc
+file="file_to_analyse.fq.gz"
-#SBATCH --time=0-12:00:00
+fastqc_out="./path/to/fastqc/output/dir"
 #SBATCH --ntasks=1
 #SBATCH --mem=15G
 #SBATCH --qos=regular
-
+fastqc -o "$fastqc_out" "$file"
 set -e
 set -u
 set -x
 set -o pipefail
 module purge
 module load Java
 module load FastQC
 dir_raw_fastq="$(pwd)/fastq/raw"
 dir_fastqc="$(pwd)/fastqc"
 [ -d "${dir_fastqc}" ] || mkdir -p "${dir_fastqc}"
 # Run FastQC
 files=$(find -L "$dir_raw_fastq" -type f -iname "*.fastq.gz")
 for file in $files; do
 	filename=$(basename "$file")
 	fastqc -o "$dir_fastqc" "$file" &
 done
 wait
--- a/rnaseq/step2_trim/snippet.sh
+++ b/rnaseq/step2_trim/snippet.sh
@@ -1,64 +0,0 @@
 #!/bin/bash
 #SBATCH --job-name=trimming
 #SBATCH --time=2-00:00:00
 #SBATCH --ntasks=1
 #SBATCH --mem=15G
 #SBATCH --qos=regular
 set -e
 set -u
 set -x
 set -o pipefail
 module purge
 module load TrimGalore
 dir_raw_fastq="$(pwd)/fastq/raw"
 dir_trimmed_fastq="$(pwd)/fastq/trimmed"
 dir_trimmed_fastq_reports="$(pwd)/fastq/trimmed/reports"
 adapter_3p="TGGAATTCTCGG" # is _R1
 adapter_5p="GATCGTCGGACT" # is _R2
 [ -d "${dir_trimmed_fastq_reports}" ] || mkdir -p "${dir_trimmed_fastq_reports}"
 # Trim all adapters from the sequences
 while IFS=, read -r GSID Sample
 do
  echo ">> Executing trimming of $GSID (${#GSID})"
  if [ "${#GSID}" == "18" ]; then # check length - don't include sample pools
    for fqFile in $(find -L "$dir_raw_fastq" -maxdepth 1 -type f -iname "*${GSID}*.fastq.gz"); do
      newFilename=${fqFile/.fastq.gz/_trimmed.fq.gz}
      if [ -f "${newFilename}" ]; then
        echo ">>File exists ${newFilename}. Skipping"
      else
        echo ">>File: ${fqFile}"
        if grep -q "_R1" <<< $(basename "$fqFile"); then
          adapter_seq=$adapter_3p
        elif grep -q "_R2" <<< $(basename "$fqFile"); then
          adapter_seq=$adapter_5p
        fi
        echo ">>Trimming with sequence ${adapter_seq}"
        trim_galore \
          --adapter "$adapter_seq" \
          --length $(printf "$adapter_seq" | wc -m) \
          --output_dir "${dir_trimmed_fastq}" \
          --fastqc_args "--noextract" \
          "${fqFile}" &
      fi
    done
  else
    echo "Skipped."
  fi
 done < samples.csv
 wait
 mv ${dir_trimmed_fastq}/*.zip "${dir_trimmed_fastq_reports}"
 mv ${dir_trimmed_fastq}/*.txt "${dir_trimmed_fastq_reports}"
 mv ${dir_trimmed_fastq}/*.html "${dir_trimmed_fastq_reports}"
--- a/rnaseq/step6_overall_QC/01
+++ b/rnaseq/step6_overall_QC/01
@@ -1,7 +1,15 @@
 # Principle Component Analysis
 # Normalized with limma::voom
 library(limma)
 library(tidyverse)
-source("__ - Preloader.R", verbose=T)
+
 # expression.data. Has columns:
 # - Gene (gene identifier)
 # - [Sample identifiers]
 expression.data <- "mrna_counts_table.csv"
 # results.dir. The directory of where to put the resulting tables (and later your plots.)
 results.dir <- "results"
 # PCA variables
@@ -10,10 +18,7 @@ do.scale = FALSE
 # The analysis
-master.Table %>% readr::write_csv(
+# We ise prcomp to calculate the PCAs. Afterwards you should plot the results.
 	file.path(results.dir, "patient.table.csv")
 )
 norm.expr.data <- expression.data %>%
 	tibble::column_to_rownames("Gene")
 norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>%
@@ -69,132 +74,8 @@ data.frame(
 # Not saved: pcs.summery$sdev,
-
+# Next thing to do:
-# Plot PCAs
+# - (Optional) scree plot - to determine the optimal cutoff for PCA inclusion based on explaination of variance
-# https://github.com/kevinblighe/PCAtools
+# - (Optional) eigencorplot - to correlate PCAs to clinical variables so that you know which PCA to include for which analysis
-results.dir.pca.plot <- file.path(results.dir.pca, "img")
+# - (Optional) pairsplot - plot multiple PCAs against each other in a single figure
-dir.create(results.dir.pca.plot)
+# - Plot the first couple of PCAs against each other
 metadata <- master.Table %>%
 	dplyr::filter(
 		!is.na(GenomeScan_ID)
 	) %>%
 	tibble::column_to_rownames("GenomeScan_ID") %>%
 	select.rows.in.order(
 		colnames(norm.expr.data)
 	)
 p <- pca(norm.expr.data,
 	metadata = metadata, 
  	center = do.center,
  	scale = do.scale
 )
 elbow <- findElbowPoint(p$variance)
 metavars <- c('Age','Gender','Smoking_status','COPD_Y_or_N','SEO_COPD_Y_or_N','GOLD_stage')
 png(filename = file.path(results.dir.pca.plot,"scree_plot.png"),
    width = 800, height = 800,
    units = "px", pointsize = 12,
    type = "Xlib")
 print(screeplot(p,
 		axisLabSize = 12,
 		titleLabSize = 12,
 	    components = getComponents(p, 1:(elbow+5)),
 	    vline = c(elbow)
    ) +
    geom_label(
    	aes(
    		x = elbow + 1,
    		y = 50,
    		label = 'Elbow method',
    		vjust = -1,
    		size = 8
    	)
    ))
 dev.off()
 png(filename = file.path(results.dir.pca.plot, "eigen_corr_plot.png"),
    width = 1200, height = 1200,
    units = "px", pointsize = 12,
    type = "Xlib")
 print(eigencorplot(p,
    metavars = metavars
 ))
 dev.off()
 dir.create(file.path(results.dir.pca.plot, "pairsplot"))
 for (var in metavars) {
 	png(
 		filename = file.path(results.dir.pca.plot, "pairsplot", paste0(var,".png")),
 	    width = 1200, height = 1200,
 	    units = "px", pointsize = 12,
 	    type = "Xlib"
    )
 	print(pairsplot(p,
 	    components = getComponents(p, c(1:(elbow+1))),
 	    triangle = TRUE,
 	    trianglelabSize = 12,
 	    hline = 0, vline = 0,
 	    pointSize = 0.4,
 	    gridlines.major = FALSE,
 	    gridlines.minor = FALSE,
 	    colby = var,
 	    title = paste0('Pairs plot: ',var),
 	    plotaxes = TRUE
 	))
 	dev.off()
 }
 # Plot PCAs - old failure
 pca.combinations <- combinations(
 		n = (elbow+1),
 		r = 2,
 		v = 1:(elbow+1),
 		repeats.allowed = FALSE
 	)
 dir.create(file.path(results.dir.pca.plot, "biplots"))
 for (var in metavars) {
 	for (i in 1:nrow(pca.combinations)) {
 		pca.combi <- pca.combinations[i,]
 		pca.title <- paste(paste0("PC", pca.combi), collapse="_")
 		png(
 			filename = file.path(results.dir.pca.plot, "biplots", paste0(var, "-", pca.title, ".png")),
 		    width = 800, height = 800,
 		    units = "px", pointsize = 12,
 		    type = "Xlib"
 	    )
 		print(
 			autoplot(
 				norm.expr.data.pcs,
 				data = master.Table %>%
 					dplyr::filter(
 						!is.na(GenomeScan_ID)
 					) %>%
 					tibble::column_to_rownames("GenomeScan_ID") %>%
 					select.rows.in.order(
 						rownames(norm.expr.data.pcs$x)
 					),
 				x = pca.combi[1],
 				y = pca.combi[2],
 				colour = var,
 				loadings = FALSE,
 				loadings.label = FALSE,
 				#label = FALSE,
 				label.size = 3
 		    ) +
 		    ggprism::theme_prism() + 
 		    #ggprism::scale_colour_prism() + 
 		    ggprism::scale_shape_prism() +
 		    ggplot2::labs(subtitle = paste0(str_to_title(var), " (", pca.title,")"))
 		)
 		dev.off()
 	}
 }
--- a/rnaseq/step6_overall_QC/02
+++ b/rnaseq/step6_overall_QC/02
@@ -1,16 +1,28 @@
 # Gender QC
 # Normalized with limma::voom
 library(GSVA)
 library(limma)
 library(edgeR)
 library(tidyverse)
-source("__ - Preloader.R", verbose=T)
+
 # expression.data. Has columns:
 # - Gene (gene identifier)
 # - [Sample identifiers]
 expression.data <- "mrna_counts_table.csv"
 # master.Table. Has columns:
 # - GenomeScan_ID
 # - gender, levels = c("male", "female")
 # - age
 # - factor(smoking.status, levels = c("Ex-smoker", "Current smoker"))
 master.Table <- "patient_table.csv"
 # results.dir. The directory of where to put the resulting tables (and later your plots.)
 results.dir <- "results"
 # The analysis
-norm.expr.data <- expression.data %>%
+# We first do a differential expression analysis on gender using EdgeR.
-    tibble::column_to_rownames("Gene")
+# Afterwards you should plot these results.
 norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>%
    limma::voom() %>%
    as.matrix()
 x.genes <- gene.data %>%
    dplyr::filter(chromosome_name == "X") %>%
    dplyr::pull(ensembl_gene_id)
@@ -87,248 +99,12 @@ gender.qc.genes.to.plot <- gender.qc.results %>%
            chromosome_name %in% c("X", "Y") &
            FDR < 0.05 &
            dplyr::row_number() <= 5
        ) |
        hgnc_symbol %in% c(
            "XIST",
            "TSIX",
            "KDM6A",
            "ZFX",
            "KDM5C",
            "ZFY-AS1",
            "ARSDP1",
            "GYG2P1",
            "RBMY2JP",
            "ARSLP1"
        )
    )
 gender.qc.data <- as.data.frame(norm.expr.data) %>%
    rownames_to_column("ensembl.id") %>%
    tidyr::gather(
        key = "rna.seq.sample.id",
        value = "expr.value",
        -ensembl.id
    ) %>%
    dplyr::filter(
        ensembl.id %in% gender.qc.genes.to.plot$ensembl.id
    ) %>%
    dplyr::left_join(
        y = gender.qc.patients,
        by = c("rna.seq.sample.id" = "GenomeScan_ID")
    ) %>%
    #dplyr::filter(
    #    !is.na(gender)
    #) %>%
    readr::write_csv(
        file.path(results.dir.gender.plot, "plot.data.voom.csv")
    )
-for (chr in gender.qc.genes.to.plot$chromosome_name) {
+# Next thing to do:
-    current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>%
+# - Plot the normalized expressino values for the genes in gender.qc.genes.to.plot in a boxplot, split and colored by gender.
-        dplyr::filter(chromosome_name == chr)
+# - (Optional) Do a GSVA with as genesets the genes found in gender.qc.genes.to.plot. Plot the boxplots as per the previous point.
-    chromosome_name <- chr
+# - (Optional) Plot the number of Y-chromosome reads devided by the number of X chromosome reads in a boxplot as per the first point.
-    i <- 0
+# - (Optional) Plot the number of Y-chromosome SNPs devided by the number of X chromosome SNPs in a boxplot as per the first point.
    for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) {
        i <- i + 1
        hgnc_symbol <- gene.data %>%
            dplyr::filter(
                ensembl_gene_id == current.ensembl.id
            ) %>%
            dplyr::pull(hgnc_symbol)
        # calculate outliers, kinda
        plot.data <- gender.qc.data %>%
            dplyr::filter(
                ensembl.id == current.ensembl.id
            ) %>%
            dplyr::mutate(
                gender = dplyr::case_when(
                    is.na(gender) | (stringr::str_trim(gender) == "") ~ "other",
                    TRUE ~ gender
                )
            )
        if (nrow(plot.data) <= 0) {
            next
        }
        outliers <- boxplot(
            formula = expr.value ~ gender,
            data = plot.data,
            plot = FALSE
        )$out
        result.to.annotate <- plot.data %>%
            dplyr::filter(
                expr.value %in% outliers
            )
        # Visual: plot range (for t-test p-value)
        plot.y.range <- c(
            "min" = as.integer(min(plot.data$expr.value) - 1) ,
            "max" = as.integer(max(plot.data$expr.value) + 1)
        )
        plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05)
        plot.y.range["min"] <- plot.y.range["min"] - plot.margin
        plot.y.range["max"] <- plot.y.range["max"] + plot.margin
        # Plot the damn thing as if it is Graphpad Prism
        stat.table <- rstatix::t_test(plot.data, expr.value ~ gender)
        plt <- plot.data %>%
            ggplot2::ggplot(
                mapping = ggplot2::aes(
                    x = gender,
                    y = expr.value
                ) 
            ) +
            ggplot2::geom_jitter(
                mapping = ggplot2::aes(
                    colour = gender,
                    shape = gender
                ),
                width = 0.1
            ) +
            ggrepel::geom_text_repel(
                data = result.to.annotate,
                mapping = ggplot2::aes(
                    label = sample.id
                ),
                size = 2,
                box.padding = unit(0.35, "lines"),
                point.padding = unit(0.3, "lines")
            ) +
              ggplot2::stat_summary(
                  fun = "mean",
                  geom = "crossbar",
                  width = 0.3,
                  size = 0.3
              ) + 
            ggplot2::scale_y_continuous(
                limits = plot.y.range,
                guide = "prism_offset"
            ) +
            #ggprism::add_pvalue(
            #    stat.table,
            #    y.position = plot.y.range["max"]
            #) +
            ggprism::theme_prism() + 
            ggprism::scale_colour_prism() + 
            ggprism::scale_shape_prism() +
            ggplot2::theme(
                legend.position = "none"
            ) + 
            ggplot2::labs(
                subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"),
                x = "Gender",
                y = "Normalised Expression Values"
            )
        ggplot2::ggsave(
            filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")),
            plot = plt,
            width = 12.5,
            height = 12.5,
            unit = "cm"
        )
    }
 }
 # Let's try a GSVA
 gsva.groups <- list(
    X = gender.qc.genes.to.plot %>%
        dplyr::filter(chromosome_name == "X") %>%
        dplyr::pull(ensembl.id),
    Y = gender.qc.genes.to.plot %>%
        dplyr::filter(chromosome_name == "Y") %>%
        dplyr::pull(ensembl.id)
 )
 gsva_res = GSVA::gsva(
  norm.expr.data,
  gsva.groups,
  mx.diff = TRUE,
  verbose = FALSE,
  parallel.sz = 1
 )
 gender.qc.gsva.data <- as.data.frame(gsva_res) %>%
    rownames_to_column("gsva.group") %>%
    tidyr::gather(
        key = "rna.seq.sample.id",
        value = "gsva.value",
        -gsva.group
    ) %>%
    dplyr::left_join(
        y = gender.qc.patients %>%
          dplyr::select(
              GenomeScan_ID,
              sample.id,
              gender
          ),
        by = c("rna.seq.sample.id" = "GenomeScan_ID")
    ) %>%
    readr::write_csv(
        file.path(results.dir.gender.plot, "plot.data.gsva.csv")
    )
 for (c.gender in unique(gender.qc.gsva.data$gender)) {
    if (is.na(c.gender)) {
        next
    }
    c.plot.data <- gender.qc.gsva.data %>%
        dplyr::filter(
            gender == c.gender
        )
    outliers <- boxplot(
        formula = gsva.value ~ gsva.group,
        data = c.plot.data,
        plot = FALSE
    )$out
    result.to.annotate <- c.plot.data %>%
        dplyr::filter(
            gsva.value %in% outliers
        )
    plt <- c.plot.data %>%
        ggplot2::ggplot(
            mapping = ggplot2::aes(
                x = gsva.group,
                y = gsva.value
            ) 
        ) +
        ggplot2::geom_boxplot() +
        ggrepel::geom_text_repel(
            data = result.to.annotate,
            mapping = ggplot2::aes(
                label = sample.id
            ),
            size = 2,
            box.padding = unit(0.35, "lines"),
            point.padding = unit(0.3, "lines")
        ) +
        ggprism::theme_prism() + 
        ggprism::scale_colour_prism() + 
        ggprism::scale_shape_prism() +
        ggplot2::theme(
            legend.position = "none"
        ) + 
        ggplot2::labs(
            subtitle = paste0("", toupper(c.gender)),
            x = "Chromosome",
            y = "GSVA Values"
        )
    ggplot2::ggsave(
        filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")),
        plot = plt,
        width = 12.5,
        height = 12.5,
        unit = "cm"
    )
 }
--- a/rnaseq/step6_overall_QC/03
+++ b/rnaseq/step6_overall_QC/03
@@ -1,17 +1,25 @@
 # Total counts per sample
 # Normalized with limma::voom
 library(limma)
 library(tidyverse)
-source("__ - Preloader.R", verbose=T)
+
 # expression.data. Has columns:
 # - Gene (gene identifier)
 # - [Sample identifiers]
 expression.data <- "mrna_counts_table.csv"
 # master.Table. Has columns:
 # - GenomeScan_ID
 # - gender, levels = c("male", "female")
 # - age
 # - factor(smoking.status, levels = c("Ex-smoker", "Current smoker"))
 master.Table <- "patient_table.csv"
 # results.dir. The directory of where to put the resulting tables (and later your plots.)
 results.dir <- "results"
 # The analysis
-norm.expr.data <- expression.data %>%
+# We calculate the number of mapped reads per sample.
 	tibble::column_to_rownames("Gene")
 norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>%
 	limma::voom() %>%
 	as.matrix()
 # Total counts per sample
 total.count.per.sample <- expression.data %>%
 	tibble::column_to_rownames("Gene") %>%
 	colSums()
@@ -23,141 +31,9 @@ data.frame(
 	readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv"))
-norm.data <- norm.expr.data %>%
+# Next thing to do:
-  as.data.frame() %>%
+# - Check the number of reads per sample in total.counts.per.sample.csv
-  tibble::rownames_to_column(
+# - Plot the reads distribution (all reads) per sample in a boxplot.
-    "Gene"
+# - (Optional) Calculate the number of unmapped, multimapped, unique mapped to
-  ) %>%
+#   feature and unique mapped to no feature and plot these in a stacked bar graph.
  tidyr::gather(
    key = "sample.id",
    value = "expr.value",
    -Gene
  ) %>%
  dplyr::left_join(
    y = master.Table %>%
      dplyr::filter(
          !is.na(GenomeScan_ID)
      ) %>%
      dplyr::mutate(
          id = dplyr::case_when(
            stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()),
            TRUE ~ sample.id
          ),
          gender = dplyr::case_when(
            stringr::str_trim(gender) == "" ~ "water",
            !is.na(gender) ~ as.character(gender)
          )
      ) %>%
      dplyr::select(
        GenomeScan_ID,
        gender,
        id
      ),
    by = c("sample.id" = "GenomeScan_ID")
  )
 norm.plot <- norm.data %>%
  ggplot2::ggplot(
    mapping = ggplot2::aes(
      x = id,
      y = expr.value,
      fill = gender
    )
  ) +
  ggplot2::geom_boxplot() +
  ggplot2::scale_fill_manual(
    values = c(
      "male" = "blue",
      "female" = "red",
      "water" = "green"
    )
  ) +
  ggplot2::labs(
    title = "Normalized expression values distribution",
    y = "Normalized expression values (limma::voom)",
    x = "Sample",
    gender = "Gender"
  ) +
  ggprism::theme_prism() +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 90)
  )
 ggplot2::ggsave(
  filename = file.path(results.dir, "counts.per.sample.normalised.png"),
  plot = norm.plot,
  width = 40,
  height = 20,
  units = "cm"
 )
 expr.data <- expression.data %>%
  tidyr::gather(
    key = "sample.id",
    value = "expr.value",
    -Gene
  ) %>%
  dplyr::filter(
    expr.value != 0
  ) %>%
  dplyr::left_join(
    y = master.Table %>%
      dplyr::filter(
          !is.na(GenomeScan_ID)
      ) %>%
      dplyr::mutate(
          id = dplyr::case_when(
            stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()),
            TRUE ~ sample.id
          ),
          gender = dplyr::case_when(
            stringr::str_trim(gender) == "" ~ "water",
            !is.na(gender) ~ as.character(gender)
          )
      ) %>%
      dplyr::select(
        GenomeScan_ID,
        gender,
        id
      ),
    by = c("sample.id" = "GenomeScan_ID")
  )
 expr.plot <- expr.data %>%
  ggplot2::ggplot(
    mapping = ggplot2::aes(
      x = id,
      y = expr.value,
      fill = gender
    )
  ) +
  ggplot2::geom_boxplot() +
  ggplot2::scale_fill_manual(
    values = c(
      "male" = "blue",
      "female" = "red",
      "water" = "green"
    )
  ) +
  ggplot2::scale_y_continuous(trans='log2')  +
  ggplot2::labs(
    title = "Raw expression values distribution, without zero's",
    y = "Expression values",
    x = "Sample",
    gender = "Gender"
  ) +
  ggprism::theme_prism() +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 90)
  )
 ggplot2::ggsave(
  filename = file.path(results.dir, "counts.per.sample.raw.zeros.removed.png"),
  plot = expr.plot,
  width = 40,
  height = 20,
  units = "cm"
 )
--- a/rnaseq/step6_overall_QC/__
+++ b/rnaseq/step6_overall_QC/__
@@ -1,189 +0,0 @@
 library(tidyverse)
 library(ggfortify)
 library(ggprism)
 library(limma)
 library(biomaRt)
 library(PCAtools)
 library(gtools)
 library(edgeR)
 library(ggprism)
 library(foreign)
 # Global variables
 results.dir <- file.path("results.nosync", "RNA-Seq QC")
 data.dir <- "Data"
 patient.dir <- file.path(data.dir, "Patients")
 sample.dir <- file.path(data.dir, "Samples")
 expression.dir <- file.path(data.dir, "mRNA - RNA-Seq")
 dir.create(results.dir, recursive = TRUE)
 ####
 # Helper functions
 ####
 select.columns.in.order <- function(dataframe, columns) {
  dataframe[, columns]
 }
 select.rows.in.order <- function(dataframe, rows) {
  dataframe[rows,]
 }
 getGenedataByEnsemblId38 <- function(ensemblIds, file.location) {
  file.name <- file.path(file.location, "genes_info_hg38.csv")
  if (!file.exists(file.name)) {
    if (!("mart" %in% ls())) {
      assign("mart", useEnsembl(
        biomart = "ENSEMBL_MART_ENSEMBL",
        dataset = "hsapiens_gene_ensembl"
      ))
    }
    gene.list <- getBM(
      filters = "ensembl_gene_id",
      attributes = c(
        "hgnc_symbol",
        "ensembl_gene_id",
        "ensembl_transcript_id",
        "chromosome_name",
        "start_position",
        "end_position",
        "strand",
        "transcription_start_site",
        "transcript_start",
        "transcript_end",
        "external_gene_name"
      ),
      values = as.character(ensemblIds),
      mart = mart
    )
    readr::write_csv(gene.list, path = file.name)
  }
  return(
    readr::read_csv(
    	file.name,
 		col_types = readr::cols()
 	)
  )
 }
 #remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) {
 #  dataframe %>%
 #  dplyr::filter(
 #    rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount
 #  )
 #}
 limma.voom.convert.column <- function(dataframe, columnname) {
  dataframe %>%
  tibble::column_to_rownames(columnname) %>%
  limma::voom() %>%
  as.data.frame() %>%
  tibble::rownames_to_column(columnname)
 }
 select.columns.in.order <- function(dataframe, columns) {
  dataframe[, columns]
 }
 drop.columns.if.all.same.value <- function(dataframe) {
  for (name in colnames(dataframe)) {
    is.all.same <- (dataframe[, name] %>% unique() %>% length()) <= 1
    if (is.all.same) {
      dataframe <- dataframe %>%
        dplyr::select(
          -tidyselect::one_of(name)
        )
    }
  }
  dataframe
 }
 # Load data
 master.Table <- foreign::read.spss(
 		file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav")
 	) %>%
  as.data.frame() %>%
  dplyr::mutate(
    GenomeScan_ID = stringr::str_trim(GenomeScan_ID),
    gender = forcats::fct_recode(
      Gender,
      female = "f",
      male = "m",
      other = ""
    ),
    age = as.numeric(Age),
    smoking.status = forcats::fct_recode(
      Smoking_status,
      `Ex-smoker` = "ES        ",
      `Current smoker` = "CS        ",
      other = "          "
    )
  )
 expression.data <- readr::read_tsv(
 		file.path(expression.dir, "20200427_103972-001_rawcounts.txt"),
 		col_types = readr::cols()
 	)
 gene.data <- getGenedataByEnsemblId38(
    ensemblIds = expression.data$Gene,
    file.location = expression.dir
  ) %>%
  dplyr::group_by(hgnc_symbol) %>%
  dplyr::filter(
    dplyr::row_number() == 1,
    !is.na(hgnc_symbol),
    hgnc_symbol != ""
  ) %>%
  dplyr::ungroup() %>%
  dplyr::select(
    hgnc_symbol,
    ensembl_gene_id,
    chromosome_name,
    transcript_start,
    transcript_end
  )
 master.Table <- master.Table %>%
  dplyr::mutate(
    Group_simple2 = stringr::str_trim(Group_simple2),
    Group_simple  = stringr::str_trim(Group_simple),
    T_number = as.character(T_number),
    sample.id = stringr::str_trim(PRESTO_ID)
  ) %>%
  dplyr::filter(
    # # According to Niek, I should not include this, for whatever reason
    #!(GenomeScan_ID %in% c(
    #    "T02-01796",
    #    "T02-03095",
    #    "T02-10683",
    #    "T10-18671",
    #    "T12-12036"
    #  )
    # ),
    # 
    # # (According to Niek, don't include) Water Controls
    #!stringr::str_detect(T_number, pattern="Water"),
    # 
    # # (According to Niek, don't include)never smoker controles
    #!(Group_simple2 == "NS_Ctrl"),
    # 
    # # (According to Niek, don't include)ALFA1 patiënten 
    #!(Group_simple == "ALFA"),
    # 
    #stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y",
    GenomeScan_ID %in% colnames(expression.data),
    !is.na(sample.id),
    sample.id != ""
  )
 expression.data <- expression.data %>%
  select.columns.in.order(
    c("Gene", master.Table$GenomeScan_ID)
  )