diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..57ef2cc --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ + +.DS_Store +*.nosync +*.RData diff --git a/rnaseq/.DS_Store b/rnaseq/.DS_Store new file mode 100644 index 0000000..8933a0f Binary files /dev/null and b/rnaseq/.DS_Store differ diff --git a/rnaseq/step1_fastqc/snippet.sh b/rnaseq/step1_fastqc/snippet.sh index e69de29..0eb5213 100644 --- a/rnaseq/step1_fastqc/snippet.sh +++ b/rnaseq/step1_fastqc/snippet.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --job-name=fastqc +#SBATCH --time=0-12:00:00 +#SBATCH --ntasks=1 +#SBATCH --mem=15G +#SBATCH --qos=regular + + +set -e +set -u +set -x +set -o pipefail + + +module purge +module load Java +module load FastQC + + +dir_raw_fastq="$(pwd)/fastq/raw" +dir_fastqc="$(pwd)/fastqc" + + +[ -d "${dir_fastqc}" ] || mkdir -p "${dir_fastqc}" + + +# Run FastQC +files=$(find -L "$dir_raw_fastq" -type f -iname "*.fastq.gz") +for file in $files; do + filename=$(basename "$file") + fastqc -o "$dir_fastqc" "$file" & +done + +wait diff --git a/rnaseq/step2_trim/snippet.sh b/rnaseq/step2_trim/snippet.sh index e69de29..6c4eb2e 100644 --- a/rnaseq/step2_trim/snippet.sh +++ b/rnaseq/step2_trim/snippet.sh @@ -0,0 +1,64 @@ +#!/bin/bash +#SBATCH --job-name=trimming +#SBATCH --time=2-00:00:00 +#SBATCH --ntasks=1 +#SBATCH --mem=15G +#SBATCH --qos=regular + + +set -e +set -u +set -x +set -o pipefail + + +module purge +module load TrimGalore + + +dir_raw_fastq="$(pwd)/fastq/raw" +dir_trimmed_fastq="$(pwd)/fastq/trimmed" +dir_trimmed_fastq_reports="$(pwd)/fastq/trimmed/reports" + +adapter_3p="TGGAATTCTCGG" # is _R1 +adapter_5p="GATCGTCGGACT" # is _R2 + + +[ -d "${dir_trimmed_fastq_reports}" ] || mkdir -p "${dir_trimmed_fastq_reports}" + + +# Trim all adapters from the sequences +while IFS=, read -r GSID Sample +do + echo ">> Executing trimming of $GSID (${#GSID})" + if [ "${#GSID}" == "18" ]; then # check length - don't include sample pools + for fqFile in $(find -L "$dir_raw_fastq" -maxdepth 1 -type f -iname "*${GSID}*.fastq.gz"); do + newFilename=${fqFile/.fastq.gz/_trimmed.fq.gz} + if [ -f "${newFilename}" ]; then + echo ">>File exists ${newFilename}. Skipping" + else + echo ">>File: ${fqFile}" + if grep -q "_R1" <<< $(basename "$fqFile"); then + adapter_seq=$adapter_3p + elif grep -q "_R2" <<< $(basename "$fqFile"); then + adapter_seq=$adapter_5p + fi + echo ">>Trimming with sequence ${adapter_seq}" + trim_galore \ + --adapter "$adapter_seq" \ + --length $(printf "$adapter_seq" | wc -m) \ + --output_dir "${dir_trimmed_fastq}" \ + --fastqc_args "--noextract" \ + "${fqFile}" & + fi + done + else + echo "Skipped." + fi +done < samples.csv + + +wait +mv ${dir_trimmed_fastq}/*.zip "${dir_trimmed_fastq_reports}" +mv ${dir_trimmed_fastq}/*.txt "${dir_trimmed_fastq_reports}" +mv ${dir_trimmed_fastq}/*.html "${dir_trimmed_fastq_reports}" \ No newline at end of file diff --git a/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R new file mode 100644 index 0000000..09e4dff --- /dev/null +++ b/rnaseq/step6_overall_QC/01 - Principle Component Analysis.R @@ -0,0 +1,200 @@ +# Principle Component Analysis +# Normalized with limma::voom + +source("__ - Preloader.R", verbose=T) + + +# PCA variables +do.center = TRUE +do.scale = FALSE + + +# The analysis +master.Table %>% readr::write_csv( + file.path(results.dir, "patient.table.csv") +) + +norm.expr.data <- expression.data %>% + tibble::column_to_rownames("Gene") +norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% + limma::voom() %>% + as.matrix() + +# Principle Component analysis +results.dir.pca <- file.path(results.dir, "principle.components") +dir.create(results.dir.pca, recursive=TRUE) + +norm.expr.data.pcs <- norm.expr.data %>% + t() %>% + stats::prcomp( + center = do.center, + scale. = do.scale + ) + +# Write summary of PCAs to files +pcs.summery <- summary(norm.expr.data.pcs) +pcs.summery$importance %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column("PC.name") %>% + readr::write_csv( + file.path(results.dir.pca, "importance.csv") + ) + +pcs.summery$x %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column("ensembl.id") %>% + readr::write_csv( + file.path(results.dir.pca, "values.csv") + ) + +pcs.summery$rotation %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column("sample.id") %>% + readr::write_csv( + file.path(results.dir.pca, "rotation.csv") + ) + +data.frame( + rownames = names(pcs.summery$center), + center = pcs.summery$center, + scale = pcs.summery$scale + ) %>% + readr::write_csv( + file.path(results.dir.pca, "rest.csv") + ) + +# Not saved: pcs.summery$sdev, + + + +# Plot PCAs +# https://github.com/kevinblighe/PCAtools +results.dir.pca.plot <- file.path(results.dir.pca, "img") +dir.create(results.dir.pca.plot) + +metadata <- master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + tibble::column_to_rownames("GenomeScan_ID") %>% + select.rows.in.order( + colnames(norm.expr.data) + ) + + +p <- pca(norm.expr.data, + metadata = metadata, + center = do.center, + scale = do.scale +) +elbow <- findElbowPoint(p$variance) +metavars <- c('Age','Gender','Smoking_status','COPD_Y_or_N','SEO_COPD_Y_or_N','GOLD_stage') + + +png(filename = file.path(results.dir.pca.plot,"scree_plot.png"), + width = 800, height = 800, + units = "px", pointsize = 12, + type = "Xlib") +print(screeplot(p, + axisLabSize = 12, + titleLabSize = 12, + components = getComponents(p, 1:(elbow+5)), + vline = c(elbow) + ) + + geom_label( + aes( + x = elbow + 1, + y = 50, + label = 'Elbow method', + vjust = -1, + size = 8 + ) + )) +dev.off() + + +png(filename = file.path(results.dir.pca.plot, "eigen_corr_plot.png"), + width = 1200, height = 1200, + units = "px", pointsize = 12, + type = "Xlib") +print(eigencorplot(p, + metavars = metavars +)) +dev.off() + + +dir.create(file.path(results.dir.pca.plot, "pairsplot")) +for (var in metavars) { + png( + filename = file.path(results.dir.pca.plot, "pairsplot", paste0(var,".png")), + width = 1200, height = 1200, + units = "px", pointsize = 12, + type = "Xlib" + ) + print(pairsplot(p, + components = getComponents(p, c(1:(elbow+1))), + triangle = TRUE, + trianglelabSize = 12, + hline = 0, vline = 0, + pointSize = 0.4, + gridlines.major = FALSE, + gridlines.minor = FALSE, + colby = var, + title = paste0('Pairs plot: ',var), + plotaxes = TRUE + )) + dev.off() +} + + + +# Plot PCAs - old failure +pca.combinations <- combinations( + n = (elbow+1), + r = 2, + v = 1:(elbow+1), + repeats.allowed = FALSE + ) + +dir.create(file.path(results.dir.pca.plot, "biplots")) +for (var in metavars) { + for (i in 1:nrow(pca.combinations)) { + pca.combi <- pca.combinations[i,] + pca.title <- paste(paste0("PC", pca.combi), collapse="_") + png( + filename = file.path(results.dir.pca.plot, "biplots", paste0(var, "-", pca.title, ".png")), + width = 800, height = 800, + units = "px", pointsize = 12, + type = "Xlib" + ) + print( + autoplot( + norm.expr.data.pcs, + data = master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + tibble::column_to_rownames("GenomeScan_ID") %>% + select.rows.in.order( + rownames(norm.expr.data.pcs$x) + ), + x = pca.combi[1], + y = pca.combi[2], + colour = var, + loadings = FALSE, + loadings.label = FALSE, + #label = FALSE, + label.size = 3 + ) + + ggprism::theme_prism() + + #ggprism::scale_colour_prism() + + ggprism::scale_shape_prism() + + ggplot2::labs(subtitle = paste0(str_to_title(var), " (", pca.title,")")) + ) + dev.off() + } +} + diff --git a/rnaseq/step6_overall_QC/02 - Gender Check.R b/rnaseq/step6_overall_QC/02 - Gender Check.R new file mode 100644 index 0000000..af7536d --- /dev/null +++ b/rnaseq/step6_overall_QC/02 - Gender Check.R @@ -0,0 +1,334 @@ +# Gender QC +# Normalized with limma::voom + +source("__ - Preloader.R", verbose=T) + + +# The analysis +norm.expr.data <- expression.data %>% + tibble::column_to_rownames("Gene") +norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% + limma::voom() %>% + as.matrix() + +x.genes <- gene.data %>% + dplyr::filter(chromosome_name == "X") %>% + dplyr::pull(ensembl_gene_id) + +y.genes <- gene.data %>% + dplyr::filter(chromosome_name == "Y") %>% + dplyr::pull(ensembl_gene_id) + +# Gender QC +results.dir.gender <- file.path(results.dir, "gender.check") +dir.create(results.dir.gender, recursive=TRUE) + +# Differential Expression on Gender +gender.qc.patients <- master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + dplyr::mutate( + gender = factor(gender, levels = c("male", "female")), + age = as.numeric(age), + smoking.status = factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) + ) %>% + dplyr::filter( + !is.na(gender) + ) +gender.qc.sample.order <- gender.qc.patients %>% + dplyr::pull(GenomeScan_ID) + +gender.qc.expression.data <- expression.data %>% + tibble::column_to_rownames("Gene") %>% + select.columns.in.order(gender.qc.sample.order) %>% + as.matrix() + +design <- model.matrix( ~0 + gender, data = gender.qc.patients) + +DGEL <- edgeR::DGEList(gender.qc.expression.data) +keep <- edgeR::filterByExpr(DGEL) +keep[names(keep) %in% x.genes] <- TRUE +keep[names(keep) %in% y.genes] <- TRUE +DGEL <- DGEL[keep, , keep.lib.sizes=FALSE] +DGEL <- edgeR::calcNormFactors(DGEL, method = "TMM") + +DGEL <- edgeR::estimateDisp(DGEL, design) +fit <- edgeR::glmQLFit(DGEL,design) + +contrasts <- limma::makeContrasts( + gender = gendermale - genderfemale, + levels = design +) +qlf <- edgeR::glmQLFTest(fit, contrast = contrasts[,"gender"]) + +gender.qc.results <- edgeR::topTags( + qlf, + n=nrow(DGEL) + )$table %>% + tibble::rownames_to_column("ensembl.id") %>% + dplyr::left_join( + y = gene.data, + by = c("ensembl.id" = "ensembl_gene_id") + ) %>% + readr::write_csv( + file.path(results.dir.gender, "differential.expression.on.gender.csv") + ) + +# Plotting of gender expression +results.dir.gender.plot <- file.path(results.dir.gender, "img") +dir.create(results.dir.gender.plot, recursive=TRUE) + +gender.qc.genes.to.plot <- gender.qc.results %>% + dplyr::arrange(PValue) %>% + dplyr::group_by(chromosome_name) %>% + dplyr::filter( + ( + chromosome_name %in% c("X", "Y") & + FDR < 0.05 & + dplyr::row_number() <= 5 + ) | + hgnc_symbol %in% c( + "XIST", + "TSIX", + "KDM6A", + "ZFX", + "KDM5C", + "ZFY-AS1", + "ARSDP1", + "GYG2P1", + "RBMY2JP", + "ARSLP1" + ) + ) + +gender.qc.data <- as.data.frame(norm.expr.data) %>% + rownames_to_column("ensembl.id") %>% + tidyr::gather( + key = "rna.seq.sample.id", + value = "expr.value", + -ensembl.id + ) %>% + dplyr::filter( + ensembl.id %in% gender.qc.genes.to.plot$ensembl.id + ) %>% + dplyr::left_join( + y = gender.qc.patients, + by = c("rna.seq.sample.id" = "GenomeScan_ID") + ) %>% + #dplyr::filter( + # !is.na(gender) + #) %>% + readr::write_csv( + file.path(results.dir.gender.plot, "plot.data.voom.csv") + ) + +for (chr in gender.qc.genes.to.plot$chromosome_name) { + current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>% + dplyr::filter(chromosome_name == chr) + chromosome_name <- chr + i <- 0 + for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) { + i <- i + 1 + + hgnc_symbol <- gene.data %>% + dplyr::filter( + ensembl_gene_id == current.ensembl.id + ) %>% + dplyr::pull(hgnc_symbol) + + # calculate outliers, kinda + plot.data <- gender.qc.data %>% + dplyr::filter( + ensembl.id == current.ensembl.id + ) %>% + dplyr::mutate( + gender = dplyr::case_when( + is.na(gender) | (stringr::str_trim(gender) == "") ~ "other", + TRUE ~ gender + ) + ) + + if (nrow(plot.data) <= 0) { + next + } + + outliers <- boxplot( + formula = expr.value ~ gender, + data = plot.data, + plot = FALSE + )$out + + result.to.annotate <- plot.data %>% + dplyr::filter( + expr.value %in% outliers + ) + + # Visual: plot range (for t-test p-value) + plot.y.range <- c( + "min" = as.integer(min(plot.data$expr.value) - 1) , + "max" = as.integer(max(plot.data$expr.value) + 1) + ) + plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05) + plot.y.range["min"] <- plot.y.range["min"] - plot.margin + plot.y.range["max"] <- plot.y.range["max"] + plot.margin + + # Plot the damn thing as if it is Graphpad Prism + stat.table <- rstatix::t_test(plot.data, expr.value ~ gender) + plt <- plot.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = gender, + y = expr.value + ) + ) + + ggplot2::geom_jitter( + mapping = ggplot2::aes( + colour = gender, + shape = gender + ), + width = 0.1 + ) + + ggrepel::geom_text_repel( + data = result.to.annotate, + mapping = ggplot2::aes( + label = sample.id + ), + size = 2, + box.padding = unit(0.35, "lines"), + point.padding = unit(0.3, "lines") + ) + + ggplot2::stat_summary( + fun = "mean", + geom = "crossbar", + width = 0.3, + size = 0.3 + ) + + ggplot2::scale_y_continuous( + limits = plot.y.range, + guide = "prism_offset" + ) + + #ggprism::add_pvalue( + # stat.table, + # y.position = plot.y.range["max"] + #) + + ggprism::theme_prism() + + ggprism::scale_colour_prism() + + ggprism::scale_shape_prism() + + ggplot2::theme( + legend.position = "none" + ) + + ggplot2::labs( + subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"), + x = "Gender", + y = "Normalised Expression Values" + ) + + ggplot2::ggsave( + filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")), + plot = plt, + width = 12.5, + height = 12.5, + unit = "cm" + ) + } +} + +# Let's try a GSVA +gsva.groups <- list( + X = gender.qc.genes.to.plot %>% + dplyr::filter(chromosome_name == "X") %>% + dplyr::pull(ensembl.id), + Y = gender.qc.genes.to.plot %>% + dplyr::filter(chromosome_name == "Y") %>% + dplyr::pull(ensembl.id) +) + +gsva_res = GSVA::gsva( + norm.expr.data, + gsva.groups, + mx.diff = TRUE, + verbose = FALSE, + parallel.sz = 1 +) + + +gender.qc.gsva.data <- as.data.frame(gsva_res) %>% + rownames_to_column("gsva.group") %>% + tidyr::gather( + key = "rna.seq.sample.id", + value = "gsva.value", + -gsva.group + ) %>% + dplyr::left_join( + y = gender.qc.patients %>% + dplyr::select( + GenomeScan_ID, + sample.id, + gender + ), + by = c("rna.seq.sample.id" = "GenomeScan_ID") + ) %>% + readr::write_csv( + file.path(results.dir.gender.plot, "plot.data.gsva.csv") + ) + + +for (c.gender in unique(gender.qc.gsva.data$gender)) { + if (is.na(c.gender)) { + next + } + + c.plot.data <- gender.qc.gsva.data %>% + dplyr::filter( + gender == c.gender + ) + + outliers <- boxplot( + formula = gsva.value ~ gsva.group, + data = c.plot.data, + plot = FALSE + )$out + + result.to.annotate <- c.plot.data %>% + dplyr::filter( + gsva.value %in% outliers + ) + + plt <- c.plot.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = gsva.group, + y = gsva.value + ) + ) + + ggplot2::geom_boxplot() + + ggrepel::geom_text_repel( + data = result.to.annotate, + mapping = ggplot2::aes( + label = sample.id + ), + size = 2, + box.padding = unit(0.35, "lines"), + point.padding = unit(0.3, "lines") + ) + + ggprism::theme_prism() + + ggprism::scale_colour_prism() + + ggprism::scale_shape_prism() + + ggplot2::theme( + legend.position = "none" + ) + + ggplot2::labs( + subtitle = paste0("", toupper(c.gender)), + x = "Chromosome", + y = "GSVA Values" + ) + + ggplot2::ggsave( + filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")), + plot = plt, + width = 12.5, + height = 12.5, + unit = "cm" + ) +} diff --git a/rnaseq/step6_overall_QC/03 - Sample Counts.R b/rnaseq/step6_overall_QC/03 - Sample Counts.R new file mode 100644 index 0000000..90e805a --- /dev/null +++ b/rnaseq/step6_overall_QC/03 - Sample Counts.R @@ -0,0 +1,163 @@ +# Total counts per sample +# Normalized with limma::voom + +source("__ - Preloader.R", verbose=T) + + +# The analysis +norm.expr.data <- expression.data %>% + tibble::column_to_rownames("Gene") +norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% + limma::voom() %>% + as.matrix() + +# Total counts per sample +total.count.per.sample <- expression.data %>% + tibble::column_to_rownames("Gene") %>% + colSums() + +data.frame( + sample = names(total.count.per.sample), + counts = as.numeric(total.count.per.sample) + ) %>% + readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv")) + + +norm.data <- norm.expr.data %>% + as.data.frame() %>% + tibble::rownames_to_column( + "Gene" + ) %>% + tidyr::gather( + key = "sample.id", + value = "expr.value", + -Gene + ) %>% + dplyr::left_join( + y = master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + dplyr::mutate( + id = dplyr::case_when( + stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), + TRUE ~ sample.id + ), + gender = dplyr::case_when( + stringr::str_trim(gender) == "" ~ "water", + !is.na(gender) ~ as.character(gender) + ) + ) %>% + dplyr::select( + GenomeScan_ID, + gender, + id + ), + by = c("sample.id" = "GenomeScan_ID") + ) + +norm.plot <- norm.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = id, + y = expr.value, + fill = gender + ) + ) + + ggplot2::geom_boxplot() + + ggplot2::scale_fill_manual( + values = c( + "male" = "blue", + "female" = "red", + "water" = "green" + ) + ) + + ggplot2::labs( + title = "Normalized expression values distribution", + y = "Normalized expression values (limma::voom)", + x = "Sample", + gender = "Gender" + ) + + ggprism::theme_prism() + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 90) + ) + +ggplot2::ggsave( + filename = file.path(results.dir, "counts.per.sample.normalised.png"), + plot = norm.plot, + width = 40, + height = 20, + units = "cm" +) + + + +expr.data <- expression.data %>% + tidyr::gather( + key = "sample.id", + value = "expr.value", + -Gene + ) %>% + dplyr::filter( + expr.value != 0 + ) %>% + dplyr::left_join( + y = master.Table %>% + dplyr::filter( + !is.na(GenomeScan_ID) + ) %>% + dplyr::mutate( + id = dplyr::case_when( + stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()), + TRUE ~ sample.id + ), + gender = dplyr::case_when( + stringr::str_trim(gender) == "" ~ "water", + !is.na(gender) ~ as.character(gender) + ) + ) %>% + dplyr::select( + GenomeScan_ID, + gender, + id + ), + by = c("sample.id" = "GenomeScan_ID") + ) + +expr.plot <- expr.data %>% + ggplot2::ggplot( + mapping = ggplot2::aes( + x = id, + y = expr.value, + fill = gender + ) + ) + + ggplot2::geom_boxplot() + + ggplot2::scale_fill_manual( + values = c( + "male" = "blue", + "female" = "red", + "water" = "green" + ) + ) + + ggplot2::scale_y_continuous(trans='log2') + + ggplot2::labs( + title = "Raw expression values distribution, without zero's", + y = "Expression values", + x = "Sample", + gender = "Gender" + ) + + ggprism::theme_prism() + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 90) + ) + +ggplot2::ggsave( + filename = file.path(results.dir, "counts.per.sample.raw.zeros.removed.png"), + plot = expr.plot, + width = 40, + height = 20, + units = "cm" +) + diff --git a/rnaseq/step6_overall_QC/__ - Preloader.R b/rnaseq/step6_overall_QC/__ - Preloader.R new file mode 100644 index 0000000..ce96a0f --- /dev/null +++ b/rnaseq/step6_overall_QC/__ - Preloader.R @@ -0,0 +1,189 @@ +library(tidyverse) +library(ggfortify) +library(ggprism) +library(limma) +library(biomaRt) +library(PCAtools) +library(gtools) +library(edgeR) +library(ggprism) +library(foreign) + + +# Global variables +results.dir <- file.path("results.nosync", "RNA-Seq QC") +data.dir <- "Data" +patient.dir <- file.path(data.dir, "Patients") +sample.dir <- file.path(data.dir, "Samples") +expression.dir <- file.path(data.dir, "mRNA - RNA-Seq") + + +dir.create(results.dir, recursive = TRUE) + + +#### +# Helper functions +#### +select.columns.in.order <- function(dataframe, columns) { + dataframe[, columns] +} + +select.rows.in.order <- function(dataframe, rows) { + dataframe[rows,] +} + +getGenedataByEnsemblId38 <- function(ensemblIds, file.location) { + file.name <- file.path(file.location, "genes_info_hg38.csv") + if (!file.exists(file.name)) { + if (!("mart" %in% ls())) { + assign("mart", useEnsembl( + biomart = "ENSEMBL_MART_ENSEMBL", + dataset = "hsapiens_gene_ensembl" + )) + } + gene.list <- getBM( + filters = "ensembl_gene_id", + attributes = c( + "hgnc_symbol", + "ensembl_gene_id", + "ensembl_transcript_id", + "chromosome_name", + "start_position", + "end_position", + "strand", + "transcription_start_site", + "transcript_start", + "transcript_end", + "external_gene_name" + ), + values = as.character(ensemblIds), + mart = mart + ) + readr::write_csv(gene.list, path = file.name) + } + return( + readr::read_csv( + file.name, + col_types = readr::cols() + ) + ) +} + +#remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) { +# dataframe %>% +# dplyr::filter( +# rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount +# ) +#} + +limma.voom.convert.column <- function(dataframe, columnname) { + dataframe %>% + tibble::column_to_rownames(columnname) %>% + limma::voom() %>% + as.data.frame() %>% + tibble::rownames_to_column(columnname) +} + +select.columns.in.order <- function(dataframe, columns) { + dataframe[, columns] +} + +drop.columns.if.all.same.value <- function(dataframe) { + for (name in colnames(dataframe)) { + is.all.same <- (dataframe[, name] %>% unique() %>% length()) <= 1 + if (is.all.same) { + dataframe <- dataframe %>% + dplyr::select( + -tidyselect::one_of(name) + ) + } + } + dataframe +} + + + +# Load data +master.Table <- foreign::read.spss( + file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav") + ) %>% + as.data.frame() %>% + dplyr::mutate( + GenomeScan_ID = stringr::str_trim(GenomeScan_ID), + gender = forcats::fct_recode( + Gender, + female = "f", + male = "m", + other = "" + ), + age = as.numeric(Age), + smoking.status = forcats::fct_recode( + Smoking_status, + `Ex-smoker` = "ES ", + `Current smoker` = "CS ", + other = " " + ) + ) + +expression.data <- readr::read_tsv( + file.path(expression.dir, "20200427_103972-001_rawcounts.txt"), + col_types = readr::cols() + ) + +gene.data <- getGenedataByEnsemblId38( + ensemblIds = expression.data$Gene, + file.location = expression.dir + ) %>% + dplyr::group_by(hgnc_symbol) %>% + dplyr::filter( + dplyr::row_number() == 1, + !is.na(hgnc_symbol), + hgnc_symbol != "" + ) %>% + dplyr::ungroup() %>% + dplyr::select( + hgnc_symbol, + ensembl_gene_id, + chromosome_name, + transcript_start, + transcript_end + ) + + +master.Table <- master.Table %>% + dplyr::mutate( + Group_simple2 = stringr::str_trim(Group_simple2), + Group_simple = stringr::str_trim(Group_simple), + T_number = as.character(T_number), + sample.id = stringr::str_trim(PRESTO_ID) + ) %>% + dplyr::filter( + # # According to Niek, I should not include this, for whatever reason + #!(GenomeScan_ID %in% c( + # "T02-01796", + # "T02-03095", + # "T02-10683", + # "T10-18671", + # "T12-12036" + # ) + # ), + # + # # (According to Niek, don't include) Water Controls + #!stringr::str_detect(T_number, pattern="Water"), + # + # # (According to Niek, don't include)never smoker controles + #!(Group_simple2 == "NS_Ctrl"), + # + # # (According to Niek, don't include)ALFA1 patiƫnten + #!(Group_simple == "ALFA"), + # + #stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y", + GenomeScan_ID %in% colnames(expression.data), + !is.na(sample.id), + sample.id != "" + ) + +expression.data <- expression.data %>% + select.columns.in.order( + c("Gene", master.Table$GenomeScan_ID) + ) diff --git a/rnaseq/step6_overall_QC/snippet.R b/rnaseq/step6_overall_QC/snippet.R deleted file mode 100644 index e69de29..0000000