fastqc: simple comment. trimming: reverted back. overall qc: simplified the scripts and made sure to add instructions.

2021-02-10 13:42:51 +01:00
parent 135f231c86
commit 648cd09a0e
6 changed files with 64 additions and 813 deletions
--- a/rnaseq/step6_overall_QC/02
+++ b/rnaseq/step6_overall_QC/02
@@ -1,16 +1,28 @@
 # Gender QC
 # Normalized with limma::voom
+library(GSVA)
+library(limma)
+library(edgeR)
+library(tidyverse)

-source("__ - Preloader.R", verbose=T)
+
+# expression.data. Has columns:
+# - Gene (gene identifier)
+# - [Sample identifiers]
+expression.data <- "mrna_counts_table.csv"
+# master.Table. Has columns:
+# - GenomeScan_ID
+# - gender, levels = c("male", "female")
+# - age
+# - factor(smoking.status, levels = c("Ex-smoker", "Current smoker"))
+master.Table <- "patient_table.csv"
+# results.dir. The directory in which to write the resulting tables (and, later, your plots).
+results.dir <- "results"


 # The analysis
-norm.expr.data <- expression.data %>%
-    tibble::column_to_rownames("Gene")
-norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>%
-    limma::voom() %>%
-    as.matrix()
-
+# We first do a differential expression analysis on gender using edgeR.
+# Afterwards you should plot these results.
 x.genes <- gene.data %>%
    dplyr::filter(chromosome_name == "X") %>%
    dplyr::pull(ensembl_gene_id)
@@ -87,248 +99,12 @@ gender.qc.genes.to.plot <- gender.qc.results %>%
            chromosome_name %in% c("X", "Y") &
            FDR < 0.05 &
            dplyr::row_number() <= 5
-        ) |
-        hgnc_symbol %in% c(
-            "XIST",
-            "TSIX",
-            "KDM6A",
-            "ZFX",
-            "KDM5C",
-            "ZFY-AS1",
-            "ARSDP1",
-            "GYG2P1",
-            "RBMY2JP",
-            "ARSLP1"
        )
    )

-gender.qc.data <- as.data.frame(norm.expr.data) %>%
-    rownames_to_column("ensembl.id") %>%
-    tidyr::gather(
-        key = "rna.seq.sample.id",
-        value = "expr.value",
-        -ensembl.id
-    ) %>%
-    dplyr::filter(
-        ensembl.id %in% gender.qc.genes.to.plot$ensembl.id
-    ) %>%
-    dplyr::left_join(
-        y = gender.qc.patients,
-        by = c("rna.seq.sample.id" = "GenomeScan_ID")
-    ) %>%
-    #dplyr::filter(
-    #    !is.na(gender)
-    #) %>%
-    readr::write_csv(
-        file.path(results.dir.gender.plot, "plot.data.voom.csv")
-    )

-for (chr in gender.qc.genes.to.plot$chromosome_name) {
-    current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>%
-        dplyr::filter(chromosome_name == chr)
-    chromosome_name <- chr
-    i <- 0
-    for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) {
-        i <- i + 1
-
-        hgnc_symbol <- gene.data %>%
-            dplyr::filter(
-                ensembl_gene_id == current.ensembl.id
-            ) %>%
-            dplyr::pull(hgnc_symbol)
-
-        # calculate outliers, kinda
-        plot.data <- gender.qc.data %>%
-            dplyr::filter(
-                ensembl.id == current.ensembl.id
-            ) %>%
-            dplyr::mutate(
-                gender = dplyr::case_when(
-                    is.na(gender) | (stringr::str_trim(gender) == "") ~ "other",
-                    TRUE ~ gender
-                )
-            )
-
-        if (nrow(plot.data) <= 0) {
-            next
-        }
-
-        outliers <- boxplot(
-            formula = expr.value ~ gender,
-            data = plot.data,
-            plot = FALSE
-        )$out
-
-        result.to.annotate <- plot.data %>%
-            dplyr::filter(
-                expr.value %in% outliers
-            )
-
-        # Visual: plot range (for t-test p-value)
-        plot.y.range <- c(
-            "min" = as.integer(min(plot.data$expr.value) - 1) ,
-            "max" = as.integer(max(plot.data$expr.value) + 1)
-        )
-        plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05)
-        plot.y.range["min"] <- plot.y.range["min"] - plot.margin
-        plot.y.range["max"] <- plot.y.range["max"] + plot.margin
-
-        # Plot the damn thing as if it is Graphpad Prism
-        stat.table <- rstatix::t_test(plot.data, expr.value ~ gender)
-        plt <- plot.data %>%
-            ggplot2::ggplot(
-                mapping = ggplot2::aes(
-                    x = gender,
-                    y = expr.value
-                ) 
-            ) +
-            ggplot2::geom_jitter(
-                mapping = ggplot2::aes(
-                    colour = gender,
-                    shape = gender
-                ),
-                width = 0.1
-            ) +
-            ggrepel::geom_text_repel(
-                data = result.to.annotate,
-                mapping = ggplot2::aes(
-                    label = sample.id
-                ),
-                size = 2,
-                box.padding = unit(0.35, "lines"),
-                point.padding = unit(0.3, "lines")
-            ) +
-              ggplot2::stat_summary(
-                  fun = "mean",
-                  geom = "crossbar",
-                  width = 0.3,
-                  size = 0.3
-              ) + 
-            ggplot2::scale_y_continuous(
-                limits = plot.y.range,
-                guide = "prism_offset"
-            ) +
-            #ggprism::add_pvalue(
-            #    stat.table,
-            #    y.position = plot.y.range["max"]
-            #) +
-            ggprism::theme_prism() + 
-            ggprism::scale_colour_prism() + 
-            ggprism::scale_shape_prism() +
-            ggplot2::theme(
-                legend.position = "none"
-            ) + 
-            ggplot2::labs(
-                subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"),
-                x = "Gender",
-                y = "Normalised Expression Values"
-            )
-
-        ggplot2::ggsave(
-            filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")),
-            plot = plt,
-            width = 12.5,
-            height = 12.5,
-            unit = "cm"
-        )
-    }
-}
-
-# Let's try a GSVA
-gsva.groups <- list(
-    X = gender.qc.genes.to.plot %>%
-        dplyr::filter(chromosome_name == "X") %>%
-        dplyr::pull(ensembl.id),
-    Y = gender.qc.genes.to.plot %>%
-        dplyr::filter(chromosome_name == "Y") %>%
-        dplyr::pull(ensembl.id)
-)
-
-gsva_res = GSVA::gsva(
-  norm.expr.data,
-  gsva.groups,
-  mx.diff = TRUE,
-  verbose = FALSE,
-  parallel.sz = 1
-)
-
-
-gender.qc.gsva.data <- as.data.frame(gsva_res) %>%
-    rownames_to_column("gsva.group") %>%
-    tidyr::gather(
-        key = "rna.seq.sample.id",
-        value = "gsva.value",
-        -gsva.group
-    ) %>%
-    dplyr::left_join(
-        y = gender.qc.patients %>%
-          dplyr::select(
-              GenomeScan_ID,
-              sample.id,
-              gender
-          ),
-        by = c("rna.seq.sample.id" = "GenomeScan_ID")
-    ) %>%
-    readr::write_csv(
-        file.path(results.dir.gender.plot, "plot.data.gsva.csv")
-    )
-
-
-for (c.gender in unique(gender.qc.gsva.data$gender)) {
-    if (is.na(c.gender)) {
-        next
-    }
-
-    c.plot.data <- gender.qc.gsva.data %>%
-        dplyr::filter(
-            gender == c.gender
-        )
-
-    outliers <- boxplot(
-        formula = gsva.value ~ gsva.group,
-        data = c.plot.data,
-        plot = FALSE
-    )$out
-
-    result.to.annotate <- c.plot.data %>%
-        dplyr::filter(
-            gsva.value %in% outliers
-        )
-
-    plt <- c.plot.data %>%
-        ggplot2::ggplot(
-            mapping = ggplot2::aes(
-                x = gsva.group,
-                y = gsva.value
-            ) 
-        ) +
-        ggplot2::geom_boxplot() +
-        ggrepel::geom_text_repel(
-            data = result.to.annotate,
-            mapping = ggplot2::aes(
-                label = sample.id
-            ),
-            size = 2,
-            box.padding = unit(0.35, "lines"),
-            point.padding = unit(0.3, "lines")
-        ) +
-        ggprism::theme_prism() + 
-        ggprism::scale_colour_prism() + 
-        ggprism::scale_shape_prism() +
-        ggplot2::theme(
-            legend.position = "none"
-        ) + 
-        ggplot2::labs(
-            subtitle = paste0("", toupper(c.gender)),
-            x = "Chromosome",
-            y = "GSVA Values"
-        )
-
-    ggplot2::ggsave(
-        filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")),
-        plot = plt,
-        width = 12.5,
-        height = 12.5,
-        unit = "cm"
-    )
-}
+# Next thing to do:
+# - Plot the normalized expression values for the genes in gender.qc.genes.to.plot in a boxplot, split and colored by gender.
+# - (Optional) Do a GSVA using the genes found in gender.qc.genes.to.plot as gene sets. Plot the boxplots as per the previous point.
+# - (Optional) Plot the number of Y-chromosome reads divided by the number of X-chromosome reads in a boxplot as per the first point.
+# - (Optional) Plot the number of Y-chromosome SNPs divided by the number of X-chromosome SNPs in a boxplot as per the first point.