fastqc: simple comment. trimming: reverted. overall QC: simplified the scripts and added instructions.

2021-02-10 13:42:51 +01:00
parent 135f231c86
commit 648cd09a0e
6 changed files with 64 additions and 813 deletions
--- a/rnaseq/step6_overall_QC/03
+++ b/rnaseq/step6_overall_QC/03
@@ -1,17 +1,25 @@
 # Total counts per sample
 # Normalized with limma::voom
+library(limma)
+library(tidyverse)

-source("__ - Preloader.R", verbose=T)
+
+# expression.data: count table, read from CSV. Expected columns:
+# - Gene (gene identifier)
+# - [Sample identifiers] (one column of counts per sample)
+expression.data <- readr::read_csv("mrna_counts_table.csv")
+# master.Table: patient/sample annotation table, read from CSV. Expected columns:
+# - GenomeScan_ID
+# - gender, levels = c("male", "female")
+# - age
+# - smoking.status, levels = c("Ex-smoker", "Current smoker")
+master.Table <- readr::read_csv("patient_table.csv")
+# results.dir: the directory in which to put the resulting tables (and, later, your plots).
+results.dir <- "results"


 # The analysis
-norm.expr.data <- expression.data %>%
-	tibble::column_to_rownames("Gene")
-norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>%
-	limma::voom() %>%
-	as.matrix()
-
-# Total counts per sample
+# We calculate the number of mapped reads per sample.
 total.count.per.sample <- expression.data %>%
 	tibble::column_to_rownames("Gene") %>%
 	colSums()
@@ -23,141 +31,9 @@ data.frame(
 	readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv"))


-norm.data <- norm.expr.data %>%
-  as.data.frame() %>%
-  tibble::rownames_to_column(
-    "Gene"
-  ) %>%
-  tidyr::gather(
-    key = "sample.id",
-    value = "expr.value",
-    -Gene
-  ) %>%
-  dplyr::left_join(
-    y = master.Table %>%
-      dplyr::filter(
-          !is.na(GenomeScan_ID)
-      ) %>%
-      dplyr::mutate(
-          id = dplyr::case_when(
-            stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()),
-            TRUE ~ sample.id
-          ),
-          gender = dplyr::case_when(
-            stringr::str_trim(gender) == "" ~ "water",
-            !is.na(gender) ~ as.character(gender)
-          )
-      ) %>%
-      dplyr::select(
-        GenomeScan_ID,
-        gender,
-        id
-      ),
-    by = c("sample.id" = "GenomeScan_ID")
-  )
-
-norm.plot <- norm.data %>%
-  ggplot2::ggplot(
-    mapping = ggplot2::aes(
-      x = id,
-      y = expr.value,
-      fill = gender
-    )
-  ) +
-  ggplot2::geom_boxplot() +
-  ggplot2::scale_fill_manual(
-    values = c(
-      "male" = "blue",
-      "female" = "red",
-      "water" = "green"
-    )
-  ) +
-  ggplot2::labs(
-    title = "Normalized expression values distribution",
-    y = "Normalized expression values (limma::voom)",
-    x = "Sample",
-    gender = "Gender"
-  ) +
-  ggprism::theme_prism() +
-  ggplot2::theme(
-    axis.text.x = ggplot2::element_text(angle = 90)
-  )
-
-ggplot2::ggsave(
-  filename = file.path(results.dir, "counts.per.sample.normalised.png"),
-  plot = norm.plot,
-  width = 40,
-  height = 20,
-  units = "cm"
-)
-
-
-
-expr.data <- expression.data %>%
-  tidyr::gather(
-    key = "sample.id",
-    value = "expr.value",
-    -Gene
-  ) %>%
-  dplyr::filter(
-    expr.value != 0
-  ) %>%
-  dplyr::left_join(
-    y = master.Table %>%
-      dplyr::filter(
-          !is.na(GenomeScan_ID)
-      ) %>%
-      dplyr::mutate(
-          id = dplyr::case_when(
-            stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()),
-            TRUE ~ sample.id
-          ),
-          gender = dplyr::case_when(
-            stringr::str_trim(gender) == "" ~ "water",
-            !is.na(gender) ~ as.character(gender)
-          )
-      ) %>%
-      dplyr::select(
-        GenomeScan_ID,
-        gender,
-        id
-      ),
-    by = c("sample.id" = "GenomeScan_ID")
-  )
-
-expr.plot <- expr.data %>%
-  ggplot2::ggplot(
-    mapping = ggplot2::aes(
-      x = id,
-      y = expr.value,
-      fill = gender
-    )
-  ) +
-  ggplot2::geom_boxplot() +
-  ggplot2::scale_fill_manual(
-    values = c(
-      "male" = "blue",
-      "female" = "red",
-      "water" = "green"
-    )
-  ) +
-  ggplot2::scale_y_continuous(trans='log2')  +
-  ggplot2::labs(
-    title = "Raw expression values distribution, without zero's",
-    y = "Expression values",
-    x = "Sample",
-    gender = "Gender"
-  ) +
-  ggprism::theme_prism() +
-  ggplot2::theme(
-    axis.text.x = ggplot2::element_text(angle = 90)
-  )
-
-ggplot2::ggsave(
-  filename = file.path(results.dir, "counts.per.sample.raw.zeros.removed.png"),
-  plot = expr.plot,
-  width = 40,
-  height = 20,
-  units = "cm"
-)
+# Next steps:
+# - Check the total number of reads per sample in total.counts.per.sample.csv.
+# - Plot the distribution of reads (all reads) per sample in a boxplot.
+# - (Optional) Calculate the number of unmapped, multimapped, uniquely mapped to a
+#   feature, and uniquely mapped to no feature reads, and plot these in a stacked bar graph.