fastqc: simple comment. trimming: reverted back. overall qc: simplified the scripts and made sure to add instructions.

This commit is contained in:
Jos van Nijnatten
2021-02-10 13:42:51 +01:00
parent 135f231c86
commit 648cd09a0e
6 changed files with 64 additions and 813 deletions

View File

@@ -1,17 +1,25 @@
# Total counts per sample
# Normalized with limma::voom
library(limma)
library(tidyverse)
source("__ - Preloader.R", verbose = TRUE)

# Input and output locations ----
# Edit these paths so they point at your own files.

# expression.data.file. Path of the CSV file holding the count table.
expression.data.file <- "mrna_counts_table.csv"
# master.Table.file. Path of the CSV file holding the patient/sample table.
master.Table.file <- "patient_table.csv"
# results.dir. The directory in which the resulting tables (and later your
# plots) are written.
results.dir <- "results"

# Load the inputs ----
# The analysis below treats expression.data and master.Table as tables
# (column_to_rownames, gather, filter, ...), so the files are read here
# instead of leaving the variables as bare file names.

# expression.data. Has columns:
# - Gene (gene identifier)
# - [Sample identifiers]
# check.names = FALSE keeps the sample identifiers in the header exactly as
# written, so they can still be matched against master.Table$GenomeScan_ID.
expression.data <- read.csv(
  expression.data.file,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# master.Table. Has columns:
# - GenomeScan_ID
# - gender, levels = c("male", "female")
# - age
# - factor(smoking.status, levels = c("Ex-smoker", "Current smoker"))
# NOTE(review): the columns are read as plain character/numeric vectors;
# convert them to factors here if a later step needs the levels listed above.
master.Table <- read.csv(
  master.Table.file,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Make sure the output directory exists before anything is written to it.
dir.create(results.dir, showWarnings = FALSE, recursive = TRUE)
# The analysis

# Normalized expression values ----
# Move the gene identifiers into the row names so that only the per-sample
# count columns remain.
norm.expr.data <- tibble::column_to_rownames(expression.data, "Gene")
# Keep the genes whose counts sum to at least 10 over all samples, pass them
# through limma::voom and convert the result to a matrix.
norm.expr.data <- as.matrix(
  limma::voom(norm.expr.data[rowSums(norm.expr.data) >= 10, ])
)

# Total counts per sample ----
# The number of mapped reads per sample: the column sums of the count table.
total.count.per.sample <- colSums(
  tibble::column_to_rownames(expression.data, "Gene")
)
@@ -23,141 +31,9 @@ data.frame(
readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv"))
# Normalized expression values in long format, annotated per sample ----

# One row per gene/sample combination: Gene, sample.id, expr.value.
norm.long <- tidyr::gather(
  tibble::rownames_to_column(as.data.frame(norm.expr.data), "Gene"),
  key = "sample.id",
  value = "expr.value",
  -Gene
)

# Sample annotation: rows with a GenomeScan_ID, reduced to the plotting
# label (id) and the gender group.
# NOTE(review): `sample.id` is referenced inside mutate() on master.Table,
# but it is not among the master.Table columns documented at the top of this
# script - confirm where it comes from.
norm.sample.info <- master.Table %>%
  dplyr::filter(!is.na(GenomeScan_ID)) %>%
  dplyr::mutate(
    # id is derived first, so it still sees the unmodified gender column:
    # rows with a blank gender are labelled "Water <row number>".
    id = dplyr::case_when(
      stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()),
      TRUE ~ sample.id
    ),
    # Blank gender becomes the group "water"; a missing gender stays NA.
    gender = dplyr::case_when(
      stringr::str_trim(gender) == "" ~ "water",
      !is.na(gender) ~ as.character(gender)
    )
  ) %>%
  dplyr::select(GenomeScan_ID, gender, id)

# Attach the annotation to every expression value of the matching sample.
norm.data <- dplyr::left_join(
  x = norm.long,
  y = norm.sample.info,
  by = c("sample.id" = "GenomeScan_ID")
)
# Boxplot of the normalized expression values, one box per sample ----
# Boxes are filled by gender group (male / female / water).
norm.plot <- norm.data %>%
  ggplot2::ggplot(
    mapping = ggplot2::aes(
      x = id,
      y = expr.value,
      fill = gender
    )
  ) +
  ggplot2::geom_boxplot() +
  ggplot2::scale_fill_manual(
    values = c(
      "male" = "blue",
      "female" = "red",
      "water" = "green"
    )
  ) +
  ggplot2::labs(
    title = "Normalized expression values distribution",
    y = "Normalized expression values (limma::voom)",
    x = "Sample",
    # labs() is keyed by aesthetic, not by data column: the legend belongs
    # to the fill aesthetic, so its title has to be set through `fill`.
    fill = "Gender"
  ) +
  ggprism::theme_prism() +
  ggplot2::theme(
    # Rotate the sample labels so they do not overlap.
    axis.text.x = ggplot2::element_text(angle = 90)
  )

# Write the plot to the results directory as a 40 x 20 cm PNG.
ggplot2::ggsave(
  filename = file.path(results.dir, "counts.per.sample.normalised.png"),
  plot = norm.plot,
  width = 40,
  height = 20,
  units = "cm"
)
# Raw expression values in long format, zeros removed, annotated per sample ----

# One row per gene/sample combination (Gene, sample.id, expr.value); rows
# with an expression value of zero are dropped.
raw.long <- dplyr::filter(
  tidyr::gather(
    expression.data,
    key = "sample.id",
    value = "expr.value",
    -Gene
  ),
  expr.value != 0
)

# Sample annotation: rows with a GenomeScan_ID, reduced to the plotting
# label (id) and the gender group.
# NOTE(review): `sample.id` is referenced inside mutate() on master.Table,
# but it is not among the master.Table columns documented at the top of this
# script - confirm where it comes from.
raw.sample.info <- master.Table %>%
  dplyr::filter(!is.na(GenomeScan_ID)) %>%
  dplyr::mutate(
    # id is derived first, so it still sees the unmodified gender column:
    # rows with a blank gender are labelled "Water <row number>".
    id = dplyr::case_when(
      stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()),
      TRUE ~ sample.id
    ),
    # Blank gender becomes the group "water"; a missing gender stays NA.
    gender = dplyr::case_when(
      stringr::str_trim(gender) == "" ~ "water",
      !is.na(gender) ~ as.character(gender)
    )
  ) %>%
  dplyr::select(GenomeScan_ID, gender, id)

# Attach the annotation to every expression value of the matching sample.
expr.data <- dplyr::left_join(
  x = raw.long,
  y = raw.sample.info,
  by = c("sample.id" = "GenomeScan_ID")
)
# Boxplot of the raw (non-zero) expression values, one box per sample ----
# Boxes are filled by gender group (male / female / water); the y axis is
# drawn on a log2 scale.
expr.plot <- expr.data %>%
  ggplot2::ggplot(
    mapping = ggplot2::aes(
      x = id,
      y = expr.value,
      fill = gender
    )
  ) +
  ggplot2::geom_boxplot() +
  ggplot2::scale_fill_manual(
    values = c(
      "male" = "blue",
      "female" = "red",
      "water" = "green"
    )
  ) +
  ggplot2::scale_y_continuous(trans = "log2") +
  ggplot2::labs(
    title = "Raw expression values distribution, without zero's",
    y = "Expression values",
    x = "Sample",
    # labs() is keyed by aesthetic, not by data column: the legend belongs
    # to the fill aesthetic, so its title has to be set through `fill`.
    fill = "Gender"
  ) +
  ggprism::theme_prism() +
  ggplot2::theme(
    # Rotate the sample labels so they do not overlap.
    axis.text.x = ggplot2::element_text(angle = 90)
  )

# Write the plot to the results directory as a 40 x 20 cm PNG.
ggplot2::ggsave(
  filename = file.path(results.dir, "counts.per.sample.raw.zeros.removed.png"),
  plot = expr.plot,
  width = 40,
  height = 20,
  units = "cm"
)
# Next things to do:
# - Check the number of reads per sample in total.counts.per.sample.csv
# - Plot the reads distribution (all reads) per sample in a boxplot.
# - (Optional) Calculate the number of unmapped, multimapped, unique mapped to
# feature and unique mapped to no feature and plot these in a stacked bar graph.