FastQC, Trimming, Overall QC (initial)

This commit is contained in:
Jos van Nijnatten 2021-02-09 12:52:44 +01:00
parent 7ecbc2aee9
commit 3d7c9e3b1c
9 changed files with 988 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
.DS_Store
*.nosync
*.RData

BIN
rnaseq/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,34 @@
#!/bin/bash
#SBATCH --job-name=fastqc
#SBATCH --time=0-12:00:00
#SBATCH --ntasks=1
#SBATCH --mem=15G
#SBATCH --qos=regular
set -e
set -u
set -x
set -o pipefail
module purge
module load Java
module load FastQC
dir_raw_fastq="$(pwd)/fastq/raw"
dir_fastqc="$(pwd)/fastqc"
[ -d "${dir_fastqc}" ] || mkdir -p "${dir_fastqc}"
# Run FastQC
files=$(find -L "$dir_raw_fastq" -type f -iname "*.fastq.gz")
for file in $files; do
filename=$(basename "$file")
fastqc -o "$dir_fastqc" "$file" &
done
wait

View File

@ -0,0 +1,64 @@
#!/bin/bash
#SBATCH --job-name=trimming
#SBATCH --time=2-00:00:00
#SBATCH --ntasks=1
#SBATCH --mem=15G
#SBATCH --qos=regular
set -e
set -u
set -x
set -o pipefail
module purge
module load TrimGalore
dir_raw_fastq="$(pwd)/fastq/raw"
dir_trimmed_fastq="$(pwd)/fastq/trimmed"
dir_trimmed_fastq_reports="$(pwd)/fastq/trimmed/reports"
adapter_3p="TGGAATTCTCGG" # is _R1
adapter_5p="GATCGTCGGACT" # is _R2
[ -d "${dir_trimmed_fastq_reports}" ] || mkdir -p "${dir_trimmed_fastq_reports}"
# Trim all adapters from the sequences
while IFS=, read -r GSID Sample
do
echo ">> Executing trimming of $GSID (${#GSID})"
if [ "${#GSID}" == "18" ]; then # check length - don't include sample pools
for fqFile in $(find -L "$dir_raw_fastq" -maxdepth 1 -type f -iname "*${GSID}*.fastq.gz"); do
newFilename=${fqFile/.fastq.gz/_trimmed.fq.gz}
if [ -f "${newFilename}" ]; then
echo ">>File exists ${newFilename}. Skipping"
else
echo ">>File: ${fqFile}"
if grep -q "_R1" <<< $(basename "$fqFile"); then
adapter_seq=$adapter_3p
elif grep -q "_R2" <<< $(basename "$fqFile"); then
adapter_seq=$adapter_5p
fi
echo ">>Trimming with sequence ${adapter_seq}"
trim_galore \
--adapter "$adapter_seq" \
--length $(printf "$adapter_seq" | wc -m) \
--output_dir "${dir_trimmed_fastq}" \
--fastqc_args "--noextract" \
"${fqFile}" &
fi
done
else
echo "Skipped."
fi
done < samples.csv
wait
mv ${dir_trimmed_fastq}/*.zip "${dir_trimmed_fastq_reports}"
mv ${dir_trimmed_fastq}/*.txt "${dir_trimmed_fastq_reports}"
mv ${dir_trimmed_fastq}/*.html "${dir_trimmed_fastq_reports}"

View File

@ -0,0 +1,200 @@
# Principle Component Analysis
# Normalized with limma::voom
source("__ - Preloader.R", verbose=T)
# PCA variables
do.center = TRUE
do.scale = FALSE
# The analysis
master.Table %>% readr::write_csv(
file.path(results.dir, "patient.table.csv")
)
norm.expr.data <- expression.data %>%
tibble::column_to_rownames("Gene")
norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>%
limma::voom() %>%
as.matrix()
# Principle Component analysis
results.dir.pca <- file.path(results.dir, "principle.components")
dir.create(results.dir.pca, recursive=TRUE)
norm.expr.data.pcs <- norm.expr.data %>%
t() %>%
stats::prcomp(
center = do.center,
scale. = do.scale
)
# Write summary of PCAs to files
pcs.summery <- summary(norm.expr.data.pcs)
pcs.summery$importance %>%
t() %>%
as.data.frame() %>%
tibble::rownames_to_column("PC.name") %>%
readr::write_csv(
file.path(results.dir.pca, "importance.csv")
)
pcs.summery$x %>%
t() %>%
as.data.frame() %>%
tibble::rownames_to_column("ensembl.id") %>%
readr::write_csv(
file.path(results.dir.pca, "values.csv")
)
pcs.summery$rotation %>%
t() %>%
as.data.frame() %>%
tibble::rownames_to_column("sample.id") %>%
readr::write_csv(
file.path(results.dir.pca, "rotation.csv")
)
data.frame(
rownames = names(pcs.summery$center),
center = pcs.summery$center,
scale = pcs.summery$scale
) %>%
readr::write_csv(
file.path(results.dir.pca, "rest.csv")
)
# Not saved: pcs.summery$sdev,
# Plot PCAs
# https://github.com/kevinblighe/PCAtools
results.dir.pca.plot <- file.path(results.dir.pca, "img")
dir.create(results.dir.pca.plot)
metadata <- master.Table %>%
dplyr::filter(
!is.na(GenomeScan_ID)
) %>%
tibble::column_to_rownames("GenomeScan_ID") %>%
select.rows.in.order(
colnames(norm.expr.data)
)
p <- pca(norm.expr.data,
metadata = metadata,
center = do.center,
scale = do.scale
)
elbow <- findElbowPoint(p$variance)
metavars <- c('Age','Gender','Smoking_status','COPD_Y_or_N','SEO_COPD_Y_or_N','GOLD_stage')
png(filename = file.path(results.dir.pca.plot,"scree_plot.png"),
width = 800, height = 800,
units = "px", pointsize = 12,
type = "Xlib")
print(screeplot(p,
axisLabSize = 12,
titleLabSize = 12,
components = getComponents(p, 1:(elbow+5)),
vline = c(elbow)
) +
geom_label(
aes(
x = elbow + 1,
y = 50,
label = 'Elbow method',
vjust = -1,
size = 8
)
))
dev.off()
png(filename = file.path(results.dir.pca.plot, "eigen_corr_plot.png"),
width = 1200, height = 1200,
units = "px", pointsize = 12,
type = "Xlib")
print(eigencorplot(p,
metavars = metavars
))
dev.off()
dir.create(file.path(results.dir.pca.plot, "pairsplot"))
for (var in metavars) {
png(
filename = file.path(results.dir.pca.plot, "pairsplot", paste0(var,".png")),
width = 1200, height = 1200,
units = "px", pointsize = 12,
type = "Xlib"
)
print(pairsplot(p,
components = getComponents(p, c(1:(elbow+1))),
triangle = TRUE,
trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE,
gridlines.minor = FALSE,
colby = var,
title = paste0('Pairs plot: ',var),
plotaxes = TRUE
))
dev.off()
}
# Plot PCAs - old failure
pca.combinations <- combinations(
n = (elbow+1),
r = 2,
v = 1:(elbow+1),
repeats.allowed = FALSE
)
dir.create(file.path(results.dir.pca.plot, "biplots"))
for (var in metavars) {
for (i in 1:nrow(pca.combinations)) {
pca.combi <- pca.combinations[i,]
pca.title <- paste(paste0("PC", pca.combi), collapse="_")
png(
filename = file.path(results.dir.pca.plot, "biplots", paste0(var, "-", pca.title, ".png")),
width = 800, height = 800,
units = "px", pointsize = 12,
type = "Xlib"
)
print(
autoplot(
norm.expr.data.pcs,
data = master.Table %>%
dplyr::filter(
!is.na(GenomeScan_ID)
) %>%
tibble::column_to_rownames("GenomeScan_ID") %>%
select.rows.in.order(
rownames(norm.expr.data.pcs$x)
),
x = pca.combi[1],
y = pca.combi[2],
colour = var,
loadings = FALSE,
loadings.label = FALSE,
#label = FALSE,
label.size = 3
) +
ggprism::theme_prism() +
#ggprism::scale_colour_prism() +
ggprism::scale_shape_prism() +
ggplot2::labs(subtitle = paste0(str_to_title(var), " (", pca.title,")"))
)
dev.off()
}
}

View File

@ -0,0 +1,334 @@
# Gender QC
# Normalized with limma::voom
source("__ - Preloader.R", verbose=T)
# The analysis
norm.expr.data <- expression.data %>%
tibble::column_to_rownames("Gene")
norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>%
limma::voom() %>%
as.matrix()
x.genes <- gene.data %>%
dplyr::filter(chromosome_name == "X") %>%
dplyr::pull(ensembl_gene_id)
y.genes <- gene.data %>%
dplyr::filter(chromosome_name == "Y") %>%
dplyr::pull(ensembl_gene_id)
# Gender QC
results.dir.gender <- file.path(results.dir, "gender.check")
dir.create(results.dir.gender, recursive=TRUE)
# Differential Expression on Gender
gender.qc.patients <- master.Table %>%
dplyr::filter(
!is.na(GenomeScan_ID)
) %>%
dplyr::mutate(
gender = factor(gender, levels = c("male", "female")),
age = as.numeric(age),
smoking.status = factor(smoking.status, levels = c("Ex-smoker", "Current smoker"))
) %>%
dplyr::filter(
!is.na(gender)
)
gender.qc.sample.order <- gender.qc.patients %>%
dplyr::pull(GenomeScan_ID)
gender.qc.expression.data <- expression.data %>%
tibble::column_to_rownames("Gene") %>%
select.columns.in.order(gender.qc.sample.order) %>%
as.matrix()
design <- model.matrix( ~0 + gender, data = gender.qc.patients)
DGEL <- edgeR::DGEList(gender.qc.expression.data)
keep <- edgeR::filterByExpr(DGEL)
keep[names(keep) %in% x.genes] <- TRUE
keep[names(keep) %in% y.genes] <- TRUE
DGEL <- DGEL[keep, , keep.lib.sizes=FALSE]
DGEL <- edgeR::calcNormFactors(DGEL, method = "TMM")
DGEL <- edgeR::estimateDisp(DGEL, design)
fit <- edgeR::glmQLFit(DGEL,design)
contrasts <- limma::makeContrasts(
gender = gendermale - genderfemale,
levels = design
)
qlf <- edgeR::glmQLFTest(fit, contrast = contrasts[,"gender"])
gender.qc.results <- edgeR::topTags(
qlf,
n=nrow(DGEL)
)$table %>%
tibble::rownames_to_column("ensembl.id") %>%
dplyr::left_join(
y = gene.data,
by = c("ensembl.id" = "ensembl_gene_id")
) %>%
readr::write_csv(
file.path(results.dir.gender, "differential.expression.on.gender.csv")
)
# Plotting of gender expression
results.dir.gender.plot <- file.path(results.dir.gender, "img")
dir.create(results.dir.gender.plot, recursive=TRUE)
gender.qc.genes.to.plot <- gender.qc.results %>%
dplyr::arrange(PValue) %>%
dplyr::group_by(chromosome_name) %>%
dplyr::filter(
(
chromosome_name %in% c("X", "Y") &
FDR < 0.05 &
dplyr::row_number() <= 5
) |
hgnc_symbol %in% c(
"XIST",
"TSIX",
"KDM6A",
"ZFX",
"KDM5C",
"ZFY-AS1",
"ARSDP1",
"GYG2P1",
"RBMY2JP",
"ARSLP1"
)
)
gender.qc.data <- as.data.frame(norm.expr.data) %>%
rownames_to_column("ensembl.id") %>%
tidyr::gather(
key = "rna.seq.sample.id",
value = "expr.value",
-ensembl.id
) %>%
dplyr::filter(
ensembl.id %in% gender.qc.genes.to.plot$ensembl.id
) %>%
dplyr::left_join(
y = gender.qc.patients,
by = c("rna.seq.sample.id" = "GenomeScan_ID")
) %>%
#dplyr::filter(
# !is.na(gender)
#) %>%
readr::write_csv(
file.path(results.dir.gender.plot, "plot.data.voom.csv")
)
for (chr in gender.qc.genes.to.plot$chromosome_name) {
current.gender.qc.genes.to.plot <- gender.qc.genes.to.plot %>%
dplyr::filter(chromosome_name == chr)
chromosome_name <- chr
i <- 0
for (current.ensembl.id in current.gender.qc.genes.to.plot$ensembl.id) {
i <- i + 1
hgnc_symbol <- gene.data %>%
dplyr::filter(
ensembl_gene_id == current.ensembl.id
) %>%
dplyr::pull(hgnc_symbol)
# calculate outliers, kinda
plot.data <- gender.qc.data %>%
dplyr::filter(
ensembl.id == current.ensembl.id
) %>%
dplyr::mutate(
gender = dplyr::case_when(
is.na(gender) | (stringr::str_trim(gender) == "") ~ "other",
TRUE ~ gender
)
)
if (nrow(plot.data) <= 0) {
next
}
outliers <- boxplot(
formula = expr.value ~ gender,
data = plot.data,
plot = FALSE
)$out
result.to.annotate <- plot.data %>%
dplyr::filter(
expr.value %in% outliers
)
# Visual: plot range (for t-test p-value)
plot.y.range <- c(
"min" = as.integer(min(plot.data$expr.value) - 1) ,
"max" = as.integer(max(plot.data$expr.value) + 1)
)
plot.margin <- ((plot.y.range["max"] + (plot.y.range["min"] * -1)) * 0.05)
plot.y.range["min"] <- plot.y.range["min"] - plot.margin
plot.y.range["max"] <- plot.y.range["max"] + plot.margin
# Plot the damn thing as if it is Graphpad Prism
stat.table <- rstatix::t_test(plot.data, expr.value ~ gender)
plt <- plot.data %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
x = gender,
y = expr.value
)
) +
ggplot2::geom_jitter(
mapping = ggplot2::aes(
colour = gender,
shape = gender
),
width = 0.1
) +
ggrepel::geom_text_repel(
data = result.to.annotate,
mapping = ggplot2::aes(
label = sample.id
),
size = 2,
box.padding = unit(0.35, "lines"),
point.padding = unit(0.3, "lines")
) +
ggplot2::stat_summary(
fun = "mean",
geom = "crossbar",
width = 0.3,
size = 0.3
) +
ggplot2::scale_y_continuous(
limits = plot.y.range,
guide = "prism_offset"
) +
#ggprism::add_pvalue(
# stat.table,
# y.position = plot.y.range["max"]
#) +
ggprism::theme_prism() +
ggprism::scale_colour_prism() +
ggprism::scale_shape_prism() +
ggplot2::theme(
legend.position = "none"
) +
ggplot2::labs(
subtitle = paste0("Gender Check: ", hgnc_symbol, " (chr. ", chromosome_name, ")"),
x = "Gender",
y = "Normalised Expression Values"
)
ggplot2::ggsave(
filename = file.path(results.dir.gender.plot, paste0(chromosome_name, ".", i, ".", hgnc_symbol, ".png")),
plot = plt,
width = 12.5,
height = 12.5,
unit = "cm"
)
}
}
# Let's try a GSVA
gsva.groups <- list(
X = gender.qc.genes.to.plot %>%
dplyr::filter(chromosome_name == "X") %>%
dplyr::pull(ensembl.id),
Y = gender.qc.genes.to.plot %>%
dplyr::filter(chromosome_name == "Y") %>%
dplyr::pull(ensembl.id)
)
gsva_res = GSVA::gsva(
norm.expr.data,
gsva.groups,
mx.diff = TRUE,
verbose = FALSE,
parallel.sz = 1
)
gender.qc.gsva.data <- as.data.frame(gsva_res) %>%
rownames_to_column("gsva.group") %>%
tidyr::gather(
key = "rna.seq.sample.id",
value = "gsva.value",
-gsva.group
) %>%
dplyr::left_join(
y = gender.qc.patients %>%
dplyr::select(
GenomeScan_ID,
sample.id,
gender
),
by = c("rna.seq.sample.id" = "GenomeScan_ID")
) %>%
readr::write_csv(
file.path(results.dir.gender.plot, "plot.data.gsva.csv")
)
for (c.gender in unique(gender.qc.gsva.data$gender)) {
if (is.na(c.gender)) {
next
}
c.plot.data <- gender.qc.gsva.data %>%
dplyr::filter(
gender == c.gender
)
outliers <- boxplot(
formula = gsva.value ~ gsva.group,
data = c.plot.data,
plot = FALSE
)$out
result.to.annotate <- c.plot.data %>%
dplyr::filter(
gsva.value %in% outliers
)
plt <- c.plot.data %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
x = gsva.group,
y = gsva.value
)
) +
ggplot2::geom_boxplot() +
ggrepel::geom_text_repel(
data = result.to.annotate,
mapping = ggplot2::aes(
label = sample.id
),
size = 2,
box.padding = unit(0.35, "lines"),
point.padding = unit(0.3, "lines")
) +
ggprism::theme_prism() +
ggprism::scale_colour_prism() +
ggprism::scale_shape_prism() +
ggplot2::theme(
legend.position = "none"
) +
ggplot2::labs(
subtitle = paste0("", toupper(c.gender)),
x = "Chromosome",
y = "GSVA Values"
)
ggplot2::ggsave(
filename = file.path(results.dir.gender.plot, paste0("gsva.", c.gender, ".png")),
plot = plt,
width = 12.5,
height = 12.5,
unit = "cm"
)
}

View File

@ -0,0 +1,163 @@
# Total counts per sample
# Normalized with limma::voom
source("__ - Preloader.R", verbose=T)
# The analysis
norm.expr.data <- expression.data %>%
tibble::column_to_rownames("Gene")
norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>%
limma::voom() %>%
as.matrix()
# Total counts per sample
total.count.per.sample <- expression.data %>%
tibble::column_to_rownames("Gene") %>%
colSums()
data.frame(
sample = names(total.count.per.sample),
counts = as.numeric(total.count.per.sample)
) %>%
readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv"))
norm.data <- norm.expr.data %>%
as.data.frame() %>%
tibble::rownames_to_column(
"Gene"
) %>%
tidyr::gather(
key = "sample.id",
value = "expr.value",
-Gene
) %>%
dplyr::left_join(
y = master.Table %>%
dplyr::filter(
!is.na(GenomeScan_ID)
) %>%
dplyr::mutate(
id = dplyr::case_when(
stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()),
TRUE ~ sample.id
),
gender = dplyr::case_when(
stringr::str_trim(gender) == "" ~ "water",
!is.na(gender) ~ as.character(gender)
)
) %>%
dplyr::select(
GenomeScan_ID,
gender,
id
),
by = c("sample.id" = "GenomeScan_ID")
)
norm.plot <- norm.data %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
x = id,
y = expr.value,
fill = gender
)
) +
ggplot2::geom_boxplot() +
ggplot2::scale_fill_manual(
values = c(
"male" = "blue",
"female" = "red",
"water" = "green"
)
) +
ggplot2::labs(
title = "Normalized expression values distribution",
y = "Normalized expression values (limma::voom)",
x = "Sample",
gender = "Gender"
) +
ggprism::theme_prism() +
ggplot2::theme(
axis.text.x = ggplot2::element_text(angle = 90)
)
ggplot2::ggsave(
filename = file.path(results.dir, "counts.per.sample.normalised.png"),
plot = norm.plot,
width = 40,
height = 20,
units = "cm"
)
expr.data <- expression.data %>%
tidyr::gather(
key = "sample.id",
value = "expr.value",
-Gene
) %>%
dplyr::filter(
expr.value != 0
) %>%
dplyr::left_join(
y = master.Table %>%
dplyr::filter(
!is.na(GenomeScan_ID)
) %>%
dplyr::mutate(
id = dplyr::case_when(
stringr::str_trim(gender) == "" ~ paste0("Water ", dplyr::row_number()),
TRUE ~ sample.id
),
gender = dplyr::case_when(
stringr::str_trim(gender) == "" ~ "water",
!is.na(gender) ~ as.character(gender)
)
) %>%
dplyr::select(
GenomeScan_ID,
gender,
id
),
by = c("sample.id" = "GenomeScan_ID")
)
expr.plot <- expr.data %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
x = id,
y = expr.value,
fill = gender
)
) +
ggplot2::geom_boxplot() +
ggplot2::scale_fill_manual(
values = c(
"male" = "blue",
"female" = "red",
"water" = "green"
)
) +
ggplot2::scale_y_continuous(trans='log2') +
ggplot2::labs(
title = "Raw expression values distribution, without zero's",
y = "Expression values",
x = "Sample",
gender = "Gender"
) +
ggprism::theme_prism() +
ggplot2::theme(
axis.text.x = ggplot2::element_text(angle = 90)
)
ggplot2::ggsave(
filename = file.path(results.dir, "counts.per.sample.raw.zeros.removed.png"),
plot = expr.plot,
width = 40,
height = 20,
units = "cm"
)

View File

@ -0,0 +1,189 @@
library(tidyverse)
library(ggfortify)
library(ggprism)
library(limma)
library(biomaRt)
library(PCAtools)
library(gtools)
library(edgeR)
library(ggprism)
library(foreign)
# Global variables
results.dir <- file.path("results.nosync", "RNA-Seq QC")
data.dir <- "Data"
patient.dir <- file.path(data.dir, "Patients")
sample.dir <- file.path(data.dir, "Samples")
expression.dir <- file.path(data.dir, "mRNA - RNA-Seq")
dir.create(results.dir, recursive = TRUE)
####
# Helper functions
####
select.columns.in.order <- function(dataframe, columns) {
dataframe[, columns]
}
select.rows.in.order <- function(dataframe, rows) {
dataframe[rows,]
}
getGenedataByEnsemblId38 <- function(ensemblIds, file.location) {
file.name <- file.path(file.location, "genes_info_hg38.csv")
if (!file.exists(file.name)) {
if (!("mart" %in% ls())) {
assign("mart", useEnsembl(
biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl"
))
}
gene.list <- getBM(
filters = "ensembl_gene_id",
attributes = c(
"hgnc_symbol",
"ensembl_gene_id",
"ensembl_transcript_id",
"chromosome_name",
"start_position",
"end_position",
"strand",
"transcription_start_site",
"transcript_start",
"transcript_end",
"external_gene_name"
),
values = as.character(ensemblIds),
mart = mart
)
readr::write_csv(gene.list, path = file.name)
}
return(
readr::read_csv(
file.name,
col_types = readr::cols()
)
)
}
#remove.rows.with.count.less.then <- function(dataframe, minRowCount, columns.to.exclude) {
# dataframe %>%
# dplyr::filter(
# rowSums(dplyr::select(., -tidyselect::one_of(columns.to.exclude))) < minRowCount
# )
#}
limma.voom.convert.column <- function(dataframe, columnname) {
dataframe %>%
tibble::column_to_rownames(columnname) %>%
limma::voom() %>%
as.data.frame() %>%
tibble::rownames_to_column(columnname)
}
select.columns.in.order <- function(dataframe, columns) {
dataframe[, columns]
}
drop.columns.if.all.same.value <- function(dataframe) {
for (name in colnames(dataframe)) {
is.all.same <- (dataframe[, name] %>% unique() %>% length()) <= 1
if (is.all.same) {
dataframe <- dataframe %>%
dplyr::select(
-tidyselect::one_of(name)
)
}
}
dataframe
}
# Load data
master.Table <- foreign::read.spss(
file.path(patient.dir, "PRESTO proteogenomics full data sat - ver 7.5.sav")
) %>%
as.data.frame() %>%
dplyr::mutate(
GenomeScan_ID = stringr::str_trim(GenomeScan_ID),
gender = forcats::fct_recode(
Gender,
female = "f",
male = "m",
other = ""
),
age = as.numeric(Age),
smoking.status = forcats::fct_recode(
Smoking_status,
`Ex-smoker` = "ES ",
`Current smoker` = "CS ",
other = " "
)
)
expression.data <- readr::read_tsv(
file.path(expression.dir, "20200427_103972-001_rawcounts.txt"),
col_types = readr::cols()
)
gene.data <- getGenedataByEnsemblId38(
ensemblIds = expression.data$Gene,
file.location = expression.dir
) %>%
dplyr::group_by(hgnc_symbol) %>%
dplyr::filter(
dplyr::row_number() == 1,
!is.na(hgnc_symbol),
hgnc_symbol != ""
) %>%
dplyr::ungroup() %>%
dplyr::select(
hgnc_symbol,
ensembl_gene_id,
chromosome_name,
transcript_start,
transcript_end
)
master.Table <- master.Table %>%
dplyr::mutate(
Group_simple2 = stringr::str_trim(Group_simple2),
Group_simple = stringr::str_trim(Group_simple),
T_number = as.character(T_number),
sample.id = stringr::str_trim(PRESTO_ID)
) %>%
dplyr::filter(
# # According to Niek, I should not include this, for whatever reason
#!(GenomeScan_ID %in% c(
# "T02-01796",
# "T02-03095",
# "T02-10683",
# "T10-18671",
# "T12-12036"
# )
# ),
#
# # (According to Niek, don't include) Water Controls
#!stringr::str_detect(T_number, pattern="Water"),
#
# # (According to Niek, don't include)never smoker controles
#!(Group_simple2 == "NS_Ctrl"),
#
# # (According to Niek, don't include)ALFA1 patiënten
#!(Group_simple == "ALFA"),
#
#stringr::str_trim(Passed_RNAseq_library_prep_QC_Y_N) == "Y",
GenomeScan_ID %in% colnames(expression.data),
!is.na(sample.id),
sample.id != ""
)
expression.data <- expression.data %>%
select.columns.in.order(
c("Gene", master.Table$GenomeScan_ID)
)