Compare commits
	
		
			2 Commits
		
	
	
		
			step3/star
			...
			master
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | e3c88c059f | ||
|  | 1110d00e8c | 
							
								
								
									
										4
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,4 +0,0 @@ | |||||||
|  |  | ||||||
| .DS_Store |  | ||||||
| *.nosync |  | ||||||
| *.RData |  | ||||||
							
								
								
									
										
											BIN
										
									
								
								rnaseq/general_files/describing_general_files.docx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								rnaseq/general_files/describing_general_files.docx
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										23
									
								
								rnaseq/step1_fastqc/00_fastqc.pl
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										23
									
								
								rnaseq/step1_fastqc/00_fastqc.pl
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,23 @@ | |||||||
|  | #!/usr/bin/perl -w | ||||||
|  | use strict; | ||||||
|  |  | ||||||
|  | # this script was made with consideration for UMI-deduplicating. | ||||||
|  | # this is because there are three .fastq files for each sample.  | ||||||
|  | # the provider states the info about which file contains which info, | ||||||
|  | # but in our case, from GenomeScan in Leiden, R2 contains the UMI read. | ||||||
|  | # R1 and R3 contain sequencing information from paired-end sequencing | ||||||
|  |  | ||||||
|  | foreach my $file1 ( <*_R1.fastq.gz> ) { | ||||||
|  |     my $file2 = $file1; | ||||||
|  |     $file2 =~ s/\_R1\./_R2./; | ||||||
|  |     my $file3 =~ s/\_R3\./_R2./; | ||||||
|  |     die "file1==file2" if $file1 eq $file2; | ||||||
|  |     my $sample = $file1; | ||||||
|  |     $sample =~ s/\_R1\.fastq\.gz$//; | ||||||
|  |     mkdir $sample.'_R1', 0700; | ||||||
|  |     system join(' ', 'fastqc', '-o', $sample.'_R1', $file1); | ||||||
|  |     mkdir $sample.'_R2', 0700; | ||||||
|  |     system join(' ', 'fastqc', '-o', $sample.'_R2', $file2); | ||||||
|  |     mkdir $sample.'_R3', 0700; | ||||||
|  |     system join(' ', 'fastqc', '-o', $sample.'_R3', $file3) | ||||||
|  | } | ||||||
							
								
								
									
										27
									
								
								rnaseq/step1_fastqc/00_fastqc.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								rnaseq/step1_fastqc/00_fastqc.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | |||||||
|  | #!/bin/bash | ||||||
|  | #SBATCH --job-name=FastQC.for.alveolar_type_2 | ||||||
|  | #SBATCH --comment=FastQC.for.alveolar_type_2 | ||||||
|  | #SBATCH --time=48:00:00 | ||||||
|  | #SBATCH --mincpus=2 | ||||||
|  | #SBATCH --mem=20G | ||||||
|  | #SBATCH --qos=priority | ||||||
|  |  | ||||||
|  | # For 173 samples, it will take about 24 hrs to run with about 15Gb of memory. | ||||||
|  | # Should probably parallelize the perl script/make it a bash/slurm script. | ||||||
|  |  | ||||||
|  | module purge | ||||||
|  | module load Perl/5.26.2-foss-2015b-bare | ||||||
|  | module load BioPerl/1.6.924-foss-2015b-Perl-5.22.0 | ||||||
|  | module load Java/11.0.2 | ||||||
|  | module load FastQC/0.11.7-Java-1.8.0_144-unlimited_JCE | ||||||
|  |  | ||||||
|  | # Please see | ||||||
|  | # https://www.youtube.com/watch?v=0Rj_xNuyOyQ | ||||||
|  |  | ||||||
|  | cd /groups/umcg-griac/tmp04/projects/umcg-rbults/alveolar_type2_fastq/ | ||||||
|  | perl scripts/00_fastqc.pl | ||||||
|  |  | ||||||
|  | mkdir rene_FastQC.results | ||||||
|  | find . -maxdepth 1 -type d -iname "*_R[123]" -exec mv {} ./rene_FastQC.results/ \; | ||||||
|  | #find . -maxdepth 1 -type f -iname "*.htm*" -exec mv {} ./FastQC.results/ \; | ||||||
|  |  | ||||||
| @@ -1,5 +0,0 @@ | |||||||
| # The command to do a FastQC on a fastq file is |  | ||||||
| file="file_to_analyse.fq.gz" |  | ||||||
| fastqc_out="./path/to/fastqc/output/dir" |  | ||||||
|  |  | ||||||
| fastqc -o "$fastqc_out" "$file" |  | ||||||
|   | |||||||
| @@ -1,41 +1,36 @@ | |||||||
|  | ## this script is to generate jobs of trimming for each samples on the cluster | ||||||
|  | ## please run this script first and then submit the jobs for each samples | ||||||
|  | ## reference: http://www.usadellab.org/cms/?page=trimmomatic | ||||||
|  |  | ||||||
| #!/bin/bash | #!/bin/bash | ||||||
|  | # $1 indicates the path of raw samples.  | ||||||
|  | # In the input folder, one sample has one independent folder with two pair-end f | ||||||
|  | astq files.  | ||||||
|  | # The folder name should be the sample name.  | ||||||
|  | # the fastq file should be sample_1.fastq and sample_2.fastq  | ||||||
|  | # please prepare a sample.list that include file names for each sample | ||||||
|  |  | ||||||
| # reference: http://www.usadellab.org/cms/?page=trimmomatic | out="/ * your output folder * /" | ||||||
|  | input="/ * your input folder * /" | ||||||
|  | cat sample.list | while read line | ||||||
|  |  | ||||||
|  | do | ||||||
|  | sample=$(echo $line) | ||||||
|  | echo '#!/bin/bash'  > rnaseq.${sample}.sh | ||||||
|  | echo "#SBATCH --job-name=RNAseq.${sample}" >> rnaseq.${sample}.sh | ||||||
|  | echo "#SBATCH --error=RNAseq.${sample}.err" >> rnaseq.${sample}.sh | ||||||
|  | echo "#SBATCH --output=RNAseq.${sample}.out" >> rnaseq.${sample}.sh | ||||||
|  | echo "#SBATCH --mem=15gb" >> rnaseq.${sample}.sh | ||||||
|  | echo "#SBATCH --time=6:00:00" >> rnaseq.${sample}.sh | ||||||
|  | echo "#SBATCH --cpus-per-task=6" >> rnaseq.${sample}.sh | ||||||
|  |  | ||||||
| module load Trimmomatic | echo "ml Java" >>rnaseq.${sample}.sh | ||||||
|  |  | ||||||
|  | echo "java -jar /* your folder of software */trimmomatic-0.36.jar PE \ | ||||||
|  |   -phred33 /$input/${sample}\_1.fq.gz /$input/${sample}\_2.fq.gz \ | ||||||
|  |    $out/trimmomatic/${sample}\_1_paired.fq $out/trimmomatic/${sample}\_1_unpaired.fq \ | ||||||
|  |    $out/trimmomatic/${sample}\_2_paired.fq $out/trimmomatic/${sample}\_2_unpaired.fq \ | ||||||
|  |    ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \ | ||||||
|  |    LEADING:3 TRAILING:3 SLIDINGWINDOW:4:25 HEADCROP:8 MINLEN:50" >> rnaseq.${sample}.sh | ||||||
|  |  | ||||||
| # Adapters can be found at | done | ||||||
| # https://github.com/timflutre/trimmomatic/tree/master/adapters |  | ||||||
| # But should be verified with FastQC, or in another way. |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # (Example) Paired end |  | ||||||
| java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar PE \ |  | ||||||
|   -phred33 \ |  | ||||||
|   input_forward.fq.gz \ |  | ||||||
|   input_reverse.fq.gz \ |  | ||||||
|   output_forward_paired.fq.gz \ |  | ||||||
|   output_forward_unpaired.fq.gz \ |  | ||||||
|   output_reverse_paired.fq.gz \ |  | ||||||
|   output_reverse_unpaired.fq.gz \ |  | ||||||
|   ILLUMINACLIP: TruSeq3-PE.fa:2:30:10 \ |  | ||||||
|   LEADING:3 \ |  | ||||||
|   TRAILING:3 \ |  | ||||||
|   SLIDINGWINDOW:4:25 \ |  | ||||||
|   HEADCROP:8 \ |  | ||||||
|   MINLEN:50 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # (Example) Single end |  | ||||||
| java -jar $EBROOTTRIMMOMATIC/trimmomatic.jar SE \ |  | ||||||
|   -phred33 \ |  | ||||||
|   input.fq.gz \ |  | ||||||
|   output.fq.gz \ |  | ||||||
|   ILLUMINACLIP:TruSeq3-SE:2:30:10 \ |  | ||||||
|   LEADING:3 \ |  | ||||||
|   TRAILING:3 \ |  | ||||||
|   SLIDINGWINDOW:4:15 \ |  | ||||||
|   HEADCROP:8 \ |  | ||||||
|   MINLEN:50 |  | ||||||
| @@ -1,53 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
| # |  | ||||||
| # Align reads against reference genome. |  | ||||||
|  |  | ||||||
| STORAGE="/groups/umcg-griac/tmp04/rawdata/$(whoami)/step3" |  | ||||||
| # Store genome index in this location:. |  | ||||||
| GENOME_INDEX="${STORAGE}/genome_index" |  | ||||||
| mkdir -p "${GENOME_INDEX}" |  | ||||||
| # Store the generated `Aligned.sortedByCoord.out.bam` in this dir. |  | ||||||
| ALIGNMENT_OUTPUT="${STORAGE}/alignment" |  | ||||||
| mkdir -p "${ALIGNMENT_OUTPUT}" |  | ||||||
|  |  | ||||||
| # 1) Generate genome index. |  | ||||||
| # |  | ||||||
| # N.B.: |  | ||||||
| # - We're assuming a read size of 100 bp (--sjdbOverhang 100). Refer back to the |  | ||||||
| #   previous quality control steps if you are unsure about the size. In case of |  | ||||||
| #   reads of varying length, the ideal value is max(ReadLength)-1. |  | ||||||
| # - We're using gzip compressed reference data (--readFilesCommand zcat), i.e., |  | ||||||
| #   .gtf.gz and fa.gz. If not, you can remove the `zcat` flag. |  | ||||||
|  |  | ||||||
| # Storage location reference data (in this case on calculon). |  | ||||||
| REFERENCE_DATA="/groups/umcg-griac/prm02/rawdata/reference/genome" |  | ||||||
| GTF_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.100.gtf.gz" |  | ||||||
| FASTA_FILE="${REFERENCE_DATA}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" |  | ||||||
|  |  | ||||||
| STAR \ |  | ||||||
|     --runThreadN 8 \ |  | ||||||
|     --runMode genomeGenerate \ |  | ||||||
|     --readFilesCommand zcat \ |  | ||||||
|     --sjdbOverhang 100 \ |  | ||||||
|     --genomeFastaFiles ${FASTA_FILE} \ |  | ||||||
|     --sjdbGTFfile ${GTF_FILE} \ |  | ||||||
|     --genomeDir ${GENOME_INDEX} |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # 2) Do the actual alignment. |  | ||||||
| # |  | ||||||
| # N.B.: |  | ||||||
| # - We are assuming paired-end, gzip compressed (--readFilesCommand zcat)  FastQ |  | ||||||
| #   files. |  | ||||||
|  |  | ||||||
| # THe compressed paired-end FastQ's that we are aligning. |  | ||||||
| R1="sample1_R1.fastq.gz" |  | ||||||
| R2="sample1_R2.fastq.gz" |  | ||||||
|  |  | ||||||
| STAR \ |  | ||||||
|     --runThreadN 8 \ |  | ||||||
|     --readFilesCommand zcat \ |  | ||||||
|     --readFilesIn "${R1}" "${R2}" \ |  | ||||||
|     --outSAMtype BAM SortedByCoordinate \ |  | ||||||
|     --genomeDir ${GENOME_INDEX} \ |  | ||||||
|     --outFileNamePrefix "${ALIGNMENT_OUTPUT}" |  | ||||||
|   | |||||||
| @@ -1,81 +0,0 @@ | |||||||
| # Principle Component Analysis |  | ||||||
| # Normalized with limma::voom |  | ||||||
| library(limma) |  | ||||||
| library(tidyverse) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # expression.data. Has columns: |  | ||||||
| # - Gene (gene identifier) |  | ||||||
| # - [Sample identifiers] |  | ||||||
| expression.data <- "mrna_counts_table.csv" |  | ||||||
| # results.dir. The directory of where to put the resulting tables (and later your plots.) |  | ||||||
| results.dir <- "results" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # PCA variables |  | ||||||
| do.center = TRUE |  | ||||||
| do.scale = FALSE |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # The analysis |  | ||||||
| # We ise prcomp to calculate the PCAs. Afterwards you should plot the results. |  | ||||||
| norm.expr.data <- expression.data %>% |  | ||||||
| 	tibble::column_to_rownames("Gene") |  | ||||||
| norm.expr.data <- norm.expr.data[rowSums(norm.expr.data) >= 10,] %>% |  | ||||||
| 	limma::voom() %>% |  | ||||||
| 	as.matrix() |  | ||||||
|  |  | ||||||
| # Principle Component analysis |  | ||||||
| results.dir.pca <- file.path(results.dir, "principle.components") |  | ||||||
| dir.create(results.dir.pca, recursive=TRUE) |  | ||||||
|  |  | ||||||
| norm.expr.data.pcs <- norm.expr.data %>% |  | ||||||
|   t() %>% |  | ||||||
|   stats::prcomp( |  | ||||||
|   	center = do.center, |  | ||||||
|   	scale. = do.scale |  | ||||||
|   ) |  | ||||||
|  |  | ||||||
| # Write summary of PCAs to files |  | ||||||
| pcs.summery <- summary(norm.expr.data.pcs) |  | ||||||
| pcs.summery$importance %>% |  | ||||||
| 	t() %>% |  | ||||||
| 	as.data.frame() %>% |  | ||||||
| 	tibble::rownames_to_column("PC.name") %>% |  | ||||||
| 	readr::write_csv( |  | ||||||
| 		file.path(results.dir.pca, "importance.csv") |  | ||||||
| 	) |  | ||||||
|  |  | ||||||
| pcs.summery$x %>% |  | ||||||
| 	t() %>% |  | ||||||
| 	as.data.frame() %>% |  | ||||||
| 	tibble::rownames_to_column("ensembl.id") %>% |  | ||||||
| 	readr::write_csv( |  | ||||||
| 		file.path(results.dir.pca, "values.csv") |  | ||||||
| 	) |  | ||||||
|  |  | ||||||
| pcs.summery$rotation %>% |  | ||||||
| 	t() %>% |  | ||||||
| 	as.data.frame() %>% |  | ||||||
| 	tibble::rownames_to_column("sample.id") %>% |  | ||||||
| 	readr::write_csv( |  | ||||||
| 		file.path(results.dir.pca, "rotation.csv") |  | ||||||
| 	) |  | ||||||
|  |  | ||||||
| data.frame( |  | ||||||
| 		rownames = names(pcs.summery$center), |  | ||||||
| 		center = pcs.summery$center, |  | ||||||
| 		scale = pcs.summery$scale |  | ||||||
| 	) %>% |  | ||||||
| 	readr::write_csv( |  | ||||||
| 		file.path(results.dir.pca, "rest.csv") |  | ||||||
| 	) |  | ||||||
|  |  | ||||||
| # Not saved: pcs.summery$sdev, |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Next thing to do: |  | ||||||
| # - (Optional) scree plot - to determine the optimal cutoff for PCA inclusion based on explaination of variance |  | ||||||
| # - (Optional) eigencorplot - to correlate PCAs to clinical variables so that you know which PCA to include for which analysis |  | ||||||
| # - (Optional) pairsplot - plot multiple PCAs against each other in a single figure |  | ||||||
| # - Plot the first couple of PCAs against each other |  | ||||||
| @@ -1,112 +0,0 @@ | |||||||
| # Gender QC |  | ||||||
| # Normalized with limma::voom |  | ||||||
| library(GSVA) |  | ||||||
| library(limma) |  | ||||||
| library(edgeR) |  | ||||||
| library(tidyverse) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # expression.data. Has columns: |  | ||||||
| # - Gene (gene identifier) |  | ||||||
| # - [Sample identifiers] |  | ||||||
| expression.data <- "mrna_counts_table.csv" |  | ||||||
| # master.Table. Has columns: |  | ||||||
| # - GenomeScan_ID |  | ||||||
| # - gender, levels = c("male", "female") |  | ||||||
| # - age |  | ||||||
| # - factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) |  | ||||||
| master.Table <- "patient_table.csv" |  | ||||||
| # results.dir. The directory of where to put the resulting tables (and later your plots.) |  | ||||||
| results.dir <- "results" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # The analysis |  | ||||||
| # We first do a differential expression analysis on gender using EdgeR. |  | ||||||
| # Afterwards you should plot these results. |  | ||||||
| x.genes <- gene.data %>% |  | ||||||
|     dplyr::filter(chromosome_name == "X") %>% |  | ||||||
|     dplyr::pull(ensembl_gene_id) |  | ||||||
|  |  | ||||||
| y.genes <- gene.data %>% |  | ||||||
|     dplyr::filter(chromosome_name == "Y") %>% |  | ||||||
|     dplyr::pull(ensembl_gene_id) |  | ||||||
|  |  | ||||||
| # Gender QC |  | ||||||
| results.dir.gender <- file.path(results.dir, "gender.check") |  | ||||||
| dir.create(results.dir.gender, recursive=TRUE) |  | ||||||
|  |  | ||||||
| # Differential Expression on Gender |  | ||||||
| gender.qc.patients <- master.Table %>% |  | ||||||
|     dplyr::filter( |  | ||||||
|         !is.na(GenomeScan_ID) |  | ||||||
|     ) %>% |  | ||||||
|     dplyr::mutate( |  | ||||||
|         gender = factor(gender, levels = c("male", "female")), |  | ||||||
|         age = as.numeric(age), |  | ||||||
|         smoking.status = factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) |  | ||||||
|     ) %>% |  | ||||||
|     dplyr::filter( |  | ||||||
|         !is.na(gender) |  | ||||||
|     ) |  | ||||||
| gender.qc.sample.order <- gender.qc.patients %>% |  | ||||||
|     dplyr::pull(GenomeScan_ID) |  | ||||||
|  |  | ||||||
| gender.qc.expression.data <- expression.data %>% |  | ||||||
|     tibble::column_to_rownames("Gene") %>% |  | ||||||
|     select.columns.in.order(gender.qc.sample.order) %>% |  | ||||||
|     as.matrix() |  | ||||||
|  |  | ||||||
| design <- model.matrix( ~0 + gender, data = gender.qc.patients) |  | ||||||
|  |  | ||||||
| DGEL <- edgeR::DGEList(gender.qc.expression.data) |  | ||||||
| keep <- edgeR::filterByExpr(DGEL) |  | ||||||
| keep[names(keep) %in% x.genes] <- TRUE |  | ||||||
| keep[names(keep) %in% y.genes] <- TRUE |  | ||||||
| DGEL <- DGEL[keep, , keep.lib.sizes=FALSE] |  | ||||||
| DGEL <- edgeR::calcNormFactors(DGEL, method = "TMM") |  | ||||||
|  |  | ||||||
| DGEL <- edgeR::estimateDisp(DGEL, design) |  | ||||||
| fit <- edgeR::glmQLFit(DGEL,design) |  | ||||||
|  |  | ||||||
| contrasts <- limma::makeContrasts( |  | ||||||
|     gender = gendermale - genderfemale, |  | ||||||
|     levels = design |  | ||||||
| ) |  | ||||||
| qlf   <- edgeR::glmQLFTest(fit, contrast = contrasts[,"gender"]) |  | ||||||
|  |  | ||||||
| gender.qc.results <- edgeR::topTags( |  | ||||||
|         qlf, |  | ||||||
|         n=nrow(DGEL) |  | ||||||
|     )$table %>% |  | ||||||
|     tibble::rownames_to_column("ensembl.id") %>% |  | ||||||
|     dplyr::left_join( |  | ||||||
|         y = gene.data, |  | ||||||
|         by = c("ensembl.id" = "ensembl_gene_id") |  | ||||||
|     ) %>% |  | ||||||
|     readr::write_csv( |  | ||||||
|         file.path(results.dir.gender, "differential.expression.on.gender.csv") |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
| # Plotting of gender expression |  | ||||||
| results.dir.gender.plot <- file.path(results.dir.gender, "img") |  | ||||||
| dir.create(results.dir.gender.plot, recursive=TRUE) |  | ||||||
|  |  | ||||||
| gender.qc.genes.to.plot <- gender.qc.results %>% |  | ||||||
|     dplyr::arrange(PValue) %>% |  | ||||||
|     dplyr::group_by(chromosome_name) %>% |  | ||||||
|     dplyr::filter( |  | ||||||
|         ( |  | ||||||
|             chromosome_name %in% c("X", "Y") & |  | ||||||
|             FDR < 0.05 & |  | ||||||
|             dplyr::row_number() <= 5 |  | ||||||
|         ) |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Next thing to do: |  | ||||||
| # - Plot the normalized expressino values for the genes in gender.qc.genes.to.plot in a boxplot, split and colored by gender. |  | ||||||
| # - (Optional) Do a GSVA with as genesets the genes found in gender.qc.genes.to.plot. Plot the boxplots as per the previous point. |  | ||||||
| # - (Optional) Plot the number of Y-chromosome reads devided by the number of X chromosome reads in a boxplot as per the first point. |  | ||||||
| #     x=$(samtools view -q 20 -f 2 $bam_file X | wc -l) |  | ||||||
| #     y=$(samtools view -q 20 -f 2 $bam_file Y | wc -l) |  | ||||||
| # - (Optional) Plot the number of Y-chromosome SNPs devided by the number of X chromosome SNPs in a boxplot as per the first point. |  | ||||||
| @@ -1,39 +0,0 @@ | |||||||
| # Total counts per sample |  | ||||||
| # Normalized with limma::voom |  | ||||||
| library(limma) |  | ||||||
| library(tidyverse) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # expression.data. Has columns: |  | ||||||
| # - Gene (gene identifier) |  | ||||||
| # - [Sample identifiers] |  | ||||||
| expression.data <- "mrna_counts_table.csv" |  | ||||||
| # master.Table. Has columns: |  | ||||||
| # - GenomeScan_ID |  | ||||||
| # - gender, levels = c("male", "female") |  | ||||||
| # - age |  | ||||||
| # - factor(smoking.status, levels = c("Ex-smoker", "Current smoker")) |  | ||||||
| master.Table <- "patient_table.csv" |  | ||||||
| # results.dir. The directory of where to put the resulting tables (and later your plots.) |  | ||||||
| results.dir <- "results" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # The analysis |  | ||||||
| # We calculate the number of mapped reads per sample. |  | ||||||
| total.count.per.sample <- expression.data %>% |  | ||||||
| 	tibble::column_to_rownames("Gene") %>% |  | ||||||
| 	colSums() |  | ||||||
|  |  | ||||||
| data.frame( |  | ||||||
| 		sample = names(total.count.per.sample), |  | ||||||
| 		counts = as.numeric(total.count.per.sample) |  | ||||||
| 	) %>% |  | ||||||
| 	readr::write_csv(file.path(results.dir, "total.counts.per.sample.csv")) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Next thing to do: |  | ||||||
| # - Check the number of reads per sample in total.counts.per.sample.csv |  | ||||||
| # - Plot the reads distribution (all reads) per sample in a boxplot. |  | ||||||
| # - (Optional) Calculate the number of unmapped, multimapped, unique mapped to |  | ||||||
| #   feature and unique mapped to no feature and plot these in a stacked bar graph. |  | ||||||
|  |  | ||||||
							
								
								
									
										0
									
								
								rnaseq/step6_overall_QC/snippet.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								rnaseq/step6_overall_QC/snippet.R
									
									
									
									
									
										Normal file
									
								
							
		Reference in New Issue
	
	Block a user