Tuesday, May 26, 2015

How to Perform Basic RNA-seq Analysis with Cufflinks

Brief outline

This is a Perl wrapper for RNA-seq analysis to identify differentially expressed genes.
Basically I follow this Nature Protocol by Cufflinks' authors, but with a more updated workflow recommendatd for cufflinks 2.2 (in which the cuffquants is used to generate CXB file for final analysis).
The script can be download from this link as well.


#!/usr/bin/perl
use strict;
use 5.010;
use File::Basename qw/fileparse/;
use POSIX qw/strftime/;

# ----------------------------------------------------------------------
# Basic Information
# ----------------------------------------------------------------------
# For RNA-Seq Analysis of snoRNA-Knockdown experiment  
# @ RIKEN BSI, Mental Biology Team
# @ Author  : Psytky03
# @ Date : 2015-05-22
# @ Software Versions
# Tophat  : 2.014
# Bowtie 2 : 2.2.3
# cufflinks : 2.2.1
# samtools  : 0.1.19
# GFOLD  : 
# ----------------------------------------------------------------------


# ----------------------------------------------------------------------
# FASTQ samples - in fastq.gz format
# ----------------------------------------------------------------------
# GFP_B6_DIV4.fastq.gz
# GFP_ICR_DIV10.fastq.gz
# GFP_ICR_DIV4.fastq.gz
# MBII52_B6_DIV4.fastq.gz
# MBII52_ICR_DIV10.fastq.gz
# MBII52_ICR_DIV4.fastq.gz
# MBII85_B6_DIV4.fastq.gz
# MBII85_ICR_DIV10.fastq.gz
# MBII85_ICR_DIV4.fastq.gz
# ----------------------------------------------------------------------



# ----------------------------------------------------------------------
# Files and Parameters
# ----------------------------------------------------------------------
# my $simulate = 0; # run the program
  my $simulate = 1; # only show the simulated run and commands

  my $GTF_file     = "genes.gtf"; # UCSC GTF
  my $Genome_Fasta_Prefix  = "genome";  # mm10 
  my $Genome_Fasta_file  = "genome.fa"; # m10 whole genome fasta 
  
  my $assemblies_file  = "assemblies_2.txt"; # desired assemblies output file 

  my $tophat_threads = 11;
  my $cufflinks_thread  = 11;
  my $cuffmerge_thread  = 11;
# ----------------------------------------------------------------------




# ----------------------------------------------------------------------
# Run Tophat - Cufflinks 
# ----------------------------------------------------------------------
open(ASSEMBLIES, ">$assemblies_file");

my @gzfiles = <*.fastq.gz>;
my @accpeted_bam_files; 




foreach my $file (@gzfiles) 
{  
 my ($basename, $path, $suffix) = fileparse($file, qw/.fastq.gz/);
 my $current_time    = strftime('%Y-%m-%d %H:%M:%S',localtime);
 
 my $tophat_output_dir = $basename."_tophat_out";
 my $cufflinks_output_dir = $basename."_cufflinks_out";
 
 
 say "[$current_time]\tNow is processing $file"; 
 
 my $accepted_bam = "./$tophat_output_dir/accepted_hits.bam";
 push @accpeted_bam_files, $accepted_bam;
 
 my $tophat_command   = "tophat2 -p $tophat_threads -G $GTF_file -o $tophat_output_dir ".
          "$Genome_Fasta_Prefix $file";
        
 my $cufflinks_command  = "cufflinks -p $cufflinks_thread -o $cufflinks_output_dir ".
       "$tophat_output_dir/accepted_hits.bam";
 
 if ($simulate)
 { say($tophat_command);
  say($cufflinks_command);
 } else{
  system($tophat_command);
  system($cufflinks_command);
 }
 say ASSEMBLIES "./$cufflinks_output_dir/transcripts.gtf";
 sleep (1);
}

close ASSEMBLIES;
my $current_time    = strftime('%Y-%m-%d %H:%M:%S',localtime);
say "----------------------------------------------------------------------";
say "[$current_time] Finished Tophat - Cufflinks Process"     ;
say "----------------------------------------------------------------------";
# ----------------------------------------------------------------------



# ----------------------------------------------------------------------
# Run Cuffmerge to create a signle merged transcritome annotation
# ----------------------------------------------------------------------

my $cuffmerge_command = "cuffmerge -g $GTF_file -s $Genome_Fasta_file -p $cuffmerge_thread $assemblies_file";

if ($simulate){say($cuffmerge_command)} 
else{system($cuffmerge_command)}

my $current_time    = strftime('%Y-%m-%d %H:%M:%S',localtime);
say "----------------------------------------------------------------------";
say "[$current_time] Finished CuffMerge Process"       ;
say "----------------------------------------------------------------------";

# ----------------------------------------------------------------------




# ----------------------------------------------------------------------
# Run CuffQuant 
# ----------------------------------------------------------------------
my $merged_assemble_file = "./merged_asm/merged.gtf";
my $cuffquant_thread  = 11;

foreach my $bam_file (@accpeted_bam_files)
{
 my ($basename, $path, $suffix) = fileparse($bam_file, qw/.bam/);
 $path =~ s/_tophat_out\///g;
 my $cuffquant_output_dir = $path."_cuffquants_out";
 
 my $cuffquant_command = "cuffquant -p $cuffquant_thread -o $cuffquant_output_dir ".
       "$merged_assemble_file $bam_file";
 
 if ($simulate){say($cuffquant_command)} 
 else{system($cuffquant_command)}
 sleep (1);
}

my $current_time    = strftime('%Y-%m-%d %H:%M:%S',localtime);
say "----------------------------------------------------------------------";
say "[$current_time] Finished CuffQuant Process"  ;
say "----------------------------------------------------------------------";
# ----------------------------------------------------------------------