Friday, April 3, 2015

Analysis of Agilent Array-CGH Data Using the R DNAcopy

# The code is for the analysis of Agilent high density Array-CGH data 
# to identify copy number variation regions. 
# A Bioconductor package DNAcopy is used. 
# The array analyzed is SurePrint G3 Mouse CGH Microarray 1x1M_1M, 
# but the code should work as well for other arrays

rm(list=ls())
library("DNAcopy")

#set the working directory where the raw data saved
setwd("~/aCGH/")


# define a function "convert_rawdata"
# This function is used to extract raw logR ratio data and chromosome positions to a data frame subject
convert_rawdata <- function (raw_file)
{
  temp.file <- read.table(file = raw_file, skip = 9, 
                          header=T, stringsAsFactors =FALSE)

# Omit dark corners and other probes which have no informations
  usable <- grep("chr",temp.file$SystematicName)

# Extract the positions
  temp.pos = as.vector(sapply(strsplit
            (as.character(temp.file[usable,]$SystematicName),":"), "[", 2))
  final.pos = sapply(strsplit(temp.pos,"-"),"[",1)

# Extract the Chromosome 
  temp.chr = as.vector(sapply(strsplit(as.character(
                          temp.file[usable,]$SystematicName),":"), "[", 1))
  final.chr <- gsub("chr", "", temp.chr)
  
# Prepare the data frame 
  
  obj_df <- data.frame(logR=temp.file[usable,]$LogRatio,
                       Chr=final.chr,
                       Pos=as.numeric(final.pos)/1000)
  return(obj_df)
}

# It will take quite a while for this step
data_1 <- convert_rawdata("Cy5_mutant1_vs_Cy3_WT.txt")
data_2 <- convert_rawdata("Cy5_mutant2_vs_Cy3_WT.txt")


# prepare for the CNA object
CNA.object <- CNA(cbind(data_1$logR,
                        data_2$logR),
                  data_1$Chr, # Not Numeric
                  as.numeric(data_1$Pos),
                  data.type="logratio",
                  sampleid=c("mutant1","mutant2")
)


# Run smoothing 
smotthed.CNA.object <- smooth.CNA(CNA.object)

# Run segmentation analysis
segment.smoothed.CNA.object <- segment(smotthed.CNA.object, verbose = 1)

# Check the initial plot
plot(segment.smoothed.CNA.object,plot.type="s")

# Save the output data 
segments <- segment.smoothed.CNA.object$output
signal_data <- as.data.frame(segment.smoothed.CNA.object$data)

# reset the positions
colnames(segments)
segments$loc.start = segments$loc.start*1000
segments$loc.end = segments$loc.end*1000
signal_data$maploc = signal_data$maploc*1000

# Log10 -> Log2 convertion 
# Log2 is generally used to interpret the data, 0.5 indicates 3 copy duplication.
signal_data[,c(3,4)] <- log2(10)*signal_data[,c(3,4)]
segments$seg.mean <-  log2(10)*segments$seg.mean

# Save the segmentation file
write.csv(segments, file="segmentation.csv")