####################### Input file directorys ##########################################################3 root_data<-"D:/job/sista/txt agilent/EBV/original Agilent txt files" root_anno<-"D:/job/sista/txt agilent/blastomeres/perchannelnormalization/EBV/annotation/" root_GC<-"D:/job/sista/txt agilent/blastomeres/perchannelnormalization/EBV/files for GC percentage/" outputdir<-"D:/job/sista/txt agilent/blastomeres/perchannelnormalization/EBV/channelnormal+GC+afterwave/" thres_bg=5 #########################################################################3 test(root_data="D:/job/sista/txt agilent/EBV/original Agilent txt files", root_anno="D:/job/sista/txt agilent/blastomeres/perchannelnormalization/EBV/annotation/", root_GC="D:/job/sista/txt agilent/blastomeres/perchannelnormalization/EBV/files for GC percentage/", outputdir="D:/", thres_bg=5 ) test<-function(root_data,root_anno,root_GC,outputdir,thres_bg) { library(snapCGH) setwd(root_data) filenames<-dir(root_data,pattern=".txt") arraynumber<-substring(filenames,1,12) anno<-read.table(file=paste(root_anno,"agilent244k_annotation.txt",sep=""),header=TRUE) GC_newlist<-read.table(file=paste(root_GC,"GCpercent_newlist.txt",sep=""),header=TRUE) Internalcontrolflag<-((anno$chromosome=="0")|(anno$startbp=="0") |(is.na(anno$chromosome))|(is.na(anno$startbp)|(anno$startbp>anno$endbp))) randomflag<-(anno[,1])%in%(grep("random",anno[,1],value=TRUE)) annoclean<-anno[(!Internalcontrolflag)&(!randomflag),] for (i in 1:length(filenames)) { cat('i=',i,"\n") fileoriginal <- read.maimages(file=filenames[i], source = 'agilent', columns = list(G = "gMedianSignal", Gb = "gBGMedianSignal", R = "rMedianSignal", Rb = "rBGMedianSignal")) fileoriginal[1,] file<-fileoriginal file$genes<-cbind(anno,file$genes) cleandata<-file[(!Internalcontrolflag)&(!randomflag),] cleandata$genes$chromosome<-as.character(cleandata$genes$chromosome) cleandata$genes$chromosome[cleandata$genes$chromosome=='X']<-'23' cleandata$genes$chromosome[cleandata$genes$chromosome=='Y']<-'24' cleandata$genes$chromosome<-as.numeric(cleandata$genes$chromosome) cleandata<-cleandata[order(cleandata$genes$chromosome,cleandata$genes$startbp,cleandata$genes$endbp),] # Background filter x<-cleandata medianRbg <- apply(x$Rb[x$genes$chromosome < 23, , drop = FALSE], 2, median, na.rm = TRUE) medianGbg <- apply(x$Gb[x$genes$chromosome < 23, , drop = FALSE], 2, median, na.rm = TRUE) thres=thres_bg bgmedianflag<-((cleandata$R> (thres*medianRbg)) & (cleandata$G> (thres*medianGbg))) ################################# cleandata$R[!bgmedianflag]<-NA cleandata$G[!bgmedianflag]<-NA if (i==1) { probelist<-matrix(NA,nrow=nrow(cleandata),ncol=length(filenames)) Gprobelist<-data.frame(cbind(chromosome=cleandata$genes$chromosome,startbp=cleandata$genes$startbp, endbp=cleandata$genes$endbp,probename=cleandata$genes$ProbeName,probelist),stringsAsFactors =FALSE) Rprobelist<-Gprobelist } G<-log2(cleandata$G) R<-log2(cleandata$R) Gprobelist[,4+i]<-G names(Gprobelist)[4+i]<-substring(filenames[i],9,12) Rprobelist[,4+i]<-R names(Rprobelist)[4+i]<-substring(filenames[i],9,12) } # unique probe Gprobelist <- Gprobelist[order(Gprobelist$probename),] Rprobelist <- Rprobelist[order(Rprobelist$probename),] uniqueflag<-!duplicated(Gprobelist$probename) Rprobelist_test2=Gprobelist_test2=Rprobelist[uniqueflag,1:4] Rprobelist_test=Gprobelist_test=matrix(0,nrow=nrow(Rprobelist_test2),ncol=length(arraynumber)) for (i in 1:length(arraynumber)) { Gprobelist_test[,i]<- as.matrix(tapply(Gprobelist[,i+4], Gprobelist$probename, mean, na.rm = TRUE),ncol=1) Rprobelist_test[,i]<- as.matrix(tapply(Rprobelist[,i+4], Rprobelist$probename, mean, na.rm = TRUE),ncol=1) } Rprobelist_test3<-cbind(Rprobelist_test2,Rprobelist_test) Gprobelist_test3<-cbind(Gprobelist_test2,Gprobelist_test) names(Rprobelist_test3)<-names(Rprobelist) names(Gprobelist_test3)<-names(Gprobelist) Gmatrix<-as.matrix(Gprobelist_test3[,5:ncol(Gprobelist_test3)]) Rmatrix<-as.matrix(Rprobelist_test3[,5:ncol(Rprobelist_test3)]) annoname<-Rprobelist_test3[,1:4] # Standization of each channel. G<-Gmatrix R<-Rmatrix testG<-Gmatrix testR<-Rmatrix autoflag<-(as.numeric(annoname$chromosome)<23) for (i in 1:ncol(G)) { testG[,i]<-(G[,i]-mean(G[autoflag,i],trim=0.2,na.rm=TRUE))/sd(G[autoflag,i],na.rm=TRUE) testR[,i]<-(R[,i]-mean(R[autoflag,i],trim=0.2,na.rm=TRUE))/sd(R[autoflag,i],na.rm=TRUE) } G<-data.frame(cbind(annoname,testG),stringsAsFactors =FALSE) R<-data.frame(cbind(annoname,testR),stringsAsFactors =FALSE) # Genome composition correction ###################################### Rprobelist<-R[,5:ncol(R)] Gprobelist<-G[,5:ncol(G)] anno<-R[,1:4] M<-(Rprobelist-Gprobelist) file_unique<-cbind(anno,M) file_unique[,1]<-as.numeric(file_unique[,1]) file_unique[,2]<-as.numeric(file_unique[,2]) file_unique[,3]<-as.numeric(file_unique[,3]) flag_file<-(file_unique$probename %in% GC_newlist$probename) file_unique<-file_unique[flag_file,] flag_GC<- (GC_newlist$probename %in% file_unique$probename) GC_newlist<-GC_newlist[flag_GC,] GC_newlist<-GC_newlist[order(GC_newlist$probename),] file_unique<-file_unique[order(file_unique$probename),] flag_nona<-complete.cases(file_unique) file_GC<-file_unique file_GC[,1:ncol(file_GC)]<-NA for (j in 1:7) { maxGCcol<- which(abs(cor(GC_newlist[flag_nona,5:ncol(GC_newlist)],file_unique[flag_nona,4+j])) ==max(abs(cor(GC_newlist[flag_nona,5:ncol(GC_newlist)],file_unique[flag_nona,4+j])))) GC_newlist_order<-GC_newlist[order(GC_newlist$probename),] file_unique_order<-file_unique[order(file_unique$probename),] flag_NA<-(is.na(file_unique_order[,j+4])) if (j==1) { file_GC[,1:4]<-file_unique_order[,1:4] } # Important. If omit this step, lm will remove NAs, so residuals does not correspond # to the correct startbp. file_unique_order<-file_unique_order[!flag_NA,] GC_newlist_order<-GC_newlist_order[!flag_NA,] w=rep(0,nrow(GC_newlist_order)) w[GC_newlist_order[,4+maxGCcol]>0.5]<-1000 model<-lm(file_unique_order[,j+4]~GC_newlist_order[,4+maxGCcol],weight=w,y=TRUE) M_new<-model$residuals file_GC[!flag_NA,4+j] <- M_new file_GC[flag_NA,4+j] <- NA } for (i in 5:11) { file_GC[,i]<-(file_GC[,i]-median(file_GC[file_GC$chromosome<23,i],na.rm=TRUE)) } ###################################### # Recurrent genome artefacts correction for (i in 5:11) { file_GC[,i]<-(file_GC[,i]-median(file_GC[file_GC$chromosome<23,i],na.rm=TRUE)) } file<-file_GC residuals_tot<-file for (i in 1:length(filenames)) { thischr=23 residuals_tot[residuals_tot$chromosome==thischr,i+4]<-(residuals_tot[residuals_tot$chromosome==thischr,i+4]-median(residuals_tot[residuals_tot$chromosome==thischr,i+4],na.rm=TRUE)) } residuals_mean<-apply(residuals_tot[,5:ncol(residuals_tot)],1,function(x) mean(x,na.rm=TRUE,trim=0.2)) flag_NA<-!(is.na(residuals_mean)) Mprobelist<-file[flag_NA,] residuals_mean<-residuals_mean[flag_NA] waveffect = smooth.spline(residuals_mean,all.knots=TRUE)$y file_tot<-Mprobelist file_afterwave<-Mprobelist for (i in 5:ncol(file_tot)) { file_afterwave[,i]<-file_tot[,i]-waveffect } write.table(file_afterwave,file=paste(outputdir,"Channel_clone.txt",sep=""),quote=FALSE) }