################################################## ## Author: Weibo Xie ## Date: Tue Aug 7 21:50:31 2007 ## ## ## Notes: Part of codes modified from Barley SFP Discovery Experiments by Nils Rostoks et al. ## The details of denomination of .CEL files in the CEL directory were stated in the 'readme.txt' file under the directory. ################################################## setwd("/usr/local/data/supplements/sfp/data/rice") source("../SFP_function.R") library(affy) filenames <- sort(list.celfiles("CEL/",pattern='[MZ][345]',full.names=TRUE)) ################################################## ## How many probe pairs which the signals of mm probes are larger than pm probes? ################################################## tmp.probes <- read.probematrix(filenames=sample(filenames,10),which="both") range(colMeans(tmp.probes$pm>tmp.probes$mm)) ##[1] 0.6564654 0.6849616 ################################################## ## What is the distribution of probes per probesets? ################################################## load("pmIndex.RData");load("EXPprobesets.RData") table(table(pmIndex$probeset)) ## ## 8 9 10 11 16 20 ## 71 67 64 57141 14 24 str(probeset11 <- grep("^Os", names(which(table(pmIndex$probeset)==11)),value=T)) str(probeset11 <- intersect(probeset11,EXPprobesets)) ##save(probeset11,file="probeset11.RData",compress=T) ################################################## ## To construct intensities matrix included all of probesets with 11 probes ################################################## load("all.pm.RData") riceRNA.pm <- log(all.pm[match(pmIndex$probeset,probeset11,nomatch=0)!=0,],2) # log base2 dd <- as.dist((1 - cor(riceRNA.pm)/2));plot(hclust(dd),main="Normalized PM Intensity",xlab="Chips") dev.print(pdf,file="images/hclust_by_cor_riceRNA_pm_int.pdf") rm(all.pm) gc() riceRNA.pm <- riceRNA.pm[,order(colnames(riceRNA.pm))] ##save(riceRNA.pm,file= "riceRNApm.RData", compress =T) ################################################## ## To do median polish and get residuals ################################################## load("riceRNApm.RData") system.time(pm.medpolishres <- get_medpolish_res(riceRNA.pm,ps.n=11)) ##1 minute ##save(pm.medpolishres,file="pm.medpolishres.RData",compress=T) ################################################## ## To identify SFPs from residuals of median polish ################################################## load("pm.medpolishres.RData") library(limma) t1 <- proc.time()[3] (cl <- factor(substring(colnames(pm.medpolishres),1,1))) design <- model.matrix(~0+cl);colnames(design) <- sub('cl','',colnames(design));contrast.matrix <- makeContrasts(Z-M,levels=design) fit <- lmFit(pm.medpolishres,design) fit <- contrasts.fit(fit,contrast.matrix) fit <- eBayes(fit) summary(fit.data <- decideTests(fit,p=1e-3)) t2 <- proc.time()[3] (t2-t1)/60 ## 48.6 seconds ##save(fit,file="fit.RData",compress=T) ##save(fit.data,file='fit.data.RData',compress=T) table(filter <- fit.data!=0) ## FALSE TRUE ## 215765 6655 (fixnum <- sum(fit.data!=0)) fit.data.mp <- fit.data str(mysfp <- data.frame(Probename=I(rownames(fit.data)),p.value=fit$p.value[,1],pq.value=p.adjust(fit$p.value[,1],"fdr"),t=fit$t[,1],lfc=fit$coe[,1])[filter,]) ##write.table(mysfp,file="rice_sfp.xls",row.name=F,quote=F,sep="\t") mysfp$Probeset <- sub("--.*",'',mysfp$Probename) ##How is the distribution of SFP per probeset? (tmp <- table(table(mysfp$Probeset))) ## 1 2 3 4 5 6 7 8 9 10 11 ## 1772 622 267 147 92 65 47 52 25 32 10 sum(tmp);tmp[1]/sum(tmp);sum(tmp[-(1:5)])/sum(tmp); ## [1] 3131 ## 1 ## 0.5659534 ## [1] 0.07377835 ##Does tt seem that SFPs in the same probeset tend to be the same genotype? str(tmp <- intersect(mysfp$Probeset[mysfp$t>0],mysfp$Probeset[mysfp$t<0])) length(tmp)/sum(table(table(mysfp$Probeset))[-1]) ## [1] 0.6637233 ##Cann't find this trend. Need more analysis ################################################## ## Try SAM? ################################################## require(siggenes) cl <- factor(substring(colnames(pm.medpolishres),1,1)) system.time(sam.out <- sam(pm.medpolishres,cl,rand = 123,B=500))##50 minutes ## user system elapsed ## 3046.08 467.20 3618.69 myq.value <- sam.out@q.value (thresh <- quantile(myq.value, fixnum/length(myq.value) )) fit.data <- matrix(rep(0,length(myq.value))) fit.data <- new("TestResults", array(0, c(length(myq.value),1))) rownames(fit.data) <- names(myq.value) fit.data[myq.value0] <- 1 fit.data[myq.value0] <- 1 fit.data[rostoks.pqvaluequantile(tmp,0.9),])/2));plot(hclust(dd),main="Residuals of Median Polish",xlab="Chips") ##dev.print(pdf,file="images/hclust_by_cor_pm.medpolishres_int.pdf") (cl <- factor(substring(colnames(pm.medpolishres),1,1))) res.diff1 <- abs(rowMeans(pm.medpolishres[,cl=='M'])-rowMeans(pm.medpolishres[,cl=='Z'])) res.diff2 <- abs(rowMeans(res.mat[,cl=='M'])-rowMeans(res.mat[,cl=='Z'])) tmpb <- seq(0,10,0.01) hist(pmin(res.diff1,10),tmpb,freq=T,xlim=c(0,1),main="The Distribution of Average Absolute Differences",xlab="Average Absolute Differences Between MH and ZS",col=8) hist(pmin(res.diff2,10),tmpb,freq=T,add=T,border=2) legend(locator(1),legend=c('Median polish','Fitting linear model'),fill=c(8,2)) dev.print(pdf,file="images/The Distribution of Average Absolute Differences_MP_vs_LN.pdf") ################What is the distribution of non-polymorphism genotype's signal intensities of SFP probes ##str(subset <- names(sfp.check)[snp.geno==sfp.check&sfp.check==1]) ##hist(rowMedians(riceRNA.pm[match(subset,rownames(riceRNA.pm)),grep('^Z',colnames(riceRNA.pm))]),100) ################Conclusion: almost larger than 4. ################################################## ## To make figures for article ################################################## load("pmIndex.RData") load("riceRNApm.RData") source("../SFP_function.R") x11(width = 7, height = 5.8, pointsize = 12);par(font.main=1,cex.main=1,font.lab=1,cex.lab=1) load("mysfp.RData") (tmp <- table(table(sub("at.*",'at',mysfp$Probe)))) tmp/sum(tmp) par(mfrow=c(1,1)) par(xpd=T) text(cbind(barplot(tmp <- table(table(sub("at.*",'at',mysfp$Probe))),ylim=c(0,3300),density=30,col=c("lightblue"),xlab="SFPs per ProbeSet",ylab="Number of ProbeSets"),tmp+60),as.character(tmp))#number of SFPs per ProbeSet dev.print(pdf,file="images/sfp_per_Probeset.pdf") text(cbind(barplot({tmp <- table(sub(".*at",'',rownames(pmIndex[match(mysfp$Probe,as.character(pmIndex$probename),nomatch=0),])));tmp <- tmp[order(as.numeric(names(tmp)))]},ylim=c(0,1000),density=30,col=c("lightblue"),xlab="Probe Order",ylab="Number of SFPs"),tmp+50),as.character(tmp)) dev.print(pdf,file="images/sfp_Probe_Order.pdf") ################################################## ## Does flanking SNP influence the hybridization of cRNA? ################################################## load("oryzaSNP.raw.RData") flstart <- 0;flend <- -9;str(flsfp.probes <- unique(oryzaSNP.raw$Probe_name[((oryzaSNP.raw$Pos_forward<=flstart&oryzaSNP.raw$Pos_forward>=flend)|(oryzaSNP.raw$Pos_backward<=flstart&oryzaSNP.raw$Pos_backward>=flend))&oryzaSNP.raw$MH63!=oryzaSNP.raw$Zhen_Shan_97])) flstart <- -10;flend <- -19;str(intflsfp.probes <- unique(oryzaSNP.raw$Probe_name[((oryzaSNP.raw$Pos_forward<=flstart&oryzaSNP.raw$Pos_forward>=flend)|(oryzaSNP.raw$Pos_backward<=flstart&oryzaSNP.raw$Pos_backward>=flend))&oryzaSNP.raw$MH63!=oryzaSNP.raw$Zhen_Shan_97])) str(flsfp.probes <- flsfp.probes[flsfp.probes%in%intersect(nosfp.probes,uniprobes)]) str(intflsfp.probes <- intflsfp.probes[intflsfp.probes%in%intersect(nosfp.probes,uniprobes)]) str(tmp <- intersect(flsfp.probes,intflsfp.probes)) str(flsfp.probes <- flsfp.probes[!flsfp.probes%in%tmp]) str(intflsfp.probes <- intflsfp.probes[!intflsfp.probes%in%tmp]) ##save(flsfp.probes,intflsfp.probes,file="flsfp.probes.RData",compress=T) load("fit.data.RData") load("fit.RData") load("nosfp.probes.RData") load("sfp.probes.RData") ##str(myfit.data <- data.frame(Probename=I(rownames(fit$p.value)),p.value=fit$p.value[,1],pq.value=p.adjust(fit$p.value[,1],"fdr"),t=fit$t[,1],lfc=fit$coefficients[,1])) ##save(myfit.data,file="myfit.data.RData",compress=T) load("myfit.data.RData") load("uniprobes.RData") load("flnosfp.probes.RData") load("flsfp.probes.RData") str(psubset <- intersect(c(flnosfp.probes,flsfp.probes,intflsfp.probes),nosfp.probes)) mydata <- myfit.data[match(psubset,myfit.data$Probename,nomatch=0),] mydata$flSNP <- as.numeric(mydata$Probename%in%flsfp.probes) mydata$intflSNP <- as.numeric(mydata$Probename%in%intflsfp.probes) ##save(mydata,file="mydata080224.RData",compress=T) ##load("mydata080224.RData") mean(mydata$flSNP>0);mean(mydata$intflSNP>0) ## [1] 0.01474636 ## [1] 0.02752654 filter <- mydata$pq.val<0.05&mydata$t!=0;table(tmp <- mydata$flSNP[filter]>0);mean(tmp);table(tmp <- mydata$intflSNP[filter]>0);mean(tmp) ## FALSE TRUE ## 122 9 ## [1] 0.06870229 ## FALSE TRUE ## 121 10 ## [1] 0.07633588 filter <- mydata$pq.val<0.001&mydata$t!=0;table(tmp <- mydata$flSNP[filter]>0);mean(tmp);table(tmp <- mydata$intflSNP[filter]>0);mean(tmp) ## FALSE TRUE ## 30 5 ## [1] 0.1428571 ## FALSE TRUE ## 33 2 ## [1] 0.05714286 ##table(filter <- mydata$pq.val<0.05&mydata$t!=0&(mydata$flSNP>0|mydata$intflSNP>0));tmp <- write.csv(cbind(mydata[filter,],oryzaSNP.raw[match(mydata$Probename[filter],oryzaSNP.raw$Probe_name),]),file="rice_flSNP_probes.csv", row.names = FALSE) str(tmp.flsnp.sfp <- mydata$Probename[mydata$pq.val<0.001&mydata$t!=0&(mydata$flSNP>0)]) par(mfcol=c(1,2)) brs <- quantile(mydata$pq.value,seq(0.0,1,0.002)) ##brs <- seq(0.0,1,0.001) tmp <- cbind(brs,sapply(brs,function(x){mean(mydata$flSNP[mydata$pq.value0)})) ylim <- c(0,0.4) plot(tmp,type='o',ylim=ylim,ylab="Proportion of SFPs with flanking SNPs",xlab="BH adjusted p-value",main="",cex=0.5) title("Proportion of SFPs with SNPs in flanking 10 bp", cex.main= 0.75) tmp <- cbind(brs,sapply(brs,function(x){mean(mydata$intflSNP[mydata$pq.value0)})) plot(tmp,type='o',ylim=ylim,ylab="Proportion of SFPs with flanking SNPs",xlab="BH adjusted p-value",main="",cex=0.5) title("Proportion of SFPs with SNPs in flanking 11-20 bp", cex.main= 0.75) ##lines(tmp,type='o',ylim=ylim,ylab="Proportion of SFPs with flanking SNPs",xlab="BH adjusted p-value",main="Proportion of SFPs with SNPs in flanking 10-20 bp",cex=0.5,col=8) dev.print(pdf,file="images/rice_flSNP_por.pdf",width=8, height=6) ################################################## ## Fisher exact test and Chi test ################################################## load("fit.data.RData") x<-table(mydata$flSNP[mydata$Probename%in%rownames(fit.data)[fit.data!=0]]>0);p <- table(mydata$flSNP);p <- p/sum(p) cbind(p,x,x/sum(x));(rchi <- chisq.test(x, p = p)) cat("Fold: ",(x[2]/sum(x))/p[2],"\n") x<-table(mydata$intflSNP[mydata$Probename%in%rownames(fit.data)[fit.data!=0]]>0);p <- table(mydata$intflSNP);p <- p/sum(p) cbind(p,x,x/sum(x));(rchi <- chisq.test(x, p = p)) cat("Fold: ",(x[2]/sum(x))/p[2],"\n") thread <- 0.001;x.t <- cbind(table(mydata$flSNP[mydata$pq.val>=thread&mydata$t!=0]>0),table(mydata$flSNP[mydata$pq.val0));dimnames(x.t) <- list(c('NoSNP','SNP'),paste(c('>','<'),thread,sep=''));x.t fisher.test(x.t,alter='great') x.t <- cbind(table(mydata$intflSNP[mydata$pq.val>=thread&mydata$t!=0]>0),table(mydata$intflSNP[mydata$pq.val0));dimnames(x.t) <- list(c('NoSNP','SNP'),paste(c('>','<'),thread,sep=''));x.t fisher.test(x.t,alter='great') ################################################################ ################################################## ## Does flsnp influence SFPs detected by using single tissue? ################################################## library(limma) load("riceRNApm.RData") load("pmIndex.RData") load('EXPprobesets.RData');load("uniprobes.RData"); for(stage in 3:5){ cat("STAGE",stage,"\n",sep="\t") filter.str <- paste('[MZ]',stage,sep='') filter.cols <- grep(filter.str,colnames(riceRNA.pm)) filter.rows <- T pm.medpolishres <- get_medpolish_res(riceRNA.pm[filter.rows,filter.cols],ps.n=11) ##save(pm.medpolishres,file=paste('pm.medpolishres_stage',stage,'.RData',sep=''),compress=T) ##load(paste('pm.medpolishres_stage',stage,'.RData',sep='')) str(pm.medpolishres) cl <- factor(substring(colnames(pm.medpolishres),1,1)) design <- model.matrix(~0+cl);colnames(design) <- c('G1','G2');contrast.matrix <- makeContrasts(G2-G1,levels=design) fit <- do_limma(pm.medpolishres,design,contrast.matrix) fit <- eBayes(fit) save(fit,file=paste('fit_stage',stage,'.RData',sep=''),compress=T) show(summary(fit.data <- decideTests(fit,p=0.02))) ##save(fit.data,file=paste('fit.data_stage',stage,'.RData',sep=''),compress=T) table(filter <- fit.data!=0) mysfp <- data.frame(Probename=I(rownames(fit.data)),p.value=fit$p.value[,1],t=fit$t[,1],lfc=fit$coe[,1])[filter,] mysfp$Probeset <- sub('-.*','',mysfp$Probename) ##save(mysfp,file=paste('mysfp_stage',stage,'.RData',sep=''),compress=T) } str(mysfp <- mysfp[mysfp$Probeset%in%EXPprobesets,]) fit.data[!rownames(fit.data)%in%uniprobes] <- 0 summary(fit.data) ##How is the distribution of SFP per probeset? (tmp <- table(table(mysfp$Probeset))) sum(tmp);tmp[1]/sum(tmp) ################################################## ## To get the same numbers of 6,655 SFPs ################################################## fixnum <- 6655 for(stage in 3:5){ cat("STAGE",stage,"\n",sep="\t") load(paste('fit_stage',stage,'.RData',sep='')) str(my.pqvalue <- p.adjust(fit$p.value[,1],'fdr')) (thresh <-quantile(my.pqvalue, fixnum/length(my.pqvalue))) fit.data <- matrix(rep(0,length(my.pqvalue))) fit.data <- new("TestResults", array(0, dim(fit$t))) dimnames(fit.data) <- dimnames(fit$t) fit.data[my.pqvalue0] <- 1 fit.data[my.pqvalue0));mean(tmp)