library(affy)
library(limma)
library(siggenes)
library(genefilter)
setwd("/usr/local/data/supplements/sfp/data/yeast")
source("../SFP_function.R")
load("yeast_pmIndex.RData")
set.seed(123)
################################################################
library(limma)
filenames <- list.celfiles("CEL",full=T)
all.pm <- pm(normalize.AffyBatch.quantiles(bg.correct.rma(ReadAffy(filenames=filenames))))
colnames(all.pm) <- sub("(.*/)?(.*?)(.CEL)(.gz)?","\\2",colnames(all.pm))
rownames(all.pm) <- pmIndex$probename
save(all.pm,file="all.pm.RData",compress=T)

################################################################
table(table(pmIndex$probeset))
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##   28   27   18   23   25   25   21 1093   20   24   31   22   32   26   25 7835 
##   20   21   49   50   51 
##   46    5    1    4    4

str(probeset16 <-  names(which(table(pmIndex$probeset)==16)))
yeast.pm <- log(all.pm[match(pmIndex$probeset,probeset16,nomatch=0)!=0,],2)
rm(all.pm)
##save(yeast.pm,file="yeast.pm.RData",compress=T)

load("yeast.pm.RData")
system.time(pm.medpolishres <- get_medpolish_res(yeast.pm,ps.n=16))
##    user  system elapsed 
##   4.470   0.310   4.784 (seconds)
##save(pm.medpolishres,file="pm.medpolishres.RData",compress=T)
(cl <- factor(substring(colnames(pm.medpolishres),1,1)))
design <- model.matrix(~0+cl);colnames(design) <- sub('cl','',colnames(design));contrast.matrix <- makeContrasts(R-B,levels=design)
fit <- do_limma(pm.medpolishres,design,contrast.matrix)
##save(fit,file="fit.RData")
summary(fit.data <- decideTests(fit,p=0.001));table(filter <- fit.data!=0)
##save(fit.data,file="fit.data.RData")
##fixnum: to get the same number of SFPs
(fixnum <- sum(fit.data!=0))
str(mysfp <- data.frame(Probename=I(rownames(fit.data)),p.value=fit$p.value[,1],pq.value=p.adjust(fit$p.value[,1],"fdr"),t=fit$t[,1],lfc=fit$coe[,1])[filter,])
##write.table(mysfp,file="yeast_sfp.xls",row.name=F,quote=F,sep="\t")

###############To test Rostocks' method#################################################x
load("yeast.pm.RData")
library(limma)
tyeast.pm <- matrix(t(yeast.pm), nrow = ncol(yeast.pm) * 16, ncol = nrow(yeast.pm) / 16)
colnames(tyeast.pm) <- probeset16
fprobe <- factor(rep(paste("p",1:16,sep=""),each = ncol(yeast.pm)))
ftissue <- factor(rep(substr(colnames(yeast.pm),3,3), time=16))
fgenotype <- factor(rep(substr(colnames(yeast.pm),1,1), 16))
tmpdim <- dimnames(yeast.pm)
## Codes from Rostoks
# fit everything  except the SFP probe:genotype interaction
## set up full model as a test
i <- 1
#now record and remove the main effect of gene,tissue,genotype save residuals
res.mat <- matrix(nrow = nrow(tyeast.pm), ncol = ncol(tyeast.pm))
t1 <- proc.time()[3]
for (i in 1:ncol(tyeast.pm)){
# fit everything  except the SFP probe:genotype interaction
  fit <- summary(lm(tyeast.pm[,i] ~ fprobe + ftissue + fgenotype + ftissue:fgenotype))
  res.mat[,i] <- fit$residuals
  cat("\r",i)
}
t2 <- proc.time()[3]
(t2-t1)/60 ## 2.55 minutes

res.mat <- matrix(res.mat, nrow=length(tmpdim[[1]]),ncol=length(tmpdim[[2]]),dimnames=tmpdim,byrow=T)
rm(tmpdim)
##save(res.mat,file = "fit.residuals.RData", compres=T)

(cl <- factor(substring(colnames(res.mat),1,1)))
design <- model.matrix(~0+cl);colnames(design) <- sub('cl','',colnames(design));contrast.matrix <- makeContrasts(R-B,levels=design)
fit <- do_limma(res.mat,design,contrast.matrix)
##save(fit,file="fit_lm.RData")
summary(fit.data <- decideTests(fit,p=0.001))
str(my.pqvalue <- p.adjust(fit$p.value[,1],'fdr'))
(thresh <-quantile(my.pqvalue, fixnum/length(my.pqvalue)))
fit.data <- matrix(rep(0,length(my.pqvalue)))
fit.data <- new("TestResults", array(0, dim(fit$t)))
dimnames(fit.data) <- dimnames(fit$t)
fit.data[my.pqvalue<thresh&fit$t>0] <- 1
fit.data[my.pqvalue<thresh&fit$t<0] <- -1
print(summary(fit.data))


################################################################
load("fit.data.RData")
## yeast.snp <- read.csv("sfp.rm_080221.csv",as.is=T)[,1:2];
## mean(yeast.snp$SFP)## 116317
## save(yeast.snp,file="yeast.snp080221.RData")
load("yeast.snp080221.RData")##Polymorphism information in probe region
sfp.probes <- yeast.snp$Probename[yeast.snp$SFP>0]
nosfp.probes <- yeast.snp$Probename[yeast.snp$SFP<=0]
str(tmp <- union(sfp.probes,nosfp.probes))
str(tmp <- tmp[!tmp%in%intersect(sfp.probes,nosfp.probes)])
table(snp.geno <- sfp.check <- fit.data[rownames(fit.data)%in%tmp,])
snp.geno[snp.geno!=0] <- 0
snp.geno[names(snp.geno)%in%sfp.probes] <- -1
print(y <- table(snp.geno,sfp.check))
print(summary(y))
###residuals of median polish: RM > BY were ignored
cat("Sensitivity", sum(y[1,1])/sum(y[1,]), "\n" )
cat("False Negative Rate", 1-sum(y[1,1])/sum(y[1,]), "\n" )
cat("False Positive Rate", sum(y[2,1])/sum(y[2,]), "\n")
cat("False Sequence Polymorphism Rate", sum(y[2,1])/sum(y[,1]), "\n")
###residuals of median polish: RM > BY as FALSE discovery
cat("Sensitivity", sum(y[1,-2])/sum(y[1,]), "\n" )
cat("False Negative Rate", 1-sum(y[1,-2])/sum(y[1,]), "\n" )
cat("False Positive Rate", sum(y[2,-2])/sum(y[2,]), "\n")
cat("False Sequence Polymorphism Rate", sum(y[2,-2])/sum(y[,c(1,3)]), "\n")
################################################################

load("fit.data.RData")
load("pm.medpolishres.RData")

########The distribution of average absolute differences of residuals between two yeast strains BY and RM##################
x11(width = 7, height = 6.5, pointsize = 12);par(mfcol=c(1,1))
(cl <- factor(substring(colnames(pm.medpolishres),1,1)))
res.diff <- abs(rowMeans(pm.medpolishres[,cl=='R'])-rowMeans(pm.medpolishres[,cl=='B']))
tmpb <- seq(0,max(res.diff),length.out=100)
##hist(pmin(res.diff[fit.data==-1],8),tmpb,freq=F,xlim=c(0,8), ylim=c(0,0.7),main="",xlab=expression(group("|",italic(bar(E)[RM]) - italic(bar(E)[BY]) , "|")),col=8,border=0,ylab='Percent')
##hist(pmin(res.diff[fit.data==1],8),tmpb,freq=F,add=T,border="brown")
myplot.histogram(hist(pmin(res.diff[fit.data==-1],8),breaks = tmpb,plot=F),xlim=c(0,7),ylim=c(0,5),percent=T,main="",xlab=expression(group("|",italic(bar(E)[RM]) - italic(bar(E)[BY]) , "|")),col=8,border=0)##,ylab='Percent')
myplot.histogram(hist(pmin(res.diff[fit.data==1],8),tmpb,plot=F),percent=T,add=T,border="brown")
myusr <- par("usr");mylegend(myusr[1]+(myusr[2]-myusr[1])*0.6,myusr[3]+(myusr[4]-myusr[3])*0.9,legend=c(expression(italic(bar(E)[RM]) > italic(bar(E)[BY])),expression(italic(bar(E)[RM]) < italic(bar(E)[BY]))),fill=c(0,8),box.border=c("brown",0)) ##legend("topright",legend=c(expression(italic(bar(E)[RM]) > italic(bar(E)[BY])),expression(italic(bar(E)[RM]) < italic(bar(E)[BY]))),fill=c(0,8),inset = .15)

##dev.print(pdf,file="images/Figure2The Distribution of Average Absolute Differences_abs(RM-BY).pdf")
dev.off()

################################################################
## yeast.flsnp <- read.csv("flanksnp.rm_080221.csv",sep="\t",as.is=T);
## mean(yeast.flsnp$SNP01_10);mean(yeast.flsnp$SNP11_20)
## save(yeast.flsnp,file="yeast.flsnp080221.RData",compress=T)
load("yeast.flsnp080221.RData")
load("fit.RData")
table(is.na(sfporder <- match(yeast.flsnp$Probe,rownames(fit$p.value))))
str(mydata <- data.frame(Probename=yeast.flsnp$Probename,flSNP=yeast.flsnp$SNP01_10,intflSNP=yeast.flsnp$SNP11_20,p.value=fit$p.value[sfporder,1],pq.value=p.adjust(fit$p.value[,1],"fdr")[sfporder],t=fit$t[sfporder,1], stringsAsFactors = F))
##str(mydata <- mydata[mydata$t>0,])
sum(mydata$t>0);table(tmp <- mydata$flSNP[mydata$t>0]);mean(tmp);table(tmp <- mydata$intflSNP[mydata$t>0]);mean(tmp)
filter <- mydata$pq.val<0.05&mydata$t>0;table(tmp <- mydata$flSNP[filter]>0);mean(tmp);table(tmp <- mydata$intflSNP[filter]>0);mean(tmp)
## FALSE  TRUE 
##   745    74 
## [1] 0.09035409

## FALSE  TRUE 
##   775    44 
## [1] 0.05372405
##table(filter <- mydata$pq.val<0.05&mydata$t>0&(mydata$flSNP>0|mydata$intflSNP>0));
##write.csv(cbind(mydata[filter,-(2:3)],yeast.flsnp[match(mydata$Probename[filter],yeast.flsnp$Probename),-1]),file="yeast_flSNP_probes.csv", row.names = FALSE)
filter <- mydata$pq.val<0.001&mydata$t>0;table(tmp <- mydata$flSNP[filter]>0);mean(tmp);table(tmp <- mydata$intflSNP[filter]>0);mean(tmp)
## FALSE  TRUE 
##    73    13 
## [1] 0.1511628

## FALSE  TRUE 
##    85     1 
## [1] 0.01162791
##save(mydata,file="mydata080221.RData",compress=T)

##load("mydata080221.RData")

#########Figure 3 The fraction distribution of non-polymorphic probes with flanking SNPs in different statistical thresholds.##################
par(mfcol=c(1,2))
brs <- quantile(mydata$pq.value[mydata$t>0],seq(0.0,1,0.002))
##brs <- seq(0.0,1,0.001)
tmp <- cbind(brs,sapply(brs,function(x){mean(mydata$flSNP[mydata$pq.value<x&mydata$t>0]>0,na.rm=T)}))
##tmp <- tmp[is.finite(log(tmp[,2])),]
ulnum <- max(tmp[,2],na.rm=T)+0.1;llnum <- min(tmp[,2],na.rm=T)
ylim <- c(llnum,ulnum)##c(0,0.4)
plot(tmp,type='o',ylim=ylim,ylab="Fraction of SFPs with flanking SNPs",xlab="BH adjusted p-value",main="Flanking 10 bp",cex=0.5)
##title("Fraction of SFPs with SNPs in flanking 10 bp", cex.main= 0.75)
tmp <- cbind(brs,sapply(brs,function(x){mean(mydata$intflSNP[mydata$pq.value<x&mydata$t>0]>0,na.rm=T)}))
plot(tmp,type='o',ylim=ylim,ylab="Fraction of SFPs with flanking SNPs",xlab="BH adjusted p-value",main="Flanking 11-20 bp",cex=0.5)
##title("Fraction of SFPs with SNPs in flanking 11-20 bp", cex.main= 0.75)
##lines(tmp,type='o',ylim=ylim,ylab="Fraction of SFPs with flanking SNPs",xlab="BH adjusted p-value",main="Fraction of SFPs with SNPs in flanking 10-20 bp",cex=0.5,col=8)

##dev.print(pdf,file="images/yeast_flSNP_por.pdf",width=8, height=6)


##################################################
## Fisher exact test and Chi test
##################################################
load("fit.data.RData")
x<-table(mydata$flSNP[mydata$Probename%in%rownames(fit.data)[fit.data>0]]>0);p <- table(mydata$flSNP);p <- p/sum(p)
cbind(p,x,x/sum(x));(rchi <- chisq.test(x, p = p))
cat("Fold: ",(x[2]/sum(x))/p[2],"\n")
x<-table(mydata$intflSNP[mydata$Probename%in%rownames(fit.data)[fit.data>0]]>0);p <- table(mydata$intflSNP);p <- p/sum(p)
cbind(p,x,x/sum(x));(rchi <- chisq.test(x, p = p))
cat("Fold: ",(x[2]/sum(x))/p[2],"\n")

thread <- 0.001;x.t <- cbind(table(mydata$flSNP[mydata$pq.val>=thread&mydata$t>0]>0),table(mydata$flSNP[mydata$pq.val<thread&mydata$t>0]>0));dimnames(x.t) <- list(c('NoSNP','SNP'),paste(c('>','<'),thread,sep=''));x.t
fisher.test(x.t,alter='great')
x.t <- cbind(table(mydata$intflSNP[mydata$pq.val>=thread&mydata$t>0]>0),table(mydata$intflSNP[mydata$pq.val<thread&mydata$t>0]>0));dimnames(x.t) <- list(c('NoSNP','SNP'),paste(c('>','<'),thread,sep=''));x.t
fisher.test(x.t,alter='great')

##################################################
## Does the free energy influence probe-target hibridyzation?
##################################################
###Select sequence pairs with probe flanking SNPs
##Condition 1. SNPs in probe flanking 1-10 bases
nrow(flank.snp.subset <- yeast.flsnp[yeast.flsnp$SNP01_10>0&yeast.flsnp$SNPpos!='0'&nchar(yeast.flsnp$seg)==151,])
##Condition 2. Monomorphic in probe region
nrow(flank.snp.subset <- flank.snp.subset[substr(flank.snp.subset$seg_seq,26,50)==substr(flank.snp.subset$seq01_10,11,35)&substr(flank.snp.subset$seg_seq,26,50)==substr(flank.snp.subset$seg_seq,102,126),])
##Condition 3. Just SNP, IN/DEL may influence the caculation of the changes of free energy, for the length of sequence paire is not the same
nrow(flank.snp.subset <- flank.snp.subset[-grep('[^ATGC@]',flank.snp.subset$seg),])
##save(flank.snp.subset,file="flank.snp.subset.RData",compress=T)
########Mfold package######cRNA##############
library(Biostrings)
file.BY <- "Flanking_75bp_BY_cRNA.fas";file.RM <- "Flanking_75bp_RM_cRNA.fas"
write.table(paste('>',flank.snp.subset$Probename,"\n",sapply(substr(flank.snp.subset$seg_seq,0,75),function(x){as.character(reverseComplement(DNAString(x)))}),sep=''),file=file.BY,row.name=F,col.name=F,sep="\t",quote=F);write.table(paste('>',flank.snp.subset$Probename,"\n",sapply(substr(flank.snp.subset$seg_seq,77,1000),function(x){as.character(reverseComplement(DNAString(x)))}),sep=''),file=file.RM,row.name=F,col.name=F,sep="\t",quote=F)
system(paste("hybrid-ss-min -E -n RNA -t 45 -T 45 ",file.BY,'|wc -l',sep=""));system(paste("hybrid-ss-min -E -n RNA -t 45 -T 45 ",file.RM,'|wc -l',sep=""))
dG.BY.cRNA <- read.table(paste(file.BY,".dG",sep=''),as.is=T,header=F)[,2];dG.RM.cRNA <- read.table(paste(file.RM,".dG",sep=''),as.is=T,header=F)[,2];ddG.cRNA <- dG.RM.cRNA-dG.BY.cRNA;
table(ddG.cRNA[ddG.cRNA!=0]<0);mean(ddG.cRNA[ddG.cRNA!=0]<0)
########Construct data frame##################
load("mydata080221.RData")
table(is.na(sfporder <- match(flank.snp.subset$Probe,mydata$Probename)))
free_energy <- cbind(mydata[sfporder,],BY.cRNA.dG=dG.BY.cRNA,RM.cRNA.dG=dG.RM.cRNA,ddG.cRNA=ddG.cRNA)
free_energy <- na.omit(free_energy)
##save(free_energy,file="free_energy.RData",compress=T)
load("free_energy.RData")
table(tmp <- free_energy$ddG.cRNA!=0);mean(tmp)
table(filter <- free_energy$pq.value<0.05&free_energy$t!=0&free_energy$ddG.cRNA!=0)
table(free_energy$ddG.cRNA[filter]<0);mean(free_energy$ddG.cRNA[filter]<0)

###############Additional file 2: The distribution of Delta_G_(RM- BY) indicated the influence of minimum free energy of RNA towards the binding affinity##############
x11(width = 7, height = 7, pointsize = 12)
par(mfcol=c(2,1),mar=c(4.5,4,2,2),xpd=F,font.main=1,cex.main=1,font.lab=1,cex.lab=1)
tmpbreaks <- hist(free_energy$ddG.cRNA,100,plot=F);myplot.histogram(tmpbreaks,xlab=expression(italic(paste(Delta,'G'))(kJ)),percent=T,col=8,border=0,ylim=c(0,3.5),main=expression(paste("The distribution of ",italic(paste(Delta,'G'[RM - BY],sep='')),sep='')));abline(v=c(1,-1)*quantile(abs(free_energy$ddG.cRNA),0.5),col=8,lty=2)
myplot.histogram(hist(free_energy$ddG.cRNA[free_energy$t>0&free_energy$pq.value<0.5],tmpbreaks$breaks,plot=F),percent=T,add=T,border=2)
myusr <- par("usr");legend(myusr[1]+(myusr[2]-myusr[1])*0.7,myusr[3]+(myusr[4]-myusr[3])*0.9,legend=c('All',expression(italic(bar(E)[RM]) > italic(bar(E)[BY]))),fill=c(8,2))
myusr <- par("usr");text(myusr[1]-(myusr[2]-myusr[1])*0.1,myusr[3]+(myusr[4]-myusr[3])*1.05, labels = '(a)',cex=1,xpd=T)
tmpbreaks <- hist(free_energy$ddG.cRNA,100,plot=F);myplot.histogram(tmpbreaks,xlab=expression(italic(paste(Delta,'G'))(kJ)),percent=T,col=8,border=0,ylim=c(0,3.5),main='');abline(v=c(1,-1)*quantile(abs(free_energy$ddG.cRNA),0.5),col=8,lty=2)
myplot.histogram(hist(free_energy$ddG.cRNA[free_energy$t<0&free_energy$pq.value<0.5],tmpbreaks$breaks,plot=F),percent=T,add=T,border=3)
myusr <- par("usr");legend(myusr[1]+(myusr[2]-myusr[1])*0.7,myusr[3]+(myusr[4]-myusr[3])*0.9,legend=c('All',expression(italic(bar(E)[RM]) < italic(bar(E)[BY]))),fill=c(8,3))
myusr <- par("usr");text(myusr[1]-(myusr[2]-myusr[1])*0.1,myusr[3]+(myusr[4]-myusr[3])*1.05, labels = '(b)',cex=1,xpd=T)
##dev.print(pdf,file="images/SFigure2_dist_free_energy.pdf")
dev.off()

##nrow(tmp.free_energy <- free_energy[free_energy$ddG.cRNA!=0,])

##################Figure 5: The fraction distribution of the free energy changes in non-polymorphic probes with flanking SNPs under different statistical threholds###############
x11(width = 7, height = 7, pointsize = 12)
par(mfrow=c(1,1),xpd=F,font.main=1,cex.main=1,font.lab=1,cex.lab=1)
#####Select threshold to filter not-changed or less-changed sequence pairs
(fth <- quantile(abs(free_energy$ddG.cRNA),0.5))##sd(free_energy$ddG.cRNA)
brs <- quantile(free_energy$pq.value,seq(0.02,1,0.005))##sort(c(10^-seq(10,8,-0.5),quantile(mydata$pq.value,seq(0,1,0.01),na.rm=T)))
##seq(0,1,0.02)brs <- seq(0,1,0.001)
tmp <- cbind(brs,sapply(brs,function(x){mean(free_energy$ddG.cRNA[free_energy$pq.val<x&free_energy$t>0&abs(free_energy$ddG.cRNA)>fth]>0,na.rm=T)}))
ulnum <- max(tmp[,2],na.rm=T)+0.1;llnum <- min(tmp[,2],na.rm=T)
plot(tmp,type='o',ylim=c(0.2,0.8),ylab=expression(paste("Fraction of sequence pairs with ",italic(paste(Delta,'G'[RM - BY],sep=''))>0,sep="")),xlab="BH adjusted p-value",main="",cex=0.5,col="brown")##Sequence pairs of which FLSNPs increased their minimum RNA free energy"
tmp <- cbind(brs,sapply(brs,function(x){mean(free_energy$ddG.cRNA[free_energy$pq.val<x&free_energy$t<0&abs(free_energy$ddG.cRNA)>fth]>0,na.rm=T)}))
tmp <- tmp[is.finite(log(tmp[,2])),]
lines(tmp,type='o',col='black',cex=0.5)
abline(h=0.5,col=8,lty=2)
myusr <- par("usr");legend(myusr[1]+(myusr[2]-myusr[1])*0.6,myusr[3]+(myusr[4]-myusr[3])*0.9,legend=c(expression(italic(bar(E)[RM]) > italic(bar(E)[BY])),expression(italic(bar(E)[RM]) < italic(bar(E)[BY]))),fill=c('brown','black'))

##dev.print(pdf,file="images/Figure5FLSFPs_free_energy.pdf")
dev.off()

################
##Fisher exact test
str(tmp.free_energy <- free_energy[abs(free_energy$ddG.cRNA)>fth,])
thread <- 0.05

##########Group: RM > BY
filter.t <- tmp.free_energy$t>0
table(filter.energy <- tmp.free_energy$ddG.cRNA[filter.t]>0)###free energy RM > BY
table(filter.thread <- tmp.free_energy$pq.value[filter.t]<thread)###SFP detection threshold < 0.05
(x.t <- table(filter.energy,filter.thread))
colnames(x.t) <- paste(c('>','<'),thread,sep='');rownames(x.t) <- c('D<0','D>0');x.t
fisher.test(x.t,alter='great')
##              filter.thread
## filter.energy >0.05 <0.05
##           D<0   472    11
##           D>0   541    29
## 	Fisher's Exact Test for Count Data
## data:  x.t 
## p-value = 0.01221
## alternative hypothesis: true odds ratio is greater than 1 
## 95 percent confidence interval:
##  1.220885      Inf 
## sample estimates:
## odds ratio 
##   2.298326 

##########Group: RM < BY
filter.t <- tmp.free_energy$t<0
table(filter.energy <- tmp.free_energy$ddG.cRNA[filter.t]>0)###free energy RM > BY
table(filter.thread <- tmp.free_energy$pq.value[filter.t]<thread)###SFP detection threshold < 0.05
(x.t <- table(filter.energy,filter.thread))
colnames(x.t) <- paste(c('>','<'),thread,sep='');rownames(x.t) <- c('D<0','D>0');x.t
fisher.test(x.t,alter='less')
##              filter.thread
## filter.energy >0.05 <0.05
##           D<0   351     5
##           D>0   308     2
## 	Fisher's Exact Test for Count Data
## data:  x.t 
## p-value = 0.2859
## alternative hypothesis: true odds ratio is less than 1 
## 95 percent confidence interval:
##  0.000000 2.219191 
## sample estimates:
## odds ratio 
##  0.4563597 


###Chi test
filter.t <- tmp.free_energy$t>0;p <- table(tmp.free_energy$ddG.cRNA[filter.t]>0);p <- p/sum(p);x<-table(tmp.free_energy$ddG.cRNA[tmp.free_energy$pq.value<thread&filter.t]>0);cbind(p,x,x/sum(x))
(rchi <- chisq.test(x, p = p))
##X-squared = 5.4358, df = 1, p-value = 0.01973
filter.t <- tmp.free_energy$t<0;p <- table(tmp.free_energy$ddG.cRNA[filter.t]>0);p <- p/sum(p);x<-table(tmp.free_energy$ddG.cRNA[tmp.free_energy$pq.value<thread&filter.t]>0);cbind(p,x,x/sum(x))
(rchi <- chisq.test(x, p = p))
##X-squared = 0.909, df = 1, p-value = 0.3404

##################################################
## nucleotide composition
##################################################
load("mydata080221.RData")
load("flank.snp.subset.RData")
str(tmp <- flank.snp.subset$Probename[flank.snp.subset$basepaire!='0'])
##Condition: probes with unique SNP in flanking 1-10 bases region
str(tmp <- intersect(flank.snp.subset$Probename[-grep('\\|[0-9\\-]',flank.snp.subset$SNPpos)],tmp))
str(flSNP.basepaire <- data.frame(mydata[match(tmp,mydata$Probename),],flank.snp.subset[match(tmp,flank.snp.subset$Probename),c('basepaire','rmbase')]))
str(flSNP.basepaire <- na.omit(flSNP.basepaire))


##(tmp <- table(flSNP.basepaire$base))
##a <- table(flSNP.basepaire$base[flSNP.basepaire$p.val<0.01&flSNP.basepaire$t>0]);a <- a[match(names(tmp),names(a))];names(a) <- names(tmp);a*100/tmp;a <- table(flSNP.basepaire$base[flSNP.basepaire$p.val<0.01&flSNP.basepaire$t<0]);a <- a[match(names(tmp),names(a))];names(a) <- names(tmp);a*100/tmp

#############Figure 4: The fraction distribution of nucleotide content of SNPs flanking non-polymorphic probes under different statisitcal thresholds.##############
x11(width = 7, height = 5.8, pointsize = 12)
par(mfrow=c(2,2),xpd=F,mar=c(4.1,4.1,2.1,2.1),font.main=1,cex.main=1)
brs <- quantile(flSNP.basepaire$pq.value,seq(0,1,0.01))##sort(c(10^-seq(10,8,-0.5),quantile(mydata$pq.value,seq(0,1,0.01),na.rm=T)))
nuc.f <- function(ppair='',tlab='',ylab="Fraction",xlab="BH adjusted p-value",main=ppair){
  tmp <- cbind(brs,sapply(brs,function(x){mean(flSNP.basepaire$rmbase[flSNP.basepaire$pq.val<x&flSNP.basepaire$t>0]==ppair,na.rm=T)}))
  ulnum <- max(tmp[,2],na.rm=T)+0.1;llnum <- min(tmp[,2],na.rm=T)
  plot(tmp,type='o',ylim=c(0.0,0.5),ylab=ylab,xlab=xlab,main=main,cex=0.5,col="brown")
  tmp <- cbind(brs,sapply(brs,function(x){mean(flSNP.basepaire$rmbase[flSNP.basepaire$pq.val<x&flSNP.basepaire$t<0]==ppair,na.rm=T)}))
  tmp <- tmp[is.finite(log(tmp[,2])),]
  lines(tmp,type='o',col="black",cex=0.5)
  abline(h=mean(flSNP.basepaire$rmbase==ppair,na.rm=T),col=8,lty=2)
  myusr <- par("usr");text(myusr[1]-(myusr[2]-myusr[1])*0.15,myusr[3]+(myusr[4]-myusr[3])*1.1, labels = tlab,cex=1,xpd=T)
}
##for(ppair in sort(unique(flSNP.basepaire$rmbase))){nuc.f(ppair)}
nuc.f(ppair='A',tlab='(a)',xlab='',main="Adenine");nuc.f(ppair='G',tlab='(b)',ylab='',xlab='',main="Guanine");nuc.f(ppair='C',tlab='(c)',main="Cytosine");nuc.f(ppair='T',tlab='(d)',ylab='',main="Thymine");

##myusr <- par("usr");legend(myusr[1]+(myusr[2]-myusr[1])*0.4,myusr[3]+(myusr[4]-myusr[3])*0.95,legend=c(expression(italic(bar(E)[RM]) > italic(bar(E)[BY])),expression(italic(bar(E)[RM]) < italic(bar(E)[BY]))),fill=c(1,"brown"))

dev.print(pdf,file="images/Figure4FLSFPs_base_dist.pdf")##,width = 7, height = 5.8)
dev.off()


################
##Fisher exact test
thread <- 0.05;x.t <- cbind(table(flSNP.basepaire$rmbase[flSNP.basepaire$pq.val>=thread&flSNP.basepaire$t!=0]),table(flSNP.basepaire$rmbase[flSNP.basepaire$pq.val<thread&flSNP.basepaire$t!=0]));colnames(x.t) <- paste(c('>','<'),thread,sep='');x.t
##   >0.05 <0.05
## A   791    29
## C   804    26
## G   784    20
## T   821    21
filter.t <- flSNP.basepaire$t<0;p <- table(flSNP.basepaire$rmbase[filter.t]);p <- p/sum(p);x<-sapply(names(p),function(y){sum(flSNP.basepaire$rmbase[flSNP.basepaire$pq.val<thread&filter.t]==y)});cbind(p,x)
##           p  x
## A 0.2284186  2
## C 0.2666157 11
## G 0.2627960 10
## T 0.2421696  4
(rchi <- chisq.test(x, p = p))
##X-squared = 6.9978, df = 3, p-value = 0.07197
filter.t <- flSNP.basepaire$t>0;p <- table(flSNP.basepaire$rmbase[filter.t]);p <- p/sum(p);x<-sapply(names(p),function(y){sum(flSNP.basepaire$rmbase[flSNP.basepaire$pq.val<thread&filter.t]==y)});cbind(p,x)
##           p  x
## A 0.2622043 27
## C 0.2420735 15
## G 0.2315048 10
## T 0.2642174 17
(rchi <- chisq.test(x, p = p))
##X-squared = 6.8768, df = 3, p-value = 0.07593


##################################################
## Wed Aug 13 18:12:14 2008 Weibo Xie
## To compare with PDNN method using ronald's check data
##################################################
str(ronald.data <- read.table('ronald_data.txt',header=T,stringsAsFactors=F))
## 'data.frame':	30695 obs. of  2 variables:
##  $ Probename: chr  "10000_at--102--121" "10001_at--105--121" "10001_at--110--121" "10001_at--111--121" ...
##  $ SNP      : int  1 0 0 0 0 0 0 0 0 0 ...
str(filenames <- list.celfiles("CEL",pattern='D',full=T))
all.pm <- pm(normalize.AffyBatch.quantiles(bg.correct.rma(ReadAffy(filenames=filenames))))
colnames(all.pm) <- sub("(.*/)?(.*?)(.CEL)(.gz)?","\\2",colnames(all.pm))
load("yeast_pmIndex.RData")
rownames(all.pm) <- pmIndex$probename
str(probeset16 <-  names(which(table(pmIndex$probeset)==16)))
yeast.pm <- log(all.pm[match(pmIndex$probeset,probeset16,nomatch=0)!=0,],2)
opm.medpolishres <- get_medpolish_res(yeast.pm,ps.n=16)

##load("pm.medpolishres.RData")
str(pm.medpolishres <- opm.medpolishres[rownames(opm.medpolishres)%in%ronald.data$Probe,grep('D',colnames(opm.medpolishres))])
(cl <- factor(substring(colnames(pm.medpolishres),1,1)))
design <- model.matrix(~0+cl);colnames(design) <- sub('cl','',colnames(design));contrast.matrix <- makeContrasts(R-B,levels=design)
fit <- do_limma(pm.medpolishres,design,contrast.matrix)
(fixnum <- summary(fit.data <- decideTests(fit,p=0.05)));table(filter <- fit.data!=0)

fit.pqvalue <- p.adjust(fit$p.value[,1],'fdr')
(thresh <-quantile(fit.pqvalue[fit$t<0], 1049/length(fit.pqvalue[fit$t<0])))
fit.data <- matrix(rep(0,length(fit.pqvalue)))
fit.data <- new("TestResults", array(0, dim(fit$t)))
dimnames(fit.data) <- dimnames(fit$t)
fit.data[fit.pqvalue<thresh&fit$t>0] <- 1
fit.data[fit.pqvalue<thresh&fit$t<0] <- -1
table(fit.data)
## fit.data
##    -1     0     1 
##  1049 29443    92 


str(sfp.probes <- na.omit(ronald.data$Probe[ronald.data$SNP!=0]))
str(nosfp.probes <- na.omit(ronald.data$Probe[ronald.data$SNP==0]))
str(tmp <- union(sfp.probes,nosfp.probes))
table(snp.geno <- sfp.check <- fit.data[rownames(fit.data)%in%tmp,])
snp.geno[snp.geno!=0] <- 0
snp.geno[names(snp.geno)%in%sfp.probes] <- -1
print(y <- table(snp.geno,sfp.check))
##         sfp.check
## snp.geno    -1     0     1
##       -1  1007  1154     3
##       0     26 28078    83
print(summary(y))
cat("Sensitivity", sum(y[1,1])/sum(y[1,]), "\n" )
cat("False Negative Rate", 1-sum(y[1,1])/sum(y[1,]), "\n" )
cat("False Positive Rate", sum(y[2,1])/sum(y[2,]), "\n")
cat("False Sequence Polymorphism Rate", sum(y[2,1])/sum(y[,1]), "\n")
## Sensitivity 0.465342 
## False Negative Rate 0.534658 
## False Positive Rate 0.000922411 
## False Sequence Polymorphism Rate 0.02516941