##################################################
## Author: Weibo Xie <xwbmail@gmail.com>
## Date: Tue Aug  7 21:50:31 2007
##
##
## Notes: Part of codes modified from Barley SFP Discovery Experiments by Nils Rostoks et al. <http://naturalsystems.uchicago.edu/naturalvariation/barley/barleySFP.R>
## The details of denomination of .CEL files in the CEL directory were stated in the 'readme.txt' file under the directory.
##################################################

setwd("/usr/local/data/supplements/sfp/data/rice")
source("../SFP_function.R")
library(affy)
filenames <- sort(list.celfiles("CEL/",pattern='[MZ][345]',full.names=TRUE))

##################################################
## How many probe pairs which the signals of mm probes are larger than pm probes?
##################################################
tmp.probes <-  read.probematrix(filenames=sample(filenames,10),which="both")
range(colMeans(tmp.probes$pm>tmp.probes$mm))
##[1] 0.6564654 0.6849616

##################################################
## What is the distribution of probes per probesets? 
##################################################
load("pmIndex.RData");load("EXPprobesets.RData")
table(table(pmIndex$probeset))
## 
##     8     9    10    11    16    20 
##    71    67    64 57141    14    24 
str(probeset11 <- grep("^Os", names(which(table(pmIndex$probeset)==11)),value=T))
str(probeset11 <- intersect(probeset11,EXPprobesets))
##save(probeset11,file="probeset11.RData",compress=T)

##################################################
## To construct intensities matrix included all of probesets with 11 probes
##################################################
load("all.pm.RData")
riceRNA.pm <- log(all.pm[match(pmIndex$probeset,probeset11,nomatch=0)!=0,],2) # log base2
dd <- as.dist((1 - cor(riceRNA.pm)/2));plot(hclust(dd),main="Normalized PM Intensity",xlab="Chips")
dev.print(pdf,file="images/hclust_by_cor_riceRNA_pm_int.pdf")
rm(all.pm)
gc()
riceRNA.pm <- riceRNA.pm[,order(colnames(riceRNA.pm))]
##save(riceRNA.pm,file= "riceRNApm.RData", compress =T)

##################################################
## To do median polish and get residuals
##################################################
load("riceRNApm.RData")
system.time(pm.medpolishres <- get_medpolish_res(riceRNA.pm,ps.n=11)) ##1 minute
##save(pm.medpolishres,file="pm.medpolishres.RData",compress=T)

##################################################
## To identify SFPs from residuals of median polish
##################################################
load("pm.medpolishres.RData")
library(limma)
t1 <- proc.time()[3]
(cl <- factor(substring(colnames(pm.medpolishres),1,1)))
design <- model.matrix(~0+cl);colnames(design) <- sub('cl','',colnames(design));contrast.matrix <- makeContrasts(Z-M,levels=design)
fit <- lmFit(pm.medpolishres,design)
fit <- contrasts.fit(fit,contrast.matrix)
fit <- eBayes(fit)
summary(fit.data <- decideTests(fit,p=1e-3))
t2 <- proc.time()[3]
(t2-t1)/60 ## 48.6 seconds
##save(fit,file="fit.RData",compress=T)
##save(fit.data,file='fit.data.RData',compress=T)

table(filter <- fit.data!=0)
##  FALSE   TRUE 
## 215765   6655 
(fixnum <- sum(fit.data!=0))
fit.data.mp <- fit.data

str(mysfp <- data.frame(Probename=I(rownames(fit.data)),p.value=fit$p.value[,1],pq.value=p.adjust(fit$p.value[,1],"fdr"),t=fit$t[,1],lfc=fit$coe[,1])[filter,])
##write.table(mysfp,file="rice_sfp.xls",row.name=F,quote=F,sep="\t")
mysfp$Probeset <- sub("--.*",'',mysfp$Probename)
##How is the distribution of SFP per probeset?
(tmp <- table(table(mysfp$Probeset)))
##    1    2    3    4    5    6    7    8    9   10   11 
## 1772  622  267  147   92   65   47   52   25   32   10 
sum(tmp);tmp[1]/sum(tmp);sum(tmp[-(1:5)])/sum(tmp);
## [1] 3131
##         1 
## 0.5659534 
## [1] 0.07377835
##Does tt seem that SFPs in the same probeset tend to be the same genotype?
str(tmp <- intersect(mysfp$Probeset[mysfp$t>0],mysfp$Probeset[mysfp$t<0]))
length(tmp)/sum(table(table(mysfp$Probeset))[-1])
## [1] 0.6637233  ##Cann't find this trend. Need more analysis

##################################################
## Try SAM?
##################################################
require(siggenes)
cl <- factor(substring(colnames(pm.medpolishres),1,1))
system.time(sam.out <- sam(pm.medpolishres,cl,rand = 123,B=500))##50 minutes
##     user   system  elapsed 
##  3046.08   467.20  3618.69 
myq.value <- sam.out@q.value
(thresh <- quantile(myq.value, fixnum/length(myq.value) ))
fit.data <- matrix(rep(0,length(myq.value)))
fit.data <- new("TestResults", array(0, c(length(myq.value),1)))
rownames(fit.data) <- names(myq.value)
fit.data[myq.value<thresh&sam.out@d>0] <- 1
fit.data[myq.value<thresh&sam.out@d<0] <- -1
table(fit.data)
save(fit.data,file="fit.data.sam.RData",compress=T)
fit.data.sam <- fit.data
table(fit.data.mp,fit.data.sam)
##            fit.data.sam
## fit.data.mp     -1      0      1
##          -1   3244     31      0
##          0      23 215711     31
##          1       0     23   3357
table(tmp <- fit.data.mp[fit.data.sam!=0]!=0);mean(tmp) ##More than 99.2% of SFPs detected by SAM are the same as limma
## FALSE  TRUE 
##    54  6601 
## [1] 0.9918858
##################################################
## To test Rostocks' method
##################################################
load("riceRNApm.RData")
load("probeset11.RData")
rice.pm <- matrix(t(riceRNA.pm), nrow = ncol(riceRNA.pm) * 11, ncol = nrow(riceRNA.pm) / 11)
colnames(rice.pm) <- probeset11
fprobe <- factor(rep(paste("p",1:11,sep=""),each = ncol(riceRNA.pm)))
ftissue <- factor(rep(substr(colnames(riceRNA.pm),2,2), time=11))
fgenotype <- factor(rep(substr(colnames(riceRNA.pm),1,1), 11))
tmpdim <- dimnames(riceRNA.pm)
rm(riceRNA.pm);gc()
## Codes from Rostoks
# fit everything  except the SFP probe:genotype interaction
## set up full model as a test
i <- 1
#now record and remove the main effect of gene,tissue,genotype save residuals
res.mat <- matrix(nrow = nrow(rice.pm), ncol = ncol(rice.pm))
t1 <- proc.time()[3]
for (i in 1:ncol(rice.pm)){
# fit everything  except the SFP probe:genotype interaction
  fit <- summary(lm(rice.pm[,i] ~ fprobe + ftissue + fgenotype + ftissue:fgenotype))
  res.mat[,i] <- fit$residuals
  cat("\r",i)
}
t2 <- proc.time()[3]
(t2-t1)/60 ## 21 minutes

res.mat <- matrix(res.mat, nrow=length(tmpdim[[1]]),ncol=length(tmpdim[[2]]),dimnames=tmpdim,byrow=T)
rm(tmpdim)
##save(res.mat,file = "rice.residuals.RData", compress = T)

load("rice.residuals.RData") ## library(siggenes) # load SAM package
## cl <- colnames(res.mat) # model matrix
## system.time(sam.out <-sam(res.mat,cl,rand = 0329,B=500, p0 = .95))# 500 permutations to get a good estimate of 1% or lower FDR ## summary(sam.out,1:5/2)
library(limma)
(cl <- factor(substring(colnames(res.mat),1,1)))
design <- model.matrix(~0+cl);colnames(design) <-c('G1','G2');contrast.matrix <- makeContrasts(G2-G1,levels=design)
fit <- lmFit(res.mat,design)
fit <- contrasts.fit(fit,contrast.matrix)
fit <- eBayes(fit)
##save(fit,file="fit.rostoks.RData",compress=T)
summary(fit.data <- decideTests(fit,p=1e-4))
##save(fit.data,file="fit.data.rostoks.RData",compress=T)
str(rostoks.pqvalue <- p.adjust(fit$p.value[,1],'fdr'))
(thresh <-quantile(rostoks.pqvalue, fixnum/length(rostoks.pqvalue)))
fit.data <- matrix(rep(0,length(rostoks.pqvalue)))
fit.data <- new("TestResults", array(0, dim(fit$t)))
dimnames(fit.data) <- dimnames(fit$t)
fit.data[rostoks.pqvalue<thresh&fit$t>0] <- 1
fit.data[rostoks.pqvalue<thresh&fit$t<0] <- -1
table(fit.data)
##save(fit.data,file="fit.data.rostoks_by_threshold.RData",compress=T)

#########################
load("fit.data.RData")
fit.data.mp <- fit.data
load("fit.data.rostoks_by_threshold.RData")
fit.data.lm <- fit.data
table(fit.data.mp,fit.data.lm)
##            fit.data.lm
## fit.data.mp     -1      0      1
##          -1   2419    854      2
##          0     987 214047    731
##          1       5    864   2511
table(tmp <- rownames(fit.data.mp)[fit.data.mp!=0]%in%rownames(fit.data.lm)[fit.data.lm!=0]);mean(tmp)
## FALSE  TRUE 
##  1718  4937 
## [1] 0.7418482
table(tmp <- rownames(fit.data.lm)[fit.data.lm!=0]%in%rownames(fit.data.mp)[fit.data.mp!=0]);mean(tmp)
fit.data[fit.data.mp!=fit.data.lm] <- NA

##################################################
## To validate SFPs by using oryzaSNP data
##################################################
load("fit.data.RData")
load("uniprobes.RData")
load("sfp.probes.RData")
load("nosfp.probes.RData")
load("EXPprobesets.RData")
str(tmp <- unique(c(nosfp.probes,sfp.probes.mh,sfp.probes.zs)))
str(tmp <- tmp[!tmp%in%intersect(sfp.probes.mh,sfp.probes.zs)]) ##Both targets of MH and ZS have mismatch thus cannot be detected.
str(tmp <- intersect(tmp,uniprobes))
mean(tmp%in%c(sfp.probes.mh,sfp.probes.zs))
table(snp.geno <- sfp.check <- fit.data[rownames(fit.data)%in%tmp,])
snp.geno[snp.geno!=0] <- 0
snp.geno[names(snp.geno)%in%sfp.probes.mh] <- 1
snp.geno[names(snp.geno)%in%sfp.probes.zs] <- -1
##save(snp.geno,file="snp.geno.RData",compress=T)
################

##load("snp.geno.RData")

sfp.check <-  fit.data[match(names(snp.geno),rownames(fit.data)),]
print(y <- table(snp.geno,sfp.check))
print(summary(y))
cat("Sensitivity", sum(c(y[c(1,3),-2]))/sum(c(y[c(1,3),])), "\n" )
cat("False Negative Rate", 1-sum(c(y[c(1,3),-2]))/sum(c(y[c(1,3),])), "\n") 
cat("False positive Rate", sum(c(y[2,-2]))/sum(c(y[2,])), "\n")
cat("False sequence polymorphism Rate", sum(c(y[2,-2]))/sum(c(y[,c(1,3)])), "\n")

str(nosnp.sfp.probes <- names(snp.geno)[snp.geno==0&snp.geno!=sfp.check])
str(snp.sfp.probes <- names(snp.geno)[snp.geno!=0&snp.geno==sfp.check])
##save(nosnp.sfp.probes,file="nosnp.sfp.probes.RData",compress=T)
##save(snp.sfp.probes,file="snp.sfp.probes.RData",compress=T)
load("fit.data.rostoks_by_threshold.RData")
fit.data.lm <- fit.data
table(names(snp.geno)[snp.geno==0&sfp.check==-1]%in%rownames(fit.data)[fit.data==-1])
table(names(snp.geno)[snp.geno==0&sfp.check==1]%in%rownames(fit.data)[fit.data==1])
table(x <- names(snp.geno)[snp.geno==0&sfp.check!=0]%in%rownames(fit.data)[fit.data!=0]);mean(x)
table(x <- nosnp.sfp.probes%in%rownames(fit.data)[fit.data!=0]);mean(x)
## FALSE  TRUE 
##    54    85 
## [1] 0.6115108
##################################################
## the distribution of Residuals
##################################################
load("pm.medpolishres.RData")
load("rice.residuals.RData")
load("fit.data.RData")
hist(tmp <- rowSums(abs(pm.medpolishres)),100)##hist(rowSums(pm.medpolishres),100)
dd <- as.dist((1 - cor(pm.medpolishres[tmp>quantile(tmp,0.9),])/2));plot(hclust(dd),main="Residuals of Median Polish",xlab="Chips")
##dev.print(pdf,file="images/hclust_by_cor_pm.medpolishres_int.pdf")

(cl <- factor(substring(colnames(pm.medpolishres),1,1)))
res.diff1 <- abs(rowMeans(pm.medpolishres[,cl=='M'])-rowMeans(pm.medpolishres[,cl=='Z']))
res.diff2 <- abs(rowMeans(res.mat[,cl=='M'])-rowMeans(res.mat[,cl=='Z']))

tmpb <- seq(0,10,0.01)
hist(pmin(res.diff1,10),tmpb,freq=T,xlim=c(0,1),main="The Distribution of Average Absolute Differences",xlab="Average Absolute Differences Between MH and ZS",col=8)
hist(pmin(res.diff2,10),tmpb,freq=T,add=T,border=2)
legend(locator(1),legend=c('Median polish','Fitting linear model'),fill=c(8,2))
dev.print(pdf,file="images/The Distribution of Average Absolute Differences_MP_vs_LN.pdf")

################What is the distribution of non-polymorphism genotype's signal intensities of SFP probes
##str(subset <- names(sfp.check)[snp.geno==sfp.check&sfp.check==1])
##hist(rowMedians(riceRNA.pm[match(subset,rownames(riceRNA.pm)),grep('^Z',colnames(riceRNA.pm))]),100)
################Conclusion: almost larger than 4.

##################################################
## To make figures for article
##################################################
load("pmIndex.RData")
load("riceRNApm.RData")
source("../SFP_function.R")

x11(width = 7, height = 5.8, pointsize = 12);par(font.main=1,cex.main=1,font.lab=1,cex.lab=1)

load("mysfp.RData")
(tmp <- table(table(sub("at.*",'at',mysfp$Probe))))
tmp/sum(tmp)
par(mfrow=c(1,1))
par(xpd=T)
text(cbind(barplot(tmp <- table(table(sub("at.*",'at',mysfp$Probe))),ylim=c(0,3300),density=30,col=c("lightblue"),xlab="SFPs per ProbeSet",ylab="Number of ProbeSets"),tmp+60),as.character(tmp))#number of SFPs per ProbeSet
dev.print(pdf,file="images/sfp_per_Probeset.pdf")
text(cbind(barplot({tmp <- table(sub(".*at",'',rownames(pmIndex[match(mysfp$Probe,as.character(pmIndex$probename),nomatch=0),])));tmp <- tmp[order(as.numeric(names(tmp)))]},ylim=c(0,1000),density=30,col=c("lightblue"),xlab="Probe Order",ylab="Number of SFPs"),tmp+50),as.character(tmp))
dev.print(pdf,file="images/sfp_Probe_Order.pdf")


##################################################
## Does flanking SNP influence the hybridization of cRNA?
##################################################
load("oryzaSNP.raw.RData")
flstart <- 0;flend <- -9;str(flsfp.probes <- unique(oryzaSNP.raw$Probe_name[((oryzaSNP.raw$Pos_forward<=flstart&oryzaSNP.raw$Pos_forward>=flend)|(oryzaSNP.raw$Pos_backward<=flstart&oryzaSNP.raw$Pos_backward>=flend))&oryzaSNP.raw$MH63!=oryzaSNP.raw$Zhen_Shan_97]))
flstart <- -10;flend <- -19;str(intflsfp.probes <- unique(oryzaSNP.raw$Probe_name[((oryzaSNP.raw$Pos_forward<=flstart&oryzaSNP.raw$Pos_forward>=flend)|(oryzaSNP.raw$Pos_backward<=flstart&oryzaSNP.raw$Pos_backward>=flend))&oryzaSNP.raw$MH63!=oryzaSNP.raw$Zhen_Shan_97]))
str(flsfp.probes <- flsfp.probes[flsfp.probes%in%intersect(nosfp.probes,uniprobes)])
str(intflsfp.probes <- intflsfp.probes[intflsfp.probes%in%intersect(nosfp.probes,uniprobes)])
str(tmp <- intersect(flsfp.probes,intflsfp.probes))
str(flsfp.probes <- flsfp.probes[!flsfp.probes%in%tmp])
str(intflsfp.probes <- intflsfp.probes[!intflsfp.probes%in%tmp])
##save(flsfp.probes,intflsfp.probes,file="flsfp.probes.RData",compress=T)

load("fit.data.RData")
load("fit.RData")
load("nosfp.probes.RData")
load("sfp.probes.RData")
##str(myfit.data <- data.frame(Probename=I(rownames(fit$p.value)),p.value=fit$p.value[,1],pq.value=p.adjust(fit$p.value[,1],"fdr"),t=fit$t[,1],lfc=fit$coefficients[,1]))
##save(myfit.data,file="myfit.data.RData",compress=T)
load("myfit.data.RData")
load("uniprobes.RData")
load("flnosfp.probes.RData")
load("flsfp.probes.RData")
str(psubset <- intersect(c(flnosfp.probes,flsfp.probes,intflsfp.probes),nosfp.probes))
mydata <- myfit.data[match(psubset,myfit.data$Probename,nomatch=0),]
mydata$flSNP <- as.numeric(mydata$Probename%in%flsfp.probes)
mydata$intflSNP <- as.numeric(mydata$Probename%in%intflsfp.probes)
##save(mydata,file="mydata080224.RData",compress=T)
##load("mydata080224.RData")
mean(mydata$flSNP>0);mean(mydata$intflSNP>0)
## [1] 0.01474636
## [1] 0.02752654
filter <- mydata$pq.val<0.05&mydata$t!=0;table(tmp <- mydata$flSNP[filter]>0);mean(tmp);table(tmp <- mydata$intflSNP[filter]>0);mean(tmp)
## FALSE  TRUE 
##   122     9 
## [1] 0.06870229

## FALSE  TRUE 
##   121    10 
## [1] 0.07633588
filter <- mydata$pq.val<0.001&mydata$t!=0;table(tmp <- mydata$flSNP[filter]>0);mean(tmp);table(tmp <- mydata$intflSNP[filter]>0);mean(tmp)
## FALSE  TRUE 
##    30     5 
## [1] 0.1428571

## FALSE  TRUE 
##    33     2 
## [1] 0.05714286
##table(filter <- mydata$pq.val<0.05&mydata$t!=0&(mydata$flSNP>0|mydata$intflSNP>0));tmp <- write.csv(cbind(mydata[filter,],oryzaSNP.raw[match(mydata$Probename[filter],oryzaSNP.raw$Probe_name),]),file="rice_flSNP_probes.csv", row.names = FALSE)
str(tmp.flsnp.sfp <- mydata$Probename[mydata$pq.val<0.001&mydata$t!=0&(mydata$flSNP>0)])

par(mfcol=c(1,2))
brs <- quantile(mydata$pq.value,seq(0.0,1,0.002))
##brs <- seq(0.0,1,0.001)
tmp <- cbind(brs,sapply(brs,function(x){mean(mydata$flSNP[mydata$pq.value<x]>0)}))
ylim <- c(0,0.4)
plot(tmp,type='o',ylim=ylim,ylab="Proportion of SFPs with flanking SNPs",xlab="BH adjusted p-value",main="",cex=0.5)
title("Proportion of SFPs with SNPs in flanking 10 bp", cex.main= 0.75)
tmp <- cbind(brs,sapply(brs,function(x){mean(mydata$intflSNP[mydata$pq.value<x]>0)}))
plot(tmp,type='o',ylim=ylim,ylab="Proportion of SFPs with flanking SNPs",xlab="BH adjusted p-value",main="",cex=0.5)
title("Proportion of SFPs with SNPs in flanking 11-20 bp", cex.main= 0.75)
##lines(tmp,type='o',ylim=ylim,ylab="Proportion of SFPs with flanking SNPs",xlab="BH adjusted p-value",main="Proportion of SFPs with SNPs in flanking 10-20 bp",cex=0.5,col=8)
dev.print(pdf,file="images/rice_flSNP_por.pdf",width=8, height=6)


##################################################
## Fisher exact test and Chi test
##################################################
load("fit.data.RData")
x<-table(mydata$flSNP[mydata$Probename%in%rownames(fit.data)[fit.data!=0]]>0);p <- table(mydata$flSNP);p <- p/sum(p)
cbind(p,x,x/sum(x));(rchi <- chisq.test(x, p = p))
cat("Fold: ",(x[2]/sum(x))/p[2],"\n")
x<-table(mydata$intflSNP[mydata$Probename%in%rownames(fit.data)[fit.data!=0]]>0);p <- table(mydata$intflSNP);p <- p/sum(p)
cbind(p,x,x/sum(x));(rchi <- chisq.test(x, p = p))
cat("Fold: ",(x[2]/sum(x))/p[2],"\n")

thread <- 0.001;x.t <- cbind(table(mydata$flSNP[mydata$pq.val>=thread&mydata$t!=0]>0),table(mydata$flSNP[mydata$pq.val<thread&mydata$t!=0]>0));dimnames(x.t) <- list(c('NoSNP','SNP'),paste(c('>','<'),thread,sep=''));x.t
fisher.test(x.t,alter='great')
x.t <- cbind(table(mydata$intflSNP[mydata$pq.val>=thread&mydata$t!=0]>0),table(mydata$intflSNP[mydata$pq.val<thread&mydata$t!=0]>0));dimnames(x.t) <- list(c('NoSNP','SNP'),paste(c('>','<'),thread,sep=''));x.t
fisher.test(x.t,alter='great')


################################################################

##################################################
## Does flsnp influence SFPs detected by using single tissue?
##################################################
library(limma)
load("riceRNApm.RData")
load("pmIndex.RData")
load('EXPprobesets.RData');load("uniprobes.RData");


for(stage in 3:5){
  cat("STAGE",stage,"\n",sep="\t")
  filter.str <- paste('[MZ]',stage,sep='')
  filter.cols <- grep(filter.str,colnames(riceRNA.pm))
  filter.rows <- T
  pm.medpolishres <- get_medpolish_res(riceRNA.pm[filter.rows,filter.cols],ps.n=11)
  ##save(pm.medpolishres,file=paste('pm.medpolishres_stage',stage,'.RData',sep=''),compress=T)
  ##load(paste('pm.medpolishres_stage',stage,'.RData',sep=''))
  str(pm.medpolishres)
  cl <- factor(substring(colnames(pm.medpolishres),1,1))
  design <- model.matrix(~0+cl);colnames(design) <- c('G1','G2');contrast.matrix <- makeContrasts(G2-G1,levels=design)
  fit <- do_limma(pm.medpolishres,design,contrast.matrix)
  fit <- eBayes(fit) 
  save(fit,file=paste('fit_stage',stage,'.RData',sep=''),compress=T)
  show(summary(fit.data <- decideTests(fit,p=0.02)))
  ##save(fit.data,file=paste('fit.data_stage',stage,'.RData',sep=''),compress=T)
  table(filter <- fit.data!=0)
  mysfp <- data.frame(Probename=I(rownames(fit.data)),p.value=fit$p.value[,1],t=fit$t[,1],lfc=fit$coe[,1])[filter,]
  mysfp$Probeset <- sub('-.*','',mysfp$Probename)
  ##save(mysfp,file=paste('mysfp_stage',stage,'.RData',sep=''),compress=T)
}
  
str(mysfp <- mysfp[mysfp$Probeset%in%EXPprobesets,])
fit.data[!rownames(fit.data)%in%uniprobes] <- 0
summary(fit.data)


##How is the distribution of SFP per probeset?
(tmp <- table(table(mysfp$Probeset)))
sum(tmp);tmp[1]/sum(tmp)


##################################################
## To get the same numbers of 6,655 SFPs
##################################################
fixnum <- 6655
for(stage in 3:5){
  cat("STAGE",stage,"\n",sep="\t")
  load(paste('fit_stage',stage,'.RData',sep=''))
  str(my.pqvalue <- p.adjust(fit$p.value[,1],'fdr'))
  (thresh <-quantile(my.pqvalue, fixnum/length(my.pqvalue)))
  fit.data <- matrix(rep(0,length(my.pqvalue)))
  fit.data <- new("TestResults", array(0, dim(fit$t)))
  dimnames(fit.data) <- dimnames(fit$t)
  fit.data[my.pqvalue<thresh&fit$t>0] <- 1
  fit.data[my.pqvalue<thresh&fit$t<0] <- -1
  print(summary(fit.data))
  save(fit.data,file=paste('fit.data_stage',stage,'_by_threshold.RData',sep=''),compress=T)
}


################################################################
load('snp.geno.RData')
load("nosnp.sfp.probes.RData")
load("fit.data.RData")
fit.data.mp <- fit.data
sfpprobes.mp <- rownames(fit.data)[fit.data!=0]
nosnp.sfp.con <- rep(0,length(nosnp.sfp.probes));names(nosnp.sfp.con) <- nosnp.sfp.probes
sfpprobes.tis <- NULL

for(stage in 3:5){
  cat("STAGE",stage,"\n",sep="\t")
  load(paste('fit.data_stage',stage,'_by_threshold.RData',sep=''))
  print(summary(fit.data))
  print(table(fit.data,fit.data.mp))
  cat("corelation: ",sum(diag(table(fit.data,fit.data.mp)))/nrow(fit.data),"\t",mean(sfpprobes.mp%in%rownames(fit.data)[fit.data!=0]),"\n")
  print(table(fit.data[rownames(fit.data)%in%nosnp.sfp.probes]))
  cat("corelation with ALL non-polymorphic SFP: ",mean(tmpfilter <- nosnp.sfp.probes%in%rownames(fit.data)[fit.data!=0]),"\n")
  nosnp.sfp.con[tmpfilter] <- nosnp.sfp.con[tmpfilter]+1
  cat("corelation with ALL polymorphic SFP: ",mean(snp.sfp.probes%in%rownames(fit.data)[fit.data!=0]),"\n")
  sfpprobes.tis <- union(sfpprobes.tis,rownames(fit.data)[fit.data!=0])
  ##cat("corelation with ALL flanking polymorphic SFP: ",mean(tmp.flsnp.sfp%in%rownames(fit.data)[fit.data!=0]),"\n")
  sfp.check <- fit.data[rownames(fit.data)%in%names(snp.geno),]
  print(y <- table(snp.geno,sfp.check))
  print(summary(y))
  cat("Sensitivity", sum(c(y[c(1,3),-2]))/sum(c(y[c(1,3),])), "\n" )
  cat("False Negative Rate", 1-sum(c(y[c(1,3),-2]))/sum(c(y[c(1,3),])), "\n") 
  cat("False positive Rate", sum(c(y[2,-2]))/sum(c(y[2,])), "\n")
  cat("False sequence polymorphism Rate", sum(c(y[2,-2]))/sum(c(y[,c(1,3)])), "\n")
##   cat("Sensitivity", sum(y[1,1]+y[3,3])/sum(y[c(1,3),]), "\n" )
##   cat("False sequence polymorphism Rate", sum(y[2:3,1]+y[1:2,3])/sum(y[,c(1,3)]), "\n")
##   cat("False negative Rate", sum(y[2,c(1,3)])/sum(y[2,]), "\n")
}

mean(sfpprobes.mp%in%sfpprobes.tis)
##save(nosnp.sfp.con,file="nosnp.sfp.con.RData",compress=T)
(table(tmp <- nosnp.sfp.con>0));mean(tmp)