## by linear models probe by genotype interactions.
#Genome-wide single nucleotide polymorphism discovery in barley GeneChip transcription profiling data 

#Nils Rostoks 1, Justin O. Borevitz 2, Arnis Druka 1, Sharon Mudie 1, Linda Milne 1, David F. Marshall 1, Robbie Waugh 1*

#generat sample concept plot
## 11 features, 2 tissues, 2 genotypes, 3 replicates
probe.effect <- rnorm(11)
tis.effect <- 0.5
rep.error <- 0.2
geno.effect <- 1
sfp.probe <- 9
samp.mat <- matrix(probe.effect,nr = 11, nc = 2*2*3)
samp.mat <- samp.mat + rnorm(length(samp.mat),sd = rep.error)
samp.mat[,1:6] <- samp.mat[,1:6] + tis.effect
samp.mat[-sfp.probe,c(1:3,7:9)] <- samp.mat[-sfp.probe,c(1:3,7:9)] + geno.effect
matplot(samp.mat,lty = rep(c(1,2),each = 6), col = rep(c(1,2),each =3),type = "l")


#install bioconductor if not already done, use..
#source("http://bioconductor.org/getBioC.R")
#getBioC("affy")
#download http://www.bioconductor.org/data/probes/Packages/barley1probe_1.0.zip
# and unzip into library folder
library(affy) 
#library(barley1probe)
#library(affyPLM)
setwd("c:/barley/CEL")
gDNA.batch <- ReadAffy() # read all .CEL files in current direction, alphabetical order
# will download barleycdf package
gDNA.batch <- bg.correct.rma2(gDNA.batch) #background correct
#gDNA.batch <- bg.correct.LESN(gDNA.batch) #background correct new Bolstad methods Low End Signal Noise
gDNA.batch <- normalize.AffyBatch.quantiles(gDNA.batch, type = "pmonly") # quantile normalize

#look at 22801 probesets with 11 probes each
barley.probeNames <- probeNames(gDNA.batch)
table(table(barley.probeNames))
#    8     9    10    11    12    20 
#    1     4     9 22801     1    24 
##linux sorts by case sensetive aphabetical while Windows does not
probeset11 <- names(which(table(barley.probeNames)==11))

indexBprobes <- indexProbes(gDNA.batch,which = "pm", genenames = probeset11)
pm.i.xy <- indices2xy(unlist(indexBprobes), abatch = gDNA.batch)
barley.names11xy <- apply(cbind(rownames(pm.i.xy),pm.i.xy),1,paste,collapse = "--")

#extract perfect match values to a matrix
gDNA.pm <- log(pm(gDNA.batch,probeset11),2) #log base2 for fold change in SAM

# test naming order
#i <- 10000 # pick a probe to test
#barley.names11xy[i]
#ptest <- indexProbes(gDNA.batch,which = "pm", genenames = "Contig10812_at")
#ndices2xy(unlist(ptest), abatch = gDNA.batch)
#gDNA.pm[i,]
#log(pm(gDNA.batch,"Contig10812_at"))

library(siggenes) # load SAM package
cl <- c(1,1,1,2,2,2,3,3,3) # model matrix
system.time(sam.out <- sam(gDNA.pm,cl,rand = 123, gene.names = barley.names11xy,p0= 0.95))
summary(sam.out,1:5)
#SAM Analysis for the Multi-Class Case with 3 Classes 
# s0 = 0.0123  (The 25 % quantile of the s values.) 
# Number of permutations: 100  
# MEAN number of falsely called genes is computed.
#   Delta      p0   False Called     FDR cutlow    cutup j2     j1
#1      1 0.92018 2072.58   4017 0.47477   -Inf  5.02720  0 246795
#2      2 0.92018  582.60   1728 0.31024   -Inf  7.27272  0 249084
#3      3 0.92018  257.83   1090 0.21766   -Inf  9.03942  0 249722
#4      4 0.92018  138.75    789 0.16182   -Inf 10.63036  0 250023
#5      5 0.92018   86.42    631 0.12602   -Inf 12.06848  0 250181

#par(mfrow=c(1,2))
jpeg("barleySAMplot.jpg")
plot(sam.out,1:20)
dev.off()
jpeg("barleySAMplotDelta3.jpg")
plot(sam.out,3)
dev.off()

gDNAsfp <- summary(sam.out,3,ll=F,file = "barleygDNAsfp.csv",sep = ",",quote=F) #19 header rows
#write.table(sigSFP$mat.sig,file = "barleygDNAsfp.csv",sep = ",")

## try Efron EBAM to estimat a0 in denoinator
#find.out <- find.a0(gDNA.pm, cl, rand = 123) # not working for 3 classes maybe in 1.6BioC
#ebam.out <- ebam(find.out, gene.names = barley.names11xy)

#ebam.wilc.out <- ebam.wilc(gDNA.pm,cl, rand = 123, gene.names = barley.names11xy)# not working for 3 classes maybe in 1.6BioC

save(sam.out, file = "barleygDNAsam.out.RData",compress = T)


## now run the RNA 36 files, fresh start

library(affy)
setwd("c:/barley/CELrna")
cels <- list.celfiles()
chips <- unlist(strsplit(unlist(cels),"\\)"))[seq(1,71,2)]
chips <- unlist(strsplit(unlist(cels),"\\("))[seq(2,72,2)]
temp <- unlist(strsplit(chips,"_"))[-seq(4,144,4)]
pheno <- matrix(temp[-seq(3,108,3)],nc=2,byrow=T)
covN <- list(tissue = "tissue", genotype = "genotype")
pD <- new("phenoData", pData = as.data.frame(pheno), varLabels = covN)

barley.object <- read.affybatch(filenames = cels, phenoData = pD)
#pData(barley.object)

#may need to be root user for instally of barley1cdf
barley.object <- bg.correct(barley.object,method = "rma2")
barley.object <- normalize.AffyBatch.quantiles(barley.object, type = "pmonly")

barley.probeNames <- probeNames(barley.object)
table(table(barley.probeNames))
probeset11 <- names(which(table(barley.probeNames)==11))

indexBprobes <- indexProbes(barley.object,which = "pm", genenames = probeset11)
pm.i.xy <- indices2xy(unlist(indexBprobes), abatch = barley.object)
barley.names11xy <- apply(cbind(rownames(pm.i.xy),pm.i.xy),1,paste,collapse = "--")
#save(barley.names11xy,file = "probeNames.RData",compress=T)

#extract perfect match values to a matrix
barleyRNA.pm <- log(pm(barley.object,probeset11),2) # log base2 for fold change in SAM
rownames(barleyRNA.pm) <- barley.names11xy
barleyRNA.pm <- barleyRNA.pm[,order(pheno[,1],pheno[,2])]
pheno <- pheno[order(pheno[,1],pheno[,2]),]
#save(barleyRNA.pm,file= "barleyRNApm.RData", compress =T)
jpeg(file = "barleyRNAcorimage.jpg")
par(mar=c(5, 5, 4, 2))
image(1:36,1:36,cor(barleyRNA.pm),xaxt = "n", yaxt = "n", xlab = "", ylab = "", col= heat.colors(256))
# possible missclassification
#0404-29(ROO_GP_B_29).CEL may really be 0404-29(ROO_MX_B_29).CEL
chip.names <- apply(pheno,1,paste,collapse="")
for (i in 1:5){
mtext(substring(chip.names, first = i, last = i),
        side = 1, at = 1:36, line = i-1, cex = 0.75)
mtext(substring(chip.names, first = i, last = i),
        side = 2, at = 1:36, line = 5-i, cex = 0.75)
}
abline(v=31 +c(-0.5,0.5))
abline(h=31 +c(-0.5,0.5))
dev.off()

#have a look without LEAves which are quite different
jpeg(file = "barleyRNAcorimage-LEA.jpg")
par(mar=c(5, 5, 4, 2))
image(1:30,1:30,cor(barleyRNA.pm[,-(19:24)]),xaxt = "n", yaxt = "n", xlab = "", ylab = "", col= heat.colors(256))
# possible missclassification
#0404-29(ROO_GP_B_29).CEL may really be 0404-29(ROO_MX_B_29).CEL
chip.names <- apply(pheno[-(19:24),],1,paste,collapse="")
for (i in 1:5){
mtext(substring(chip.names, first = i, last = i),
        side = 1, at = 1:30, line = i-1, cex = 0.75)
mtext(substring(chip.names, first = i, last = i),
        side = 2, at = 1:30, line = 5-i, cex = 0.75)
}
abline(h=25 +c(-0.5,0.5))
abline(v=25 +c(-0.5,0.5))
dev.off()


## format to account for gene,tissue,genotype effects

barley.pm <- matrix(t(barleyRNA.pm), nrow = 36 * 11, ncol = 22801)
colnames(barley.pm) <- probeset11
barley.pm <- as.data.frame(barley.pm)
barley.pm$probe <- factor(rep(paste("p",1:11,sep=""),each = 36))
barley.pm$tissue = factor(rep(pheno[,1], 11))
barley.pm$genotype = factor(rep(pheno[,2], 11) )

save(barley.pm,file= "barleyRNApm.RData", compress =T)

## set up full model as a test
i <- 1
#now record and remove the main effect of gene,tissue,genotype save residuals
res.mat <- matrix(nrow = 396, ncol = 22801)
coef.array <- array(dim = c(11,4,22801))
t1 <- proc.time()[3]
for (i in 1:22801){
# fit everything  except the SFP probe:genotype interaction
fit <- summary(lm(barley.pm[,i] ~ probe + tissue + genotype + tissue:genotype, data = barley.pm))
res.mat[,i] <- fit$residuals
coef.array[,,i] <- fit$coefficients[-(1:11),] #drop the fixed probe effects
}
t2 <- proc.time()[3]
(t2-t1)/60 ## 127 minutes for naturalj2

res.mat <- t(matrix(res.mat,nr=36))
colnames(res.mat) <- pheno[,2]
rownames(res.mat) <- barley.names11xy
tisP <-  t(coef.array[,4,])
dimnames(tisP) <- list(names(barley.pm)[1:22801], dimnames(fit$coefficients)[[1]][12:22])
tisF <-  t(coef.array[,1,])
dimnames(tisF) <- list(names(barley.pm)[1:22801], dimnames(fit$coefficients)[[1]][12:22])
write.table(tisP,file = "TissueGenoExpressionPval.csv", sep = ",")
write.table(tisF,file = "TissueGenoExpressionFold.csv", sep = ",")

save(res.mat,file = "barley.residuals.RData", compress = T)
save(coef.array,file = "barley.coefarray.RData", compress = T)

load("barley.residuals.RData")
library(siggenes) # load SAM package
cl <- colnames(res.mat) # model matrix
system.time(sam.out <- sam(res.mat,cl,rand = 123,B=500, p0 = .95))# 500 permutations to get a good estimate of 1% or lower FDR
# test 6 arrays for  comparision with gDNA power
#system.time(sam.out <- sam(res.mat[,1:6],cl[1:6],rand = 123,B=100, p0 = .95))
#system.time(sam.out <- sam(res.mat[,seq(1,36,4)],cl[seq(1,36,4)],rand = 123,B=100, p0 = .95))
# 302.1 seconds for 100 on naturalj2
summary(sam.out,1:5/2)

#SAM Analysis for the Two-Class Unpaired Case Assuming Unequal Variances 
# s0 = 0.0342  (The 5 % quantile of the s values.) 
# Number of permutations: 500  
# MEAN number of falsely called genes is computed.
#  Delta   p0    False Called     FDR   cutlow   cutup    j2     j1
#1   0.5 0.95 5883.776  27159 0.20581 -1.60497 1.56704 12837 236490
#2   1.0 0.95  594.278  17744 0.03182 -2.23165 2.21335  8635 241703
#3   1.5 0.95   64.574  13285 0.00462 -2.81474 2.80484  6562 244089
#4   2.0 0.95    6.998  10504 0.00063 -3.37739 3.38004  5301 245609
#5   2.5 0.95    0.462   8583 0.00005 -3.92994 3.94168  4397 246626

jpeg("barleyRNASAMplot.jpg")
plot(sam.out,1:20/10)
dev.off()
#jpeg("barleySAMRNAplotDelta2.jpg")
pdf("barleySAMRNAplotDelta2.pdf", width = 11, height = 6,)
par(mfrow = c(1,2))
plot(sam.out,2)
mtext("A",adj=0,line = 2)
hist(pmin(pmax(sam.out@d,-10),10),breaks = 100, main = "Histogram of d-statistics",xlab = "d-statistics")
abline(v = c(-3.377,3.38),lty = 2)
mtext("B",adj=0,line = 2)
dev.off()

rnasfp <- summary(sam.out,2,ll=F,file = "barleyRNAsfp.csv",sep = ",",quote=F) #19 header rows
save(sam.out, file = "barleyRNAsam.out.RData",compress = T)
sfpName <- matrix(unlist(strsplit(rownames(rnasfp$mat.sig),split = "--")),nr=3)[1,] # split off the --xpos--ypos names
sfpProbeSetName <- matrix(unlist(strsplit(sfpName,split = "_at")),nr=2)[1,] # split off _at probe number
plot(table(table(sfpProbeSetName)))
 
barplot(table(table(sfpProbeSetName)))

table(table(sfpProbeSetName))
#
#   1    2    3    4    5    6    7    8    9   10   11 
#1951  563  262  167  157  123  114  136  108   91   62 
# length(table(sfpProbeSetName))
#[1] 3734 
featSFP <- rownames(rnasfp$mat.sig)

setwd("c:/barley/CELrna")
rnaSFP <- read.csv("barleyRNAsfp.csv",skip=19,as.is=T)
summary(rnaSFP)
sfpName <- matrix(unlist(strsplit(as.character(rnaSFP$Name),split = "--")),nr=3)[1,] # split off the --xpos--ypos names
sfpProbeSetName <- matrix(unlist(strsplit(sfpName,split = "_at")),nr=2)[1,] # split off _at probe number
jpeg("SFPsProbeSet.jpg")
barplot(table(table(sfpProbeSetName)),xlab = "SFPs per ProbeSet", ylab = "Number of SFPs")
dev.off()

# could probesets with 11 SFPs be half up and  half down? otherwise  gene experssion, check dstat sign, also large up  or large down, could use robust measure
sfpSet11 <- names(which(table(sfpProbeSetName)==11))
sfp11tab <- rnaSFP[sfpProbeSetName %in% sfpSet11,]
sfp11tab <-  sfp11tab[order(as.character(sfp11tab$Name)),]
write.table(sfp11tab,file = "sfp11.csv",sep=",",row.names=F)

## try Efron EBAM to estimat a0 in denoinator
#find.out <- find.a0(res.mat, as.numeric(factor(cl)), rand = 123) #not enough memory
#ebam.out <- ebam(res.mat, gene.names = barley.names11xy)


## compare gDNA and RNA sfps, don't expect too many since different genotypes and not much power in gDNA
## still better than random.
library(siggenes)
setwd("c:/barley/CELrna")
load("barleyRNAsam.out.RData") # reorder from Linux
rna.qval <- sam.out@q.value[order(names(sam.out@q.value))]
load("../CEL/barleygDNAsam.out.RData")
gDNA.qval <- sam.out@q.value[order(names(sam.out@q.value))]

table(names(gDNA.qval) == names(rna.qval))
#TRUE 

#test some thresholds
res.mat <- NULL
for (i in 10^(-1:-4))
for (j in 1:4/10)
res.mat <-  c(res.mat,summary(table(gDNA.qval < j,rna.qval < i))$statistic)
print(matrix(res.mat,nc=4,dimnames=list(paste("gDNA",1:4/10),paste("rna",10^(-1:-4)))))
#           rna 0.1  rna 0.01 rna 0.001 rna 1e-04
#gDNA 0.1  39.11611  43.94448  57.57982  43.01975
#gDNA 0.2  95.91104  77.72186  95.60787  85.60262
#gDNA 0.3 107.67837 108.97178 135.67282 123.89319
#gDNA 0.4 133.15521 125.33030 133.02444 115.59271

j <- .3
i <- 0.001
summary(table(gDNA.qval < j,rna.qval < i))
#Number of cases in table: 250811 
#Number of factors: 2 
#Test for independence of all factors:
#        Chisq = 135.67, df = 1, p-value = 2.353e-31

table(gDNA.qval < j,rna.qval < i)
#        FALSE  TRUE  
#  FALSE 238329  10847
#  TRUE    1467    168

write(names(gDNA.qval)[which(gDNA.qval < j & rna.qval < i)],file  = "RNAgDNAsfp.txt")

# go with original thresholds
setwd("c:/barley/CEL")
gDNAsfp <- read.csv("barleygDNAsfp.csv",skip=19,as.is=T)
summary(gDNAsfp)
allProbe <- names(sam.out@q.value)
rSFP <- allProbe %in% rnaSFP$Name
dSFP <- allProbe %in% gDNAsfp$Name 
table(rSFP,dSFP)
#       dSFP
#rSFP    FALSE  TRUE  
#  FALSE 239331    976
#  TRUE   10390    114
summary(table(rSFP,dSFP))
#Number of cases in table: 250811 
#Number of factors: 2 
#Test for independence of all factors:
#        Chisq = 107.28, df = 1, p-value = 3.863e-25

write(allProbe[rSFP & dSFP],file  = "RNAgDNAsfp.txt")


 
# overlap with sequence data
setwd("c:/barley/CELrna")
allSNP <- read.csv("../SNPtable.csv")
allSNP$SNPscore <- as.character(allSNP$SNPscore)
allSNP$SNPscore[grep("\\+",allSNP$SNPscore)] <- 15 # multiple is slot 15
allSNP$snpPos <- as.numeric(allSNP$SNPscore)
table(allSNP$snpPos)
table(table(allSNP$Probe))
#   1    2    3 
#2636   30    1 
allSNP <- allSNP[order(allSNP$Probe),]
temp <- which(duplicated(allSNP$Probe))
# drop multiples to 
allSNP[sort(c(temp,temp-1)),]
allSNP <- allSNP[-temp,]
# drop GP,MX both have changes from probe
allSNP <- allSNP[allSNP$SNPgenotype != "GP,MX",]
summary(allSNP)
save(allSNP,file = "allSNP.RData",compress=T)
load("barleyRNAsam.out.RData") # reorder from Linux

library(siggenes)
sfpName <- matrix(unlist(strsplit(names(sam.out@d),split = "--")),nr=3)[1,] # split off the --xpos--ypos names
probeSNP <- match(as.character(allSNP$Probe),sfpName)
allSNP[x <- sample(length(probeSNP),10),]
sam.out@d[probeSNP][x]

allSNP$samD <- sam.out@d[probeSNP]
allSNP$samQ <- sam.out@q.value[probeSNP]
allSNPsave <- allSNP
for (i in c("EST","Random","targeted")){
cat(i)
allSNP <- allSNPsave[allSNPsave$dataset== i,]
attach(allSNP)

# test a few thresholds
for (k in seq(1,5,.5)){ # range of chisq agrees with FDR go with 3.2 dstat,  2 delta, 0.001 qvalue 3.5e-5 pvalue
print(k)
y <- table(cut(x <- samD,c(max(x),-k,k,min(x))),as.character(SNPgenotype))
print(y[,c(1,3,2)])
print(summary(y))
}
}

allSNP <- allSNPsave 
attach(allSNP)

#mp <- grep("_x|_s",as.character(allSNP$Probe)) # repete probes are working fine

table(as.character(SNPgenotype))[c(1,3,2)] # more GP polymorphisms because MX is mostly the design?
    GP no poly      MX 
    223    2200     178 
# with original threshold 
table( cut(sam.out@d, c(-1000, -3.37739, 3.3800 ,1000)) ) # cutpoint 3.38 is just below 3.38004 to catch flanking SFP
#GP is positive scores as ref is greater stat
#(-1e+03,-3.38]   (-3.38,3.38]   (3.38,1e+03] 
#          5301         240307           5203 
geno <-  cut(samD, c(-1000, -3.37739, 3.3800 ,1000), labels = c("mxSFP","nonSFP","gpSFP") )
y <- table(as.character(SNPgenotype), geno) 
print(summary(y))
#Number of cases in table: 2601 
#Number of factors: 2 
#Test for independence of all factors:
#        Chisq = 2049.2, df = 4, p-value = 0
y <- y[c(2,3,1),]
print(y)        
#         geno
#          mxSFP nonSFP gpSFP
#  MX        115     45    18
#  no poly    27   2045   128
#  GP          7     61   155
cat("Sensitivity", sum(y[1,1]+y[3,3])/sum(y[c(1,3),]), "\n" )
#Sensitivity 0.6733167 
cat("False sequence polymorphism Rate", sum(y[2:3,1]+y[1:2,3])/sum(y[,c(1,3)]), "\n")
#False sequence polymorphism Rate 0.4 
# % variance explained
summary(lm(samD~SNPgenotype))
# 38%

# absolute value but contains misscalls
thresh <- quantile(sam.out@q.value, 10504/length(sam.out@q.value) )
table(sam.out@q.value <  thresh)
# FALSE   TRUE 
#240307  10504 
table(SNPgenotype!="no poly", samQ < thresh )    
#        FALSE TRUE
#  FALSE 2045   155
#  TRUE   106   295
summary(table(SNPgenotype!="no poly", samQ < thresh ))
#Number of cases in table: 2601 
#Number of factors: 2 
#Test for independence of all factors:
#        Chisq = 1049, df = 1, p-value = 4.055e-230

offGP <- which(geno == "mxSFP" & SNPgenotype == "GP")
offMX <- which(geno == "gpSFP" & SNPgenotype == "MX")

y <- table(allSNP$snpPos[-c(offGP,offMX)], geno[-c(offGP,offMX)])
y
#     mxSFP nonSFP gpSFP
#  1   2    10      1   
#  2   1    10      1   
#  3   2    13      4   
#  4   8     8      5   
#  5   6     7     11   
#  6  14     5     18   
#  7  10     8     14   
#  8   5     1     15   
#  9   5    10     11   
#  10 12     7     10   
#  11  4     6     18   
#  12  9     4     20   
#  13  8     2      9   
#  15 15     5      9   
# getS3method("plot","table")
rownames(y)[14] <- "Multiple"
pdf("SNPpositionSFPcall.pdf")
mosaicplot(y, # expect to see middle group shrink as SFP calling  improves above 4
      main= "SNP position vs SFP call", ylab = "SFP call", xlab = "SNP position")
cex <- 0.6
mtext(y[,1],side = 1,line = -1,at= cumsum(rowSums(y)/sum(y+1-cex)),cex=cex)
mtext(y[,2],side = 1,line = -11,at= cumsum(rowSums(y)/sum(y+1-cex)),cex=cex)
mtext(y[,3],side = 1,line = -20,at= cumsum(rowSums(y)/sum(y+1-cex)),cex=cex)
barplot(t(y))
dev.off()
# plot some raw data .pdf's to look at SFPs

setwd("c:/barley/CELrna")
library(barley1probe)
data(barley1probe)
load("C:/barley/CELrna/barleyRNApm.RData")
sfpSet <- paste(names(table(sfpProbeSetName)),"at",sep="_")
tis <- as.numeric(barley.pm$tissue)
gen <- as.numeric(barley.pm$genotype)
prob <- as.numeric(barley.pm$probe)

pdf(file = "barleyRNAbpSFPset.pdf")
par(mfrow = c(2,3))
for (i in 1:100){# first so many genes with SFPs
#i <-  13
probeSet <- sfpSet[i]
# show bp?
xpos <- barley1probe$Probe.Interrogation.Position[barley1probe$Probe.Set.Name == probeSet]
#xpos <- 1:11
gene.plot <- barley.pm[,probeSet]
fit <- lm(gene.plot ~ probe + tissue + genotype + tissue:genotype
                        # + probe:genotype # leave this off for the main effect fits
                          ,      data = barley.pm)
probe.effect <- c(0,fit$coefficients[2:11])[prob]
tissue.effect <- c(0,fit$coefficients[12:16])[tis]
geno.effect <- c(0,fit$coefficients[17])[gen]
tissue.geno <- c(0,fit$coefficients[18:22])[tis]
tissue.geno[gen == 1] <- 0 # coefs only effect mx genotype
# raw data
matplot(xpos, t(matrix(gene.plot,nr = 36)),
         type = "l", col = gen, lty = tis,
         xlab = probeSet, ylab = "log intensity", main = "raw data")
# remove probe
matplot(xpos, t(matrix(gene.plot - probe.effect,nr = 36)),
         type = "l", col = gen, lty = tis,
         xlab = probeSet, ylab = "log intensity", main = "- probe")
# remove probe and tissue
matplot(xpos, t(matrix(gene.plot - probe.effect - tissue.effect,nr = 36)), 
         type = "l", col = gen, lty = tis,
         xlab = probeSet, ylab = "log intensity", main = "-probe - tissue")
# remove probe and genotype
matplot(xpos, t(matrix(gene.plot - probe.effect - geno.effect,nr = 36)), 
         type = "l", col = gen, lty = tis, main = "- probe - geno",
         xlab = probeSet, ylab = "log intensity")
# remove probe and genotype and tissue
matplot(xpos, t(matrix(gene.plot - probe.effect - geno.effect - tissue.effect,nr = 36)), 
         type = "l", col = gen, lty = tis, main = "- probe - geno - tissue",
         xlab = probeSet, ylab = "log intensity")
# remove probe and genotype and tissue and tissue*geno
matplot(xpos,t(matrix(gene.plot - probe.effect - geno.effect - tissue.effect - tissue.geno, nr = 36)),
         type = "l", col = gen, lty = tis, main = "- probe - geno - tissue - geno*tissue",
         xlab = probeSet, ylab = "log intensity",cex.main = 0.6 )
cat(probeSet,"\n")
}# close loop for genes
dev.off() # close .pdf 





####### test certain tissues as a power question
# with 1 tissue there can be general genotype gene expression differences and probe level differences SFPs
setwd("c:/barley/CELrna")
load("barleyRNApm.RData")
table(barley.pm$genotype,barley.pm$tissue)
#     COL CRO GEM LEA RAD ROO
#  GP 33  33  33  33  33  22 
#  MX 33  33  33  33  33  44  #arrays*11 probes
tis <- names(table(barley.pm$tissue))

load("C:/barley/allSNP.RData")
sfpName <- paste(rep(names(barley.pm)[1:22801],each=11),1:11,sep="")
#load("C:/barley/CELrna/probeNames.RData")
#sfpName <- matrix(unlist(strsplit(barley.names11xy,split = "--")),nr=3)[1,] # split off the --xpos--ypos names
probeSNP <- match(as.character(allSNP$Probe),sfpName)
cbind(allSNP[1000:1010,],sfpName[probeSNP][1000:1010]) #check naming match


fdr.tab <- matrix(nr=3,ncol = length(tis))
rownames(fdr.tab) <- c("Sensitivity","False sequence polymorphism rate", "%var")
colnames(fdr.tab) <- tis
library(siggenes) # load SAM package
for (i in 1:6){
barley.pmT <- barley.pm[barley.pm$tissue == tis[i],]

## set up full model as a test
#now record and remove the main effect of gene,tissue,genotype save residuals
res.mat <- lsfit(as.numeric(barley.pmT$genotype),barley.pmT[,1:22801])$residuals
res.mat <- t(matrix(res.mat,nr=6))
colnames(res.mat) <- barley.pmT$genotype[1:6]
print(colnames(res.mat))
cl <- colnames(res.mat) # model matrix
system.time(sam.out <- sam(res.mat,cl,rand = 123,B=500, p0 = .95))# can only do 20 perms

#compare with SNP table
allSNP$samD <- sam.out@d[probeSNP]
allSNP$samQ <- sam.out@q.value[probeSNP]
attach(allSNP)

# choose same thresholds as for 36 arrays
cuts <- quantile(sam.out@d,c(0, 5301, length(sam.out@d)-5204, length(sam.out@d))/length(sam.out@d))
table(cut(sam.out@d,cuts,right=F))
geno <-  cut(samD, cuts, labels = c("mxSFP","nonSFP","gpSFP"),right=F )
y <- table(as.character(SNPgenotype), geno) 
print(tis[i])
print(summary(y))
y <- y[c(2,3,1),]
print(y)
fdr.tab[1,i] <- sum(y[1,1]+y[3,3])/sum(y[c(1,3),])
fdr.tab[2,i] <- sum(y[2:3,1]+y[1:2,3])/sum(y[,c(1,3)])
fdr.tab[3,i] <- summary(lm(samD~SNPgenotype))$r.squared
}# close tis loop

write.table(fdr.tab,file = "fdr.tissue.csv", sep = ",")

















## use same threshold as with 36 arrays 
thresh <- quantile(sam.out@q.value, 10504/length(sam.out@q.value) )
table(sam.out@q.value <  thresh)    
table(SNPgenotype!="no poly", samQ < thresh )    
#       FALSE TRUE
#  FALSE 2029    75
#  TRUE   148   177
summary(table(SNPgenotype!="no poly", samQ < thresh ))
#Number of cases in table: 2429 
#Number of factors: 2 
#Test for independence of all factors:
#        Chisq = 784.3, df = 1, p-value = 1.401e-172
summary(lm(SNPgenotype!="no poly" ~ samQ < thresh )) #33% r2

summary(lm(log(samQ) ~ SNPgenotype!="no poly"))