## by linear models probe by genotype interactions. #Genome-wide single nucleotide polymorphism discovery in barley GeneChip transcription profiling data #Nils Rostoks 1, Justin O. Borevitz 2, Arnis Druka 1, Sharon Mudie 1, Linda Milne 1, David F. Marshall 1, Robbie Waugh 1* #generat sample concept plot ## 11 features, 2 tissues, 2 genotypes, 3 replicates probe.effect <- rnorm(11) tis.effect <- 0.5 rep.error <- 0.2 geno.effect <- 1 sfp.probe <- 9 samp.mat <- matrix(probe.effect,nr = 11, nc = 2*2*3) samp.mat <- samp.mat + rnorm(length(samp.mat),sd = rep.error) samp.mat[,1:6] <- samp.mat[,1:6] + tis.effect samp.mat[-sfp.probe,c(1:3,7:9)] <- samp.mat[-sfp.probe,c(1:3,7:9)] + geno.effect matplot(samp.mat,lty = rep(c(1,2),each = 6), col = rep(c(1,2),each =3),type = "l") #install bioconductor if not already done, use.. #source("http://bioconductor.org/getBioC.R") #getBioC("affy") #download http://www.bioconductor.org/data/probes/Packages/barley1probe_1.0.zip # and unzip into library folder library(affy) #library(barley1probe) #library(affyPLM) setwd("c:/barley/CEL") gDNA.batch <- ReadAffy() # read all .CEL files in current direction, alphabetical order # will download barleycdf package gDNA.batch <- bg.correct.rma2(gDNA.batch) #background correct #gDNA.batch <- bg.correct.LESN(gDNA.batch) #background correct new Bolstad methods Low End Signal Noise gDNA.batch <- normalize.AffyBatch.quantiles(gDNA.batch, type = "pmonly") # quantile normalize #look at 22801 probesets with 11 probes each barley.probeNames <- probeNames(gDNA.batch) table(table(barley.probeNames)) # 8 9 10 11 12 20 # 1 4 9 22801 1 24 ##linux sorts by case sensetive aphabetical while Windows does not probeset11 <- names(which(table(barley.probeNames)==11)) indexBprobes <- indexProbes(gDNA.batch,which = "pm", genenames = probeset11) pm.i.xy <- indices2xy(unlist(indexBprobes), abatch = gDNA.batch) barley.names11xy <- apply(cbind(rownames(pm.i.xy),pm.i.xy),1,paste,collapse = "--") #extract perfect match values to a matrix gDNA.pm <- log(pm(gDNA.batch,probeset11),2) #log base2 for fold change in SAM # test naming order #i <- 10000 # pick a probe to test #barley.names11xy[i] #ptest <- indexProbes(gDNA.batch,which = "pm", genenames = "Contig10812_at") #ndices2xy(unlist(ptest), abatch = gDNA.batch) #gDNA.pm[i,] #log(pm(gDNA.batch,"Contig10812_at")) library(siggenes) # load SAM package cl <- c(1,1,1,2,2,2,3,3,3) # model matrix system.time(sam.out <- sam(gDNA.pm,cl,rand = 123, gene.names = barley.names11xy,p0= 0.95)) summary(sam.out,1:5) #SAM Analysis for the Multi-Class Case with 3 Classes # s0 = 0.0123 (The 25 % quantile of the s values.) # Number of permutations: 100 # MEAN number of falsely called genes is computed. # Delta p0 False Called FDR cutlow cutup j2 j1 #1 1 0.92018 2072.58 4017 0.47477 -Inf 5.02720 0 246795 #2 2 0.92018 582.60 1728 0.31024 -Inf 7.27272 0 249084 #3 3 0.92018 257.83 1090 0.21766 -Inf 9.03942 0 249722 #4 4 0.92018 138.75 789 0.16182 -Inf 10.63036 0 250023 #5 5 0.92018 86.42 631 0.12602 -Inf 12.06848 0 250181 #par(mfrow=c(1,2)) jpeg("barleySAMplot.jpg") plot(sam.out,1:20) dev.off() jpeg("barleySAMplotDelta3.jpg") plot(sam.out,3) dev.off() gDNAsfp <- summary(sam.out,3,ll=F,file = "barleygDNAsfp.csv",sep = ",",quote=F) #19 header rows #write.table(sigSFP$mat.sig,file = "barleygDNAsfp.csv",sep = ",") ## try Efron EBAM to estimat a0 in denoinator #find.out <- find.a0(gDNA.pm, cl, rand = 123) # not working for 3 classes maybe in 1.6BioC #ebam.out <- ebam(find.out, gene.names = barley.names11xy) #ebam.wilc.out <- ebam.wilc(gDNA.pm,cl, rand = 123, gene.names = barley.names11xy)# not working for 3 classes maybe in 1.6BioC save(sam.out, file = "barleygDNAsam.out.RData",compress = T) ## now run the RNA 36 files, fresh start library(affy) setwd("c:/barley/CELrna") cels <- list.celfiles() chips <- unlist(strsplit(unlist(cels),"\\)"))[seq(1,71,2)] chips <- unlist(strsplit(unlist(cels),"\\("))[seq(2,72,2)] temp <- unlist(strsplit(chips,"_"))[-seq(4,144,4)] pheno <- matrix(temp[-seq(3,108,3)],nc=2,byrow=T) covN <- list(tissue = "tissue", genotype = "genotype") pD <- new("phenoData", pData = as.data.frame(pheno), varLabels = covN) barley.object <- read.affybatch(filenames = cels, phenoData = pD) #pData(barley.object) #may need to be root user for instally of barley1cdf barley.object <- bg.correct(barley.object,method = "rma2") barley.object <- normalize.AffyBatch.quantiles(barley.object, type = "pmonly") barley.probeNames <- probeNames(barley.object) table(table(barley.probeNames)) probeset11 <- names(which(table(barley.probeNames)==11)) indexBprobes <- indexProbes(barley.object,which = "pm", genenames = probeset11) pm.i.xy <- indices2xy(unlist(indexBprobes), abatch = barley.object) barley.names11xy <- apply(cbind(rownames(pm.i.xy),pm.i.xy),1,paste,collapse = "--") #save(barley.names11xy,file = "probeNames.RData",compress=T) #extract perfect match values to a matrix barleyRNA.pm <- log(pm(barley.object,probeset11),2) # log base2 for fold change in SAM rownames(barleyRNA.pm) <- barley.names11xy barleyRNA.pm <- barleyRNA.pm[,order(pheno[,1],pheno[,2])] pheno <- pheno[order(pheno[,1],pheno[,2]),] #save(barleyRNA.pm,file= "barleyRNApm.RData", compress =T) jpeg(file = "barleyRNAcorimage.jpg") par(mar=c(5, 5, 4, 2)) image(1:36,1:36,cor(barleyRNA.pm),xaxt = "n", yaxt = "n", xlab = "", ylab = "", col= heat.colors(256)) # possible missclassification #0404-29(ROO_GP_B_29).CEL may really be 0404-29(ROO_MX_B_29).CEL chip.names <- apply(pheno,1,paste,collapse="") for (i in 1:5){ mtext(substring(chip.names, first = i, last = i), side = 1, at = 1:36, line = i-1, cex = 0.75) mtext(substring(chip.names, first = i, last = i), side = 2, at = 1:36, line = 5-i, cex = 0.75) } abline(v=31 +c(-0.5,0.5)) abline(h=31 +c(-0.5,0.5)) dev.off() #have a look without LEAves which are quite different jpeg(file = "barleyRNAcorimage-LEA.jpg") par(mar=c(5, 5, 4, 2)) image(1:30,1:30,cor(barleyRNA.pm[,-(19:24)]),xaxt = "n", yaxt = "n", xlab = "", ylab = "", col= heat.colors(256)) # possible missclassification #0404-29(ROO_GP_B_29).CEL may really be 0404-29(ROO_MX_B_29).CEL chip.names <- apply(pheno[-(19:24),],1,paste,collapse="") for (i in 1:5){ mtext(substring(chip.names, first = i, last = i), side = 1, at = 1:30, line = i-1, cex = 0.75) mtext(substring(chip.names, first = i, last = i), side = 2, at = 1:30, line = 5-i, cex = 0.75) } abline(h=25 +c(-0.5,0.5)) abline(v=25 +c(-0.5,0.5)) dev.off() ## format to account for gene,tissue,genotype effects barley.pm <- matrix(t(barleyRNA.pm), nrow = 36 * 11, ncol = 22801) colnames(barley.pm) <- probeset11 barley.pm <- as.data.frame(barley.pm) barley.pm$probe <- factor(rep(paste("p",1:11,sep=""),each = 36)) barley.pm$tissue = factor(rep(pheno[,1], 11)) barley.pm$genotype = factor(rep(pheno[,2], 11) ) save(barley.pm,file= "barleyRNApm.RData", compress =T) ## set up full model as a test i <- 1 #now record and remove the main effect of gene,tissue,genotype save residuals res.mat <- matrix(nrow = 396, ncol = 22801) coef.array <- array(dim = c(11,4,22801)) t1 <- proc.time()[3] for (i in 1:22801){ # fit everything except the SFP probe:genotype interaction fit <- summary(lm(barley.pm[,i] ~ probe + tissue + genotype + tissue:genotype, data = barley.pm)) res.mat[,i] <- fit$residuals coef.array[,,i] <- fit$coefficients[-(1:11),] #drop the fixed probe effects } t2 <- proc.time()[3] (t2-t1)/60 ## 127 minutes for naturalj2 res.mat <- t(matrix(res.mat,nr=36)) colnames(res.mat) <- pheno[,2] rownames(res.mat) <- barley.names11xy tisP <- t(coef.array[,4,]) dimnames(tisP) <- list(names(barley.pm)[1:22801], dimnames(fit$coefficients)[[1]][12:22]) tisF <- t(coef.array[,1,]) dimnames(tisF) <- list(names(barley.pm)[1:22801], dimnames(fit$coefficients)[[1]][12:22]) write.table(tisP,file = "TissueGenoExpressionPval.csv", sep = ",") write.table(tisF,file = "TissueGenoExpressionFold.csv", sep = ",") save(res.mat,file = "barley.residuals.RData", compress = T) save(coef.array,file = "barley.coefarray.RData", compress = T) load("barley.residuals.RData") library(siggenes) # load SAM package cl <- colnames(res.mat) # model matrix system.time(sam.out <- sam(res.mat,cl,rand = 123,B=500, p0 = .95))# 500 permutations to get a good estimate of 1% or lower FDR # test 6 arrays for comparision with gDNA power #system.time(sam.out <- sam(res.mat[,1:6],cl[1:6],rand = 123,B=100, p0 = .95)) #system.time(sam.out <- sam(res.mat[,seq(1,36,4)],cl[seq(1,36,4)],rand = 123,B=100, p0 = .95)) # 302.1 seconds for 100 on naturalj2 summary(sam.out,1:5/2) #SAM Analysis for the Two-Class Unpaired Case Assuming Unequal Variances # s0 = 0.0342 (The 5 % quantile of the s values.) # Number of permutations: 500 # MEAN number of falsely called genes is computed. # Delta p0 False Called FDR cutlow cutup j2 j1 #1 0.5 0.95 5883.776 27159 0.20581 -1.60497 1.56704 12837 236490 #2 1.0 0.95 594.278 17744 0.03182 -2.23165 2.21335 8635 241703 #3 1.5 0.95 64.574 13285 0.00462 -2.81474 2.80484 6562 244089 #4 2.0 0.95 6.998 10504 0.00063 -3.37739 3.38004 5301 245609 #5 2.5 0.95 0.462 8583 0.00005 -3.92994 3.94168 4397 246626 jpeg("barleyRNASAMplot.jpg") plot(sam.out,1:20/10) dev.off() #jpeg("barleySAMRNAplotDelta2.jpg") pdf("barleySAMRNAplotDelta2.pdf", width = 11, height = 6,) par(mfrow = c(1,2)) plot(sam.out,2) mtext("A",adj=0,line = 2) hist(pmin(pmax(sam.out@d,-10),10),breaks = 100, main = "Histogram of d-statistics",xlab = "d-statistics") abline(v = c(-3.377,3.38),lty = 2) mtext("B",adj=0,line = 2) dev.off() rnasfp <- summary(sam.out,2,ll=F,file = "barleyRNAsfp.csv",sep = ",",quote=F) #19 header rows save(sam.out, file = "barleyRNAsam.out.RData",compress = T) sfpName <- matrix(unlist(strsplit(rownames(rnasfp$mat.sig),split = "--")),nr=3)[1,] # split off the --xpos--ypos names sfpProbeSetName <- matrix(unlist(strsplit(sfpName,split = "_at")),nr=2)[1,] # split off _at probe number plot(table(table(sfpProbeSetName))) barplot(table(table(sfpProbeSetName))) table(table(sfpProbeSetName)) # # 1 2 3 4 5 6 7 8 9 10 11 #1951 563 262 167 157 123 114 136 108 91 62 # length(table(sfpProbeSetName)) #[1] 3734 featSFP <- rownames(rnasfp$mat.sig) setwd("c:/barley/CELrna") rnaSFP <- read.csv("barleyRNAsfp.csv",skip=19,as.is=T) summary(rnaSFP) sfpName <- matrix(unlist(strsplit(as.character(rnaSFP$Name),split = "--")),nr=3)[1,] # split off the --xpos--ypos names sfpProbeSetName <- matrix(unlist(strsplit(sfpName,split = "_at")),nr=2)[1,] # split off _at probe number jpeg("SFPsProbeSet.jpg") barplot(table(table(sfpProbeSetName)),xlab = "SFPs per ProbeSet", ylab = "Number of SFPs") dev.off() # could probesets with 11 SFPs be half up and half down? otherwise gene experssion, check dstat sign, also large up or large down, could use robust measure sfpSet11 <- names(which(table(sfpProbeSetName)==11)) sfp11tab <- rnaSFP[sfpProbeSetName %in% sfpSet11,] sfp11tab <- sfp11tab[order(as.character(sfp11tab$Name)),] write.table(sfp11tab,file = "sfp11.csv",sep=",",row.names=F) ## try Efron EBAM to estimat a0 in denoinator #find.out <- find.a0(res.mat, as.numeric(factor(cl)), rand = 123) #not enough memory #ebam.out <- ebam(res.mat, gene.names = barley.names11xy) ## compare gDNA and RNA sfps, don't expect too many since different genotypes and not much power in gDNA ## still better than random. library(siggenes) setwd("c:/barley/CELrna") load("barleyRNAsam.out.RData") # reorder from Linux rna.qval <- sam.out@q.value[order(names(sam.out@q.value))] load("../CEL/barleygDNAsam.out.RData") gDNA.qval <- sam.out@q.value[order(names(sam.out@q.value))] table(names(gDNA.qval) == names(rna.qval)) #TRUE #test some thresholds res.mat <- NULL for (i in 10^(-1:-4)) for (j in 1:4/10) res.mat <- c(res.mat,summary(table(gDNA.qval < j,rna.qval < i))$statistic) print(matrix(res.mat,nc=4,dimnames=list(paste("gDNA",1:4/10),paste("rna",10^(-1:-4))))) # rna 0.1 rna 0.01 rna 0.001 rna 1e-04 #gDNA 0.1 39.11611 43.94448 57.57982 43.01975 #gDNA 0.2 95.91104 77.72186 95.60787 85.60262 #gDNA 0.3 107.67837 108.97178 135.67282 123.89319 #gDNA 0.4 133.15521 125.33030 133.02444 115.59271 j <- .3 i <- 0.001 summary(table(gDNA.qval < j,rna.qval < i)) #Number of cases in table: 250811 #Number of factors: 2 #Test for independence of all factors: # Chisq = 135.67, df = 1, p-value = 2.353e-31 table(gDNA.qval < j,rna.qval < i) # FALSE TRUE # FALSE 238329 10847 # TRUE 1467 168 write(names(gDNA.qval)[which(gDNA.qval < j & rna.qval < i)],file = "RNAgDNAsfp.txt") # go with original thresholds setwd("c:/barley/CEL") gDNAsfp <- read.csv("barleygDNAsfp.csv",skip=19,as.is=T) summary(gDNAsfp) allProbe <- names(sam.out@q.value) rSFP <- allProbe %in% rnaSFP$Name dSFP <- allProbe %in% gDNAsfp$Name table(rSFP,dSFP) # dSFP #rSFP FALSE TRUE # FALSE 239331 976 # TRUE 10390 114 summary(table(rSFP,dSFP)) #Number of cases in table: 250811 #Number of factors: 2 #Test for independence of all factors: # Chisq = 107.28, df = 1, p-value = 3.863e-25 write(allProbe[rSFP & dSFP],file = "RNAgDNAsfp.txt") # overlap with sequence data setwd("c:/barley/CELrna") allSNP <- read.csv("../SNPtable.csv") allSNP$SNPscore <- as.character(allSNP$SNPscore) allSNP$SNPscore[grep("\\+",allSNP$SNPscore)] <- 15 # multiple is slot 15 allSNP$snpPos <- as.numeric(allSNP$SNPscore) table(allSNP$snpPos) table(table(allSNP$Probe)) # 1 2 3 #2636 30 1 allSNP <- allSNP[order(allSNP$Probe),] temp <- which(duplicated(allSNP$Probe)) # drop multiples to allSNP[sort(c(temp,temp-1)),] allSNP <- allSNP[-temp,] # drop GP,MX both have changes from probe allSNP <- allSNP[allSNP$SNPgenotype != "GP,MX",] summary(allSNP) save(allSNP,file = "allSNP.RData",compress=T) load("barleyRNAsam.out.RData") # reorder from Linux library(siggenes) sfpName <- matrix(unlist(strsplit(names(sam.out@d),split = "--")),nr=3)[1,] # split off the --xpos--ypos names probeSNP <- match(as.character(allSNP$Probe),sfpName) allSNP[x <- sample(length(probeSNP),10),] sam.out@d[probeSNP][x] allSNP$samD <- sam.out@d[probeSNP] allSNP$samQ <- sam.out@q.value[probeSNP] allSNPsave <- allSNP for (i in c("EST","Random","targeted")){ cat(i) allSNP <- allSNPsave[allSNPsave$dataset== i,] attach(allSNP) # test a few thresholds for (k in seq(1,5,.5)){ # range of chisq agrees with FDR go with 3.2 dstat, 2 delta, 0.001 qvalue 3.5e-5 pvalue print(k) y <- table(cut(x <- samD,c(max(x),-k,k,min(x))),as.character(SNPgenotype)) print(y[,c(1,3,2)]) print(summary(y)) } } allSNP <- allSNPsave attach(allSNP) #mp <- grep("_x|_s",as.character(allSNP$Probe)) # repete probes are working fine table(as.character(SNPgenotype))[c(1,3,2)] # more GP polymorphisms because MX is mostly the design? GP no poly MX 223 2200 178 # with original threshold table( cut(sam.out@d, c(-1000, -3.37739, 3.3800 ,1000)) ) # cutpoint 3.38 is just below 3.38004 to catch flanking SFP #GP is positive scores as ref is greater stat #(-1e+03,-3.38] (-3.38,3.38] (3.38,1e+03] # 5301 240307 5203 geno <- cut(samD, c(-1000, -3.37739, 3.3800 ,1000), labels = c("mxSFP","nonSFP","gpSFP") ) y <- table(as.character(SNPgenotype), geno) print(summary(y)) #Number of cases in table: 2601 #Number of factors: 2 #Test for independence of all factors: # Chisq = 2049.2, df = 4, p-value = 0 y <- y[c(2,3,1),] print(y) # geno # mxSFP nonSFP gpSFP # MX 115 45 18 # no poly 27 2045 128 # GP 7 61 155 cat("Sensitivity", sum(y[1,1]+y[3,3])/sum(y[c(1,3),]), "\n" ) #Sensitivity 0.6733167 cat("False sequence polymorphism Rate", sum(y[2:3,1]+y[1:2,3])/sum(y[,c(1,3)]), "\n") #False sequence polymorphism Rate 0.4 # % variance explained summary(lm(samD~SNPgenotype)) # 38% # absolute value but contains misscalls thresh <- quantile(sam.out@q.value, 10504/length(sam.out@q.value) ) table(sam.out@q.value < thresh) # FALSE TRUE #240307 10504 table(SNPgenotype!="no poly", samQ < thresh ) # FALSE TRUE # FALSE 2045 155 # TRUE 106 295 summary(table(SNPgenotype!="no poly", samQ < thresh )) #Number of cases in table: 2601 #Number of factors: 2 #Test for independence of all factors: # Chisq = 1049, df = 1, p-value = 4.055e-230 offGP <- which(geno == "mxSFP" & SNPgenotype == "GP") offMX <- which(geno == "gpSFP" & SNPgenotype == "MX") y <- table(allSNP$snpPos[-c(offGP,offMX)], geno[-c(offGP,offMX)]) y # mxSFP nonSFP gpSFP # 1 2 10 1 # 2 1 10 1 # 3 2 13 4 # 4 8 8 5 # 5 6 7 11 # 6 14 5 18 # 7 10 8 14 # 8 5 1 15 # 9 5 10 11 # 10 12 7 10 # 11 4 6 18 # 12 9 4 20 # 13 8 2 9 # 15 15 5 9 # getS3method("plot","table") rownames(y)[14] <- "Multiple" pdf("SNPpositionSFPcall.pdf") mosaicplot(y, # expect to see middle group shrink as SFP calling improves above 4 main= "SNP position vs SFP call", ylab = "SFP call", xlab = "SNP position") cex <- 0.6 mtext(y[,1],side = 1,line = -1,at= cumsum(rowSums(y)/sum(y+1-cex)),cex=cex) mtext(y[,2],side = 1,line = -11,at= cumsum(rowSums(y)/sum(y+1-cex)),cex=cex) mtext(y[,3],side = 1,line = -20,at= cumsum(rowSums(y)/sum(y+1-cex)),cex=cex) barplot(t(y)) dev.off() # plot some raw data .pdf's to look at SFPs setwd("c:/barley/CELrna") library(barley1probe) data(barley1probe) load("C:/barley/CELrna/barleyRNApm.RData") sfpSet <- paste(names(table(sfpProbeSetName)),"at",sep="_") tis <- as.numeric(barley.pm$tissue) gen <- as.numeric(barley.pm$genotype) prob <- as.numeric(barley.pm$probe) pdf(file = "barleyRNAbpSFPset.pdf") par(mfrow = c(2,3)) for (i in 1:100){# first so many genes with SFPs #i <- 13 probeSet <- sfpSet[i] # show bp? xpos <- barley1probe$Probe.Interrogation.Position[barley1probe$Probe.Set.Name == probeSet] #xpos <- 1:11 gene.plot <- barley.pm[,probeSet] fit <- lm(gene.plot ~ probe + tissue + genotype + tissue:genotype # + probe:genotype # leave this off for the main effect fits , data = barley.pm) probe.effect <- c(0,fit$coefficients[2:11])[prob] tissue.effect <- c(0,fit$coefficients[12:16])[tis] geno.effect <- c(0,fit$coefficients[17])[gen] tissue.geno <- c(0,fit$coefficients[18:22])[tis] tissue.geno[gen == 1] <- 0 # coefs only effect mx genotype # raw data matplot(xpos, t(matrix(gene.plot,nr = 36)), type = "l", col = gen, lty = tis, xlab = probeSet, ylab = "log intensity", main = "raw data") # remove probe matplot(xpos, t(matrix(gene.plot - probe.effect,nr = 36)), type = "l", col = gen, lty = tis, xlab = probeSet, ylab = "log intensity", main = "- probe") # remove probe and tissue matplot(xpos, t(matrix(gene.plot - probe.effect - tissue.effect,nr = 36)), type = "l", col = gen, lty = tis, xlab = probeSet, ylab = "log intensity", main = "-probe - tissue") # remove probe and genotype matplot(xpos, t(matrix(gene.plot - probe.effect - geno.effect,nr = 36)), type = "l", col = gen, lty = tis, main = "- probe - geno", xlab = probeSet, ylab = "log intensity") # remove probe and genotype and tissue matplot(xpos, t(matrix(gene.plot - probe.effect - geno.effect - tissue.effect,nr = 36)), type = "l", col = gen, lty = tis, main = "- probe - geno - tissue", xlab = probeSet, ylab = "log intensity") # remove probe and genotype and tissue and tissue*geno matplot(xpos,t(matrix(gene.plot - probe.effect - geno.effect - tissue.effect - tissue.geno, nr = 36)), type = "l", col = gen, lty = tis, main = "- probe - geno - tissue - geno*tissue", xlab = probeSet, ylab = "log intensity",cex.main = 0.6 ) cat(probeSet,"\n") }# close loop for genes dev.off() # close .pdf ####### test certain tissues as a power question # with 1 tissue there can be general genotype gene expression differences and probe level differences SFPs setwd("c:/barley/CELrna") load("barleyRNApm.RData") table(barley.pm$genotype,barley.pm$tissue) # COL CRO GEM LEA RAD ROO # GP 33 33 33 33 33 22 # MX 33 33 33 33 33 44 #arrays*11 probes tis <- names(table(barley.pm$tissue)) load("C:/barley/allSNP.RData") sfpName <- paste(rep(names(barley.pm)[1:22801],each=11),1:11,sep="") #load("C:/barley/CELrna/probeNames.RData") #sfpName <- matrix(unlist(strsplit(barley.names11xy,split = "--")),nr=3)[1,] # split off the --xpos--ypos names probeSNP <- match(as.character(allSNP$Probe),sfpName) cbind(allSNP[1000:1010,],sfpName[probeSNP][1000:1010]) #check naming match fdr.tab <- matrix(nr=3,ncol = length(tis)) rownames(fdr.tab) <- c("Sensitivity","False sequence polymorphism rate", "%var") colnames(fdr.tab) <- tis library(siggenes) # load SAM package for (i in 1:6){ barley.pmT <- barley.pm[barley.pm$tissue == tis[i],] ## set up full model as a test #now record and remove the main effect of gene,tissue,genotype save residuals res.mat <- lsfit(as.numeric(barley.pmT$genotype),barley.pmT[,1:22801])$residuals res.mat <- t(matrix(res.mat,nr=6)) colnames(res.mat) <- barley.pmT$genotype[1:6] print(colnames(res.mat)) cl <- colnames(res.mat) # model matrix system.time(sam.out <- sam(res.mat,cl,rand = 123,B=500, p0 = .95))# can only do 20 perms #compare with SNP table allSNP$samD <- sam.out@d[probeSNP] allSNP$samQ <- sam.out@q.value[probeSNP] attach(allSNP) # choose same thresholds as for 36 arrays cuts <- quantile(sam.out@d,c(0, 5301, length(sam.out@d)-5204, length(sam.out@d))/length(sam.out@d)) table(cut(sam.out@d,cuts,right=F)) geno <- cut(samD, cuts, labels = c("mxSFP","nonSFP","gpSFP"),right=F ) y <- table(as.character(SNPgenotype), geno) print(tis[i]) print(summary(y)) y <- y[c(2,3,1),] print(y) fdr.tab[1,i] <- sum(y[1,1]+y[3,3])/sum(y[c(1,3),]) fdr.tab[2,i] <- sum(y[2:3,1]+y[1:2,3])/sum(y[,c(1,3)]) fdr.tab[3,i] <- summary(lm(samD~SNPgenotype))$r.squared }# close tis loop write.table(fdr.tab,file = "fdr.tissue.csv", sep = ",") ## use same threshold as with 36 arrays thresh <- quantile(sam.out@q.value, 10504/length(sam.out@q.value) ) table(sam.out@q.value < thresh) table(SNPgenotype!="no poly", samQ < thresh ) # FALSE TRUE # FALSE 2029 75 # TRUE 148 177 summary(table(SNPgenotype!="no poly", samQ < thresh )) #Number of cases in table: 2429 #Number of factors: 2 #Test for independence of all factors: # Chisq = 784.3, df = 1, p-value = 1.401e-172 summary(lm(SNPgenotype!="no poly" ~ samQ < thresh )) #33% r2 summary(lm(log(samQ) ~ SNPgenotype!="no poly"))