########################################################## ### Example code for NB permutation test ### P-values of 10,000 SNPs are assumed to follow uniform distribution ### sample size is 5,000 ########################################################## library(mvtnorm) set.seed(123) ### marker generation with mafs from a uniform distribution for insignificant markers mat1<- matrix(NA, nrow=5000, ncol=10000) for(i in 1:10000) { mat1[,i]<-rbinom(n=5000,size=2,prob=runif(1,min=0.05, max=0.50)) } y1<-rnorm(5000,mean=0,sd=10) res1<-matrix(NA,nrow=10000, ncol=4) #### linear model results matrix colnames(res1)<-c("estimate","se","t-value","p-value") for(i in 1:10000) { res1[i,]<- summary(lm(y1~mat1[,i]))$coef[2,] } write.csv(cbind(y1,mat1),"data_simulated.csv",row.names=F) write.csv(res1,"res_ori_stat.csv",row.names=F) ########################################### ### ENPP approach to divide SNPs into large p-value group and small p-value group ### p_adj and p_prun are set to be 0.001 as in the article. ########################################### p1adj = 0.001 ##### Bonferroni type significance level p1prun <- p1adj coun1<-c(rep(NA,10^4)) ### permutation round is assumed to be 10,000 for pruning process for(i in 1:(10^4)) { p1val<- as.numeric(as.character(1-pbinom(0:min(i-1,30) ,i,p1adj))) coun1[i]<-which(signif(p1val,10)<=signif(p1prun,10))[1] } table(coun1) #### coun1 is cprun for each permutation round ########################################### data1<-read.csv("data_simulated.csv",header=T) res1<-read.csv("res_ori_stat.csv",header=T) y1<-data1[,1] mat1<-data1[,-1] res1enpp<-matrix(NA,nrow=10000,ncol=5) colnames(res1enpp)<-c("id","success","total","ori_stat","ori_p") set.seed(123) for(i in 1:10000) { s1<-0 for(j in 1:10000) { y11<-sample(y1) lm_res<-summary(lm(y11~mat1[,i]))$coef[2,3] if(abs(lm_res)>=abs(res1[i,3])){s1<-s1+1} if(s1>=coun1[j]){break} } res1enpp[i,]<-c(i,s1,j,res1[i,3],res1[i,4]) if(i%%20==0){print(i)} } ############################################ ### large p-value group estimation (with number of success 120) ############################################ loca1<-which(res1enpp[,3]==10000) ### small p-value group loca2<-setdiff(c(1:10000),loca1) ### large p-value group mat1large<-matrix(NA,nrow=10000, ncol=3) mat1large[,1]<-c(1:10000) set.seed(123) for(i in loca2) { s1<-0 ### number of success for(j in 1:10^8) { y11<-sample(y1) res1large<-summary(lm(y11~mat1[,i]))$coef[2,3] if(abs(res1large)>=abs(res1enpp[i,4])){s1<-s1+1} if(s1>=120){break} if(j%%2000==0){print(j)} } mat1large[i,2:3]<-as.matrix(c(120, j)) print(c("i",i,"s1",s1,"j",j)) } write.csv(mat1large,"res_large_p_group_example.csv",row.names=F) ############################################# ### data splitting for small p-value group estimation (K=3) ############################################# loca1<-which(res1enpp[,3]==10000) ### small p-value group leng1<-length(loca1) mat1loca<-matrix(NA,nrow=leng1,ncol=5000) mat1cv<-matrix(NA,nrow=leng1,ncol=3) set.seed(123) for(i in 1:leng1) { cv1<-rep(NA,1000) mat1split<-matrix(NA,nrow=1000,ncol=5000) res1cv<-matrix(NA,nrow=1000,ncol=3) for(j in 1:1000) { sam1<-sample(1:5000); sam11<-sam1[1:1666]; sam12<-sam1[1667:3333]; sam13<-sam1[3334:5000] y11<-y1[sam11]; y12<-y1[sam12]; y13<-y1[sam13] mat11<-mat1[sam11,loca1[i]]; mat12<-mat1[sam12,loca1[i]]; mat13<-mat1[sam13,loca1[i]]; res1s<-summary(lm(y11~mat11))$coef[2,3] res2s<-summary(lm(y12~mat12))$coef[2,3] res3s<-summary(lm(y13~mat13))$coef[2,3] cv1[j]<-sd(c(res1s,res2s,res3s))/abs(mean(c(res1s,res2s,res3s))) mat1split[j,]<-sam1 res1cv[j,]<-as.matrix(c(res1s,res2s,res3s)) } loca1cv<-which.min(cv1) mat1cv[i,]<-res1cv[loca1cv,] mat1loca[i,]<-mat1split[loca1cv,] print(i) } write.csv(mat1loca,"mat1loca.csv",row.names=F) write.csv(mat1cv,"mat1cv.csv",row.names=F) ############################################# ### small p-value group estimation (k=3 and number of success 330) ############################################# mat1loca<-read.csv("mat1loca.csv",header=T) mat1cv<-read.csv("mat1cv.csv",header=T) mat1small<-matrix(NA,nrow=22,ncol=9) set.seed(123) for(i in 1:leng1) { sam1<-as.numeric(mat1loca[i,1:1666]); sam2<-as.numeric(mat1loca[i,1667:3333]); sam3<-as.numeric(mat1loca[i,3334:5000]) y11<-y1[sam1]; y12<-y1[sam2]; y13<-y1[sam3]; mat1s<-mat1[sam1,loca1[i]]; mat2s<-mat1[sam2,loca1[i]]; mat3s<-mat1[sam3,loca1[i]]; s11<-0; s12<-0; s21<-0; s22<-0; s31<-0; s32<-0; for(j in 1:10^8) { res1small<-summary(lm(sample(y11)~mat1s))$coef[2,3] if(res1small>=abs(mat1cv[i,1])){s11<-s11+1} if(res1small<=-abs(mat1cv[i,1])){s12<-s12+1} if(s11+s12>=330){break} } mat1small[i,1:3]<-as.matrix(c(s11,s12,j)) print(c("s1 finished","s11",s11,"s12",s12,"j",j)) for(j in 1:10^8) { res2small<-summary(lm(sample(y12)~mat2s))$coef[2,3] if(res2small>=abs(mat1cv[i,2])){s21<-s21+1} if(res2small<=-abs(mat1cv[i,2])){s22<-s22+1} if(s21+s22>=330){break} } mat1small[i,4:6]<-as.matrix(c(s21,s22,j)) print(c("s2 finished","s21",s21,"s22",s22,"j",j)) for(j in 1:10^8) { res3small<-summary(lm(sample(y13)~mat3s))$coef[2,3] if(res3small>=abs(mat1cv[i,3])){s31<-s31+1} if(res3small<=-abs(mat1cv[i,3])){s32<-s32+1} if(s31+s32>=330){break} } mat1small[i,7:9]<-as.matrix(c(s31,s32,j)) print(c("s3 finished","s31",s31,"s32",s32,"j",j)) print(i) } write.csv(mat1small,"res_small_p_group_example.csv",row.names=F) ############################################# ### data summarization for final results ############################################# result1large<-read.csv("res_large_p_group_example.csv",header=T) result1small<-read.csv("res_small_p_group_example.csv",header=T) result1total<-matrix(NA,nrow=10000,ncol=2); result1total[,1]<-c(1:10000) result1total[-loca1,2]<-(result1large[-loca1,2]-1)/(result1large[-loca1,3]-1) ### -1 can be omitted p11<-(result1small[,1]-1)/(result1small[,3]-1); p12<-(result1small[,2]-1)/(result1small[,3]-1) p21<-(result1small[,4]-1)/(result1small[,6]-1); p22<-(result1small[,5]-1)/(result1small[,6]-1) p31<-(result1small[,7]-1)/(result1small[,9]-1); p32<-(result1small[,8]-1)/(result1small[,9]-1) p1tot<-1-pnorm((qnorm(1-p11)+qnorm(1-p21)+qnorm(1-p31))/sqrt(3)) p2tot<-1-pnorm((qnorm(1-p12)+qnorm(1-p22)+qnorm(1-p32))/sqrt(3)) result1total[loca1,2]<-p1tot+p2tot write.csv(result1total,"result_p_total.csv",row.names=F) ######################