################################# ################################# ### R script for application of ENPP ################################# ### example data set generation ### 1000 markers (assumed to be SNP markers) in total : 10 markers are significant, 990 markers are not significant ### 5000 individuals ### 10,000 permutation rounds for pruning step, and 39,999 permutations for unpruned markers ### total number of permutations for unpruned markers is 50,000(=10,000+39,999+1(=original data analysis)) ################################# ################################# library(mvtnorm) set.seed(123) sigma1<- matrix(0.2, nrow=11, ncol=11); diag(sigma1)<- 1 ; sigma1<-sigma1*10 sig1<- rmvnorm(5000, mean=rep(100,11), sigma=sigma1) y1<-sig1[,1] #### response variable mat1<- matrix(NA, nrow=5000, ncol=10) ### marker generation with maf=0.1 and Hardy-Weinberg principle for significant markers for(i in 1:10) { rank1<-rank(sig1[,i+1]) mat1[which(rank1<=4050),i]<-0 mat1[which(rank1 > 4050 & rank1 <= 4950),i]<-1 mat1[which(rank1 >= 4950),i]<-2 } ### marker generation with mafs from a uniform distribution for insignificant markers mat2<- matrix(NA, nrow=5000, ncol=990) for(i in 1:990) { mat2[,i]<-rbinom(n=5000,size=2,prob=runif(1,min=0.05, max=0.50)) } mat_tot<-cbind(mat1,mat2) #### explanatory variables in total res1<-matrix(NA,nrow=1000, ncol=4) #### linear model results matrix colnames(res1)<-c("estimate","se","t-value","p-value") for(i in 1:1000) { res1[i,]<- summary(lm(y1~mat_tot[,i]))$coef[2,] } res1[1:20,] ### first 20 results of 1,000 markers with parametric approaches ############################# ############################# ### setting p-adjust, p-prun, and c-prun values, introduced in the article. ############################# ############################# n1=10^3 p1 = 0.05 p1adj = p1/n1 ##### Bonferroni type significance level p1prun <- p1adj coun1<-c(rep(NA,10^4)) ### permutation round is assumed to be 10,000 for pruning process for(i in 1:(10^4)) { p1val<- as.numeric(as.character(1-pbinom(0:min(i-1,20) ,i,p1adj))) coun1[i]<-which(signif(p1val,10)<=signif(p1prun,10))[1] } table(coun1) #### coun1 is cprun for each permutation round ############################# ############################# ### pruning step with 10,000 permutation rounds (assume that we are using only one thread) ############################# ############################# set.seed(123) loca1<-c(1:1000) res2<-matrix(NA,nrow=1000,ncol=10^4) stat1<-abs(res1[,3]) sum1<-0 for(i in 1:10000) { y1n<-sample(y1) res3<-matrix(NA, nrow=1000,ncol=4) for(j in loca1) { res3[j,] <- summary(lm(y1n~mat_tot[,j]))$coef[2,] } print(c("i",i,"regression done")) loca2<-which(stat1<=abs(res3[,3])) ### for case of having larger statistics than the original statistics. loca3<-setdiff(loca1,loca2) ### for case of having smaller statistics than the original statistics. res2[loca2,i]<-1 res2[loca3,i]<-0 sum1<-sum1+res2[,i] loca4<-which(sum1>=coun1[i]) loca1<-setdiff(loca1,loca4) print(c("i",i,"number of survived",length(loca1))) } length(loca1) ; loca1 #### SNPs survived from the approach mat_tot2<-mat_tot[,loca1] #### dataset of survived markers (unpruned) ############################# ############################# ### final evaluation step (39,999(=50,000-10,001) permutaion is additionally applied for each unpruned marker, to obtain final precise p-value) ### parallel computing (multicore approach) can be applied ############################# ############################# set.seed(1234) res4<-matrix(NA, nrow=39999,ncol=length(loca1)) for(i in 1:length(loca1)) { for(j in 1: 39999) { y1n<-sample(y1) res4[j,i]<- abs(summary(lm(y1n~mat_tot2[,i]))$coef[2,3]) if(j%%1000==0){print(c("i",i,"j",j))} } } res5<-matrix(NA,nrow=1000,ncol=2) ### results of pruning step res6<-matrix(NA,nrow=length(loca1),ncol=2) ### results of post-pruning step with 10 unpruned markers for(i in 1:1000) { res5[i,1]<- sum(!is.na(res2[i,])) res5[i,2]<- sum(res2[i,],na.rm=T) } for(i in loca1) { res6[i,1]<- 39999 res6[i,2]<- sum(stat1[loca1[i]]<=res4[,i]) } res7<-res5 #### combining permutation results of pruning step and post pruning step res7[loca1,]<-res5[loca1,]+res6 ; res7<- res7+1 res_fin<- cbind(res1,res7,res7[,2]/res7[,1]) ### final results colnames(res_fin)<- c("ori_estimate","ori_se","ori_t-value","ori_p-value","permut_times","exceeding_times","permut_p-value") ################################################################