##########################################################
### Example code for NB permutation test 
### P-values of 10,000 SNPs are assumed to follow uniform distribution 
### sample size is 5,000 
##########################################################

library(mvtnorm)
set.seed(123)

### marker generation with mafs from a uniform distribution for insignificant markers 

mat1<- matrix(NA, nrow=5000, ncol=10000)
for(i in 1:10000)
 { 
   mat1[,i]<-rbinom(n=5000,size=2,prob=runif(1,min=0.05, max=0.50))
 }
y1<-rnorm(5000,mean=0,sd=10)

res1<-matrix(NA,nrow=10000, ncol=4)  #### linear model results matrix 
colnames(res1)<-c("estimate","se","t-value","p-value") 

for(i in 1:10000)
{ 
 res1[i,]<- summary(lm(y1~mat1[,i]))$coef[2,] 
} 

write.csv(cbind(y1,mat1),"data_simulated.csv",row.names=F)
write.csv(res1,"res_ori_stat.csv",row.names=F)


###########################################
### ENPP approach to divide SNPs into large p-value group and small p-value group 
### p_adj and p_prun are set to be 0.001 as in the article. 
###########################################

p1adj = 0.001   ##### Bonferroni type significance level 
p1prun <- p1adj 
coun1<-c(rep(NA,10^4))  ### permutation round is assumed to be 10,000 for pruning process 

for(i in 1:(10^4))
{ 
  p1val<- as.numeric(as.character(1-pbinom(0:min(i-1,30) ,i,p1adj)))
  coun1[i]<-which(signif(p1val,10)<=signif(p1prun,10))[1]
}

table(coun1)  #### coun1 is cprun for each permutation round 

###########################################

data1<-read.csv("data_simulated.csv",header=T)
res1<-read.csv("res_ori_stat.csv",header=T)
y1<-data1[,1]
mat1<-data1[,-1] 

res1enpp<-matrix(NA,nrow=10000,ncol=5)
colnames(res1enpp)<-c("id","success","total","ori_stat","ori_p")

set.seed(123)
for(i in 1:10000)
{ 
  s1<-0 
  for(j in 1:10000)
  {
    y11<-sample(y1)
    lm_res<-summary(lm(y11~mat1[,i]))$coef[2,3]
    if(abs(lm_res)>=abs(res1[i,3])){s1<-s1+1} 
    if(s1>=coun1[j]){break}
  }
  res1enpp[i,]<-c(i,s1,j,res1[i,3],res1[i,4])
  if(i%%20==0){print(i)}
}

############################################
### large p-value group estimation (with number of success 120) 
############################################

loca1<-which(res1enpp[,3]==10000)   ### small p-value group 
loca2<-setdiff(c(1:10000),loca1)         ### large p-value group 

mat1large<-matrix(NA,nrow=10000, ncol=3)
mat1large[,1]<-c(1:10000)
set.seed(123) 
for(i in loca2)
{ 
  s1<-0                                        ### number of success 
  for(j in 1:10^8)
  { 
    y11<-sample(y1)
    res1large<-summary(lm(y11~mat1[,i]))$coef[2,3]
    if(abs(res1large)>=abs(res1enpp[i,4])){s1<-s1+1} 
    if(s1>=120){break}
    if(j%%2000==0){print(j)}
   }
  mat1large[i,2:3]<-as.matrix(c(120, j)) 
  print(c("i",i,"s1",s1,"j",j))
}

write.csv(mat1large,"res_large_p_group_example.csv",row.names=F)

#############################################
### data splitting for small p-value group estimation (K=3) 
#############################################

loca1<-which(res1enpp[,3]==10000)   ### small p-value group 
leng1<-length(loca1)
mat1loca<-matrix(NA,nrow=leng1,ncol=5000)
mat1cv<-matrix(NA,nrow=leng1,ncol=3)

set.seed(123)
for(i in 1:leng1)
{
  cv1<-rep(NA,1000)
  mat1split<-matrix(NA,nrow=1000,ncol=5000)
  res1cv<-matrix(NA,nrow=1000,ncol=3)
  for(j in 1:1000)
  {   
    sam1<-sample(1:5000); sam11<-sam1[1:1666]; sam12<-sam1[1667:3333]; sam13<-sam1[3334:5000]
    y11<-y1[sam11]; y12<-y1[sam12]; y13<-y1[sam13]
    mat11<-mat1[sam11,loca1[i]]; mat12<-mat1[sam12,loca1[i]]; mat13<-mat1[sam13,loca1[i]]; 
    res1s<-summary(lm(y11~mat11))$coef[2,3]
    res2s<-summary(lm(y12~mat12))$coef[2,3]
    res3s<-summary(lm(y13~mat13))$coef[2,3]
    cv1[j]<-sd(c(res1s,res2s,res3s))/abs(mean(c(res1s,res2s,res3s)))
    mat1split[j,]<-sam1 
    res1cv[j,]<-as.matrix(c(res1s,res2s,res3s))
   } 
    loca1cv<-which.min(cv1)
    mat1cv[i,]<-res1cv[loca1cv,]
    mat1loca[i,]<-mat1split[loca1cv,]
    print(i)
}

write.csv(mat1loca,"mat1loca.csv",row.names=F)
write.csv(mat1cv,"mat1cv.csv",row.names=F)

#############################################
### small p-value group estimation (k=3 and number of success 330) 
#############################################

mat1loca<-read.csv("mat1loca.csv",header=T)
mat1cv<-read.csv("mat1cv.csv",header=T)
mat1small<-matrix(NA,nrow=22,ncol=9)

set.seed(123)
for(i in 1:leng1)
{
 sam1<-as.numeric(mat1loca[i,1:1666]); sam2<-as.numeric(mat1loca[i,1667:3333]); sam3<-as.numeric(mat1loca[i,3334:5000])
 y11<-y1[sam1]; y12<-y1[sam2]; y13<-y1[sam3]; 
 mat1s<-mat1[sam1,loca1[i]];  mat2s<-mat1[sam2,loca1[i]];  mat3s<-mat1[sam3,loca1[i]]; 
 s11<-0; s12<-0; s21<-0; s22<-0; s31<-0; s32<-0; 
 for(j in 1:10^8)
 {
   res1small<-summary(lm(sample(y11)~mat1s))$coef[2,3]
   if(res1small>=abs(mat1cv[i,1])){s11<-s11+1}
   if(res1small<=-abs(mat1cv[i,1])){s12<-s12+1}
   if(s11+s12>=330){break}
 }
   mat1small[i,1:3]<-as.matrix(c(s11,s12,j))
   print(c("s1 finished","s11",s11,"s12",s12,"j",j))
 for(j in 1:10^8)
 {
   res2small<-summary(lm(sample(y12)~mat2s))$coef[2,3]
   if(res2small>=abs(mat1cv[i,2])){s21<-s21+1}
   if(res2small<=-abs(mat1cv[i,2])){s22<-s22+1}
   if(s21+s22>=330){break}
 }
   mat1small[i,4:6]<-as.matrix(c(s21,s22,j))
   print(c("s2 finished","s21",s21,"s22",s22,"j",j))
 for(j in 1:10^8)
 {
   res3small<-summary(lm(sample(y13)~mat3s))$coef[2,3]
   if(res3small>=abs(mat1cv[i,3])){s31<-s31+1}
   if(res3small<=-abs(mat1cv[i,3])){s32<-s32+1}
   if(s31+s32>=330){break}
 }
   mat1small[i,7:9]<-as.matrix(c(s31,s32,j))
   print(c("s3 finished","s31",s31,"s32",s32,"j",j))
  print(i)
}

write.csv(mat1small,"res_small_p_group_example.csv",row.names=F)

#############################################
### data summarization for final results 
#############################################

result1large<-read.csv("res_large_p_group_example.csv",header=T)
result1small<-read.csv("res_small_p_group_example.csv",header=T)
result1total<-matrix(NA,nrow=10000,ncol=2); result1total[,1]<-c(1:10000)
result1total[-loca1,2]<-(result1large[-loca1,2]-1)/(result1large[-loca1,3]-1)

###  -1 can be omitted 
p11<-(result1small[,1]-1)/(result1small[,3]-1); p12<-(result1small[,2]-1)/(result1small[,3]-1)
p21<-(result1small[,4]-1)/(result1small[,6]-1); p22<-(result1small[,5]-1)/(result1small[,6]-1)
p31<-(result1small[,7]-1)/(result1small[,9]-1); p32<-(result1small[,8]-1)/(result1small[,9]-1)

p1tot<-1-pnorm((qnorm(1-p11)+qnorm(1-p21)+qnorm(1-p31))/sqrt(3))
p2tot<-1-pnorm((qnorm(1-p12)+qnorm(1-p22)+qnorm(1-p32))/sqrt(3))

result1total[loca1,2]<-p1tot+p2tot

write.csv(result1total,"result_p_total.csv",row.names=F)

######################