2021-03-22-simulation
  • 更新gama, pi0
In [45]:
setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/test")
library(data.table)
rd = 0
options(scipen=200)
options(digits=22)

N = 6430 ### sample size
fs_fraction = 0.744186 ### fraction of frameshift
gm.lst = c()

#### par of simulation
n_risk_genes = 874
sim_gama = 50
# sim_gama = runif(1,min=30,max=100) 
##########################

repeat{
rd = rd + 1

gama=100 ### initial gamma
    
Mydata=read.table('17478_gene_obs_count_risk_prior.txt',header=T)
Mydata1=read.table('prior',header=T)
IR=read.table('/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/INDEL')
colnames(IR) = c("genename","sum_indel_rate")

mg = merge(Mydata,IR,by="genename")
mg2 = merge(mg,Mydata1,by="genename")[,-3]
colnames(mg2)[2] = "observed_frameshift_count"
mg2$risk = rep(.05,nrow(mg2))

# mydata3 = rep(n_risk_genes/17478,nrow(mg2))
# p = mg2$risk
# p = c(rep(1,n_risk_genes),rep(0,nrow(mg2)-n_risk_genes))
# mg2$risk = c(rep(1,n_risk_genes),rep(0,nrow(mg2)-n_risk_genes))
# p = runif(nrow(mg2),min=1e-10,max=1)    
    
mg2$nonrisk = 1 - mg2$risk
mg2 = mg2[order(mg2$nonrisk),]

mg2$sim_fs_ct = 0

sp = sample(1:nrow(mg2),n_risk_genes,replace = F)

mg2[sp,]$sim_fs_ct = rpois(n_risk_genes,
                                       fs_fraction * mg2[sp,]$sum_indel_rate * sim_gama)
    
mg2[-sp,]$sim_fs_ct = rpois(nrow(mg2) - n_risk_genes, 
                                                     fs_fraction * mg2[-sp,]$sum_indel_rate)

p = mg2$risk

repeat{
  mydata3 = p ### prior
#   mydata4 = mg2$observed_frameshift_count          
  mydata4 = mg2$sim_fs_ct  #### obs
  mydata5 = fs_fraction * mg2$sum_indel_rate  #### exp 
    
  p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
                                                          mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式

#   p=(exp(log(mydata3)+pois(mydata4,gama*mydata5)))/(exp(log(mydata3)+pois(mydata4,gama*mydata5))+exp(log(1-mydata3)+pois(mydata4,mydata5)))
    
  tes=gama
  gama=sum(p*mydata4)/sum(p*mydata5)
  if(abs(gama-tes)/tes < 1e-5) break
  
}

q0 = 1-p  ### risk probability of non-risk genes
mg2$post = p
mg2$q0 = q0

mg2 = mg2[order(mg2$q0),]

FDR <- numeric(nrow(mg2))
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR
    
if(rd > 30)break
gm.lst = c(gm.lst,gama)
    }
plot(gm.lst)
sd(gm.lst)
mean(gm.lst)
1.00117572995601
55.8621024078328
  • 不更新pi0
In [46]:
setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/test")
library(data.table)
rd = 0
options(scipen=200)
options(digits=22)

N = 6430 ### sample size
fs_fraction = 0.744186 ### fraction of frameshift
gm.lst = c()

#### par of simulation
n_risk_genes = 874
sim_gama = 50
# sim_gama = runif(1,min=30,max=100) 
##########################

repeat{
rd = rd + 1

gama=100 ### initial gamma
    
Mydata=read.table('17478_gene_obs_count_risk_prior.txt',header=T)
Mydata1=read.table('prior',header=T)
IR=read.table('/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/INDEL')
colnames(IR) = c("genename","sum_indel_rate")

mg = merge(Mydata,IR,by="genename")
mg2 = merge(mg,Mydata1,by="genename")[,-3]
colnames(mg2)[2] = "observed_frameshift_count"
mg2$risk = rep(.05,nrow(mg2))

# mydata3 = rep(n_risk_genes/17478,nrow(mg2))
# p = mg2$risk
# p = c(rep(1,n_risk_genes),rep(0,nrow(mg2)-n_risk_genes))
# mg2$risk = c(rep(1,n_risk_genes),rep(0,nrow(mg2)-n_risk_genes))
# p = runif(nrow(mg2),min=1e-10,max=1)    
    
mg2$nonrisk = 1 - mg2$risk
mg2 = mg2[order(mg2$nonrisk),]

mg2$sim_fs_ct = 0

sp = sample(1:nrow(mg2),n_risk_genes,replace = F)

mg2[sp,]$sim_fs_ct = rpois(n_risk_genes,
                                       fs_fraction * mg2[sp,]$sum_indel_rate * sim_gama)
    
mg2[-sp,]$sim_fs_ct = rpois(nrow(mg2) - n_risk_genes, 
                                                     fs_fraction * mg2[-sp,]$sum_indel_rate)

p = mg2$risk

repeat{
  mydata3 = mg2$risk ### prior
#   mydata4 = mg2$observed_frameshift_count          
  mydata4 = mg2$sim_fs_ct  #### obs
  mydata5 = fs_fraction * mg2$sum_indel_rate  #### exp 
    
  p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
                                                          mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式

#   p=(exp(log(mydata3)+pois(mydata4,gama*mydata5)))/(exp(log(mydata3)+pois(mydata4,gama*mydata5))+exp(log(1-mydata3)+pois(mydata4,mydata5)))
    
  tes=gama
  gama=sum(p*mydata4)/sum(p*mydata5)
  if(abs(gama-tes)/tes < 1e-5) break
  
}

q0 = 1-p  ### risk probability of non-risk genes
mg2$post = p
mg2$q0 = q0

mg2 = mg2[order(mg2$q0),]

FDR <- numeric(nrow(mg2))
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR
    
if(rd > 30)break
gm.lst = c(gm.lst,gama)
    }
plot(gm.lst)
sd(gm.lst)
mean(gm.lst)
1.65769832084458
50.200953486531
In [19]:
mg2$idx = 1:nrow(mg2)

fwrite(mg2,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/tmp/mg2.txt",sep="\t")
In [75]:
optim(.1,fr,method='Brent',lower=0.01,upper=1000,control=list('fnscale'=-1, "maxit" = 2000))
$par
34.2121066119019
$value
-4057.70879571264
$counts
function
<NA>
gradient
<NA>
$convergence
0
$message
NULL
In [71]:
fr=function(x){
    gama=x
     mydata3 =  p
#   mydata4 = mg2$observed_frameshift_count          
  mydata4 = mg2$sim_fs_ct  #### obs
  mydata5 = fs_fraction * mg2$sum_indel_rate  #### exp 
  log_P=sum(log(exp(log(mydata3)+pois(mydata4,gama*mydata5))+exp(log(1-mydata3)+pois(mydata4,mydata5)))  )
       
}
In [6]:
pois=function(k,lambda)
{
  k*log(lambda)-lambda-log(factorial(k))
}

LL=log(exp(log(mydata3)+pois(mydata4,gama*mydata5))+exp(log(1-mydata3)+pois(mydata4,mydata5)))
In [ ]:
# n_risk_genes.lst = c(n_risk_genes.lst, n_risk_genes)

# mean.gm.lst = c(mean.gm.lst,mean(gm.lst))

# sd.gm.lst = c(sd.gm.lst, sd(gm.lst))


# z = data.frame(rcd_sim_gm ,n_risk_genes.lst,mean.gm.lst,sd.gm.lst)
# z

mg2$post = q
mg2$q0 = q0

mg2 = mg2[order(mg2$q0),]

FDR <- numeric(n)
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR

mg2$FDR2 =  mg2$q0 * nrow(mg2) / rank(mg2$q0)  # mg2$FDR3 = p.adjust(mg2$q0,method="fdr",n=nrow(mg2))

ct =nrow(mg2[mg2$FDR < 0.1,])
rcd_res_g = c(rcd_res_g,ct)

res_gama = gama

rcd_res_gm = c(rcd_res_gm, gama)
round = c(round,rd)
sim_risk_g = c(sim_risk_g,n_risk_genes)
dff = data.frame(round,rcd_sim_gm ,sim_risk_g,rcd_sim_fs,rcd_res_g ,rcd_res_gm)
# if(rd > 3)break
#     }
dff

dff$err = abs(dff$rcd_sim_gm - dff$rcd_res_gm)/dff$rcd_sim_gm

max(dff$err)

library(ggplot2)
ggplot(data = dff, aes(round,err)) + geom_line() + ylim(0,1) + geom_hline(yintercept=mean(dff$err),linetype="dashed")