- 更新gama, pi0
In [45]:
setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/test")
library(data.table)
rd = 0
options(scipen=200)
options(digits=22)
N = 6430 ### sample size
fs_fraction = 0.744186 ### fraction of frameshift
gm.lst = c()
#### par of simulation
n_risk_genes = 874
sim_gama = 50
# sim_gama = runif(1,min=30,max=100)
##########################
repeat{
rd = rd + 1
gama=100 ### initial gamma
Mydata=read.table('17478_gene_obs_count_risk_prior.txt',header=T)
Mydata1=read.table('prior',header=T)
IR=read.table('/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/INDEL')
colnames(IR) = c("genename","sum_indel_rate")
mg = merge(Mydata,IR,by="genename")
mg2 = merge(mg,Mydata1,by="genename")[,-3]
colnames(mg2)[2] = "observed_frameshift_count"
mg2$risk = rep(.05,nrow(mg2))
# mydata3 = rep(n_risk_genes/17478,nrow(mg2))
# p = mg2$risk
# p = c(rep(1,n_risk_genes),rep(0,nrow(mg2)-n_risk_genes))
# mg2$risk = c(rep(1,n_risk_genes),rep(0,nrow(mg2)-n_risk_genes))
# p = runif(nrow(mg2),min=1e-10,max=1)
mg2$nonrisk = 1 - mg2$risk
mg2 = mg2[order(mg2$nonrisk),]
mg2$sim_fs_ct = 0
sp = sample(1:nrow(mg2),n_risk_genes,replace = F)
mg2[sp,]$sim_fs_ct = rpois(n_risk_genes,
fs_fraction * mg2[sp,]$sum_indel_rate * sim_gama)
mg2[-sp,]$sim_fs_ct = rpois(nrow(mg2) - n_risk_genes,
fs_fraction * mg2[-sp,]$sum_indel_rate)
p = mg2$risk
repeat{
mydata3 = p ### prior
# mydata4 = mg2$observed_frameshift_count
mydata4 = mg2$sim_fs_ct #### obs
mydata5 = fs_fraction * mg2$sum_indel_rate #### exp
p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式
# p=(exp(log(mydata3)+pois(mydata4,gama*mydata5)))/(exp(log(mydata3)+pois(mydata4,gama*mydata5))+exp(log(1-mydata3)+pois(mydata4,mydata5)))
tes=gama
gama=sum(p*mydata4)/sum(p*mydata5)
if(abs(gama-tes)/tes < 1e-5) break
}
q0 = 1-p ### risk probability of non-risk genes
mg2$post = p
mg2$q0 = q0
mg2 = mg2[order(mg2$q0),]
FDR <- numeric(nrow(mg2))
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR
if(rd > 30)break
gm.lst = c(gm.lst,gama)
}
plot(gm.lst)
sd(gm.lst)
mean(gm.lst)
- 不更新pi0
In [46]:
setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/test")
library(data.table)
rd = 0
options(scipen=200)
options(digits=22)
N = 6430 ### sample size
fs_fraction = 0.744186 ### fraction of frameshift
gm.lst = c()
#### par of simulation
n_risk_genes = 874
sim_gama = 50
# sim_gama = runif(1,min=30,max=100)
##########################
repeat{
rd = rd + 1
gama=100 ### initial gamma
Mydata=read.table('17478_gene_obs_count_risk_prior.txt',header=T)
Mydata1=read.table('prior',header=T)
IR=read.table('/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/INDEL')
colnames(IR) = c("genename","sum_indel_rate")
mg = merge(Mydata,IR,by="genename")
mg2 = merge(mg,Mydata1,by="genename")[,-3]
colnames(mg2)[2] = "observed_frameshift_count"
mg2$risk = rep(.05,nrow(mg2))
# mydata3 = rep(n_risk_genes/17478,nrow(mg2))
# p = mg2$risk
# p = c(rep(1,n_risk_genes),rep(0,nrow(mg2)-n_risk_genes))
# mg2$risk = c(rep(1,n_risk_genes),rep(0,nrow(mg2)-n_risk_genes))
# p = runif(nrow(mg2),min=1e-10,max=1)
mg2$nonrisk = 1 - mg2$risk
mg2 = mg2[order(mg2$nonrisk),]
mg2$sim_fs_ct = 0
sp = sample(1:nrow(mg2),n_risk_genes,replace = F)
mg2[sp,]$sim_fs_ct = rpois(n_risk_genes,
fs_fraction * mg2[sp,]$sum_indel_rate * sim_gama)
mg2[-sp,]$sim_fs_ct = rpois(nrow(mg2) - n_risk_genes,
fs_fraction * mg2[-sp,]$sum_indel_rate)
p = mg2$risk
repeat{
mydata3 = mg2$risk ### prior
# mydata4 = mg2$observed_frameshift_count
mydata4 = mg2$sim_fs_ct #### obs
mydata5 = fs_fraction * mg2$sum_indel_rate #### exp
p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式
# p=(exp(log(mydata3)+pois(mydata4,gama*mydata5)))/(exp(log(mydata3)+pois(mydata4,gama*mydata5))+exp(log(1-mydata3)+pois(mydata4,mydata5)))
tes=gama
gama=sum(p*mydata4)/sum(p*mydata5)
if(abs(gama-tes)/tes < 1e-5) break
}
q0 = 1-p ### risk probability of non-risk genes
mg2$post = p
mg2$q0 = q0
mg2 = mg2[order(mg2$q0),]
FDR <- numeric(nrow(mg2))
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR
if(rd > 30)break
gm.lst = c(gm.lst,gama)
}
plot(gm.lst)
sd(gm.lst)
mean(gm.lst)
In [19]:
mg2$idx = 1:nrow(mg2)
fwrite(mg2,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/tmp/mg2.txt",sep="\t")
In [75]:
optim(.1,fr,method='Brent',lower=0.01,upper=1000,control=list('fnscale'=-1, "maxit" = 2000))
In [71]:
fr=function(x){
gama=x
mydata3 = p
# mydata4 = mg2$observed_frameshift_count
mydata4 = mg2$sim_fs_ct #### obs
mydata5 = fs_fraction * mg2$sum_indel_rate #### exp
log_P=sum(log(exp(log(mydata3)+pois(mydata4,gama*mydata5))+exp(log(1-mydata3)+pois(mydata4,mydata5))) )
}
In [6]:
pois=function(k,lambda)
{
k*log(lambda)-lambda-log(factorial(k))
}
LL=log(exp(log(mydata3)+pois(mydata4,gama*mydata5))+exp(log(1-mydata3)+pois(mydata4,mydata5)))
In [ ]:
# n_risk_genes.lst = c(n_risk_genes.lst, n_risk_genes)
# mean.gm.lst = c(mean.gm.lst,mean(gm.lst))
# sd.gm.lst = c(sd.gm.lst, sd(gm.lst))
# z = data.frame(rcd_sim_gm ,n_risk_genes.lst,mean.gm.lst,sd.gm.lst)
# z
mg2$post = q
mg2$q0 = q0
mg2 = mg2[order(mg2$q0),]
FDR <- numeric(n)
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR
mg2$FDR2 = mg2$q0 * nrow(mg2) / rank(mg2$q0) # mg2$FDR3 = p.adjust(mg2$q0,method="fdr",n=nrow(mg2))
ct =nrow(mg2[mg2$FDR < 0.1,])
rcd_res_g = c(rcd_res_g,ct)
res_gama = gama
rcd_res_gm = c(rcd_res_gm, gama)
round = c(round,rd)
sim_risk_g = c(sim_risk_g,n_risk_genes)
dff = data.frame(round,rcd_sim_gm ,sim_risk_g,rcd_sim_fs,rcd_res_g ,rcd_res_gm)
# if(rd > 3)break
# }
dff
dff$err = abs(dff$rcd_sim_gm - dff$rcd_res_gm)/dff$rcd_sim_gm
max(dff$err)
library(ggplot2)
ggplot(data = dff, aes(round,err)) + geom_line() + ylim(0,1) + geom_hline(yintercept=mean(dff$err),linetype="dashed")