1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
library(data.table)
options(digits=22)
N = 2179 ### sample size
df = read.table("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/gene_level_GC_SNV_rate_indel_ct_len.txt",header=T)[,-4]

df = merge(df,nonfs,by="genename",all=T)
df = df[,c(1:3,5,4)]
df[is.na(df$nonfs_count),]$nonfs_count = 0 #### num of observed nonfs in siblings

df$SNV_2N = 2 * N * df$SNV_rate

res=glm(nonfs_count~SNV_2N,family=poisson(link="log"),data=df)

print(summary(res))

coef=as.numeric(coef(res))
df$nonfs_rate_2N=exp(coef[1] + coef[2]*df$SNV_2N) ##### b
df2 = df[,c(1,4,7)]
head(df2,2)
Call:
glm(formula = nonfs_count ~ SNV_2N, family = poisson(link = "log"), 
    data = df)

Deviance Residuals: 
                    Min                       1Q                   Median  
-0.67401072475467482814  -0.07119659925706947612  -0.06765228295833836114  
                     3Q                      Max  
-0.06511511531568234257   3.23975915589543239648  

Coefficients:
                          Estimate             Std. Error
(Intercept) -6.2828026906051723799  0.1933407844586273738
SNV_2N       0.7279000218031188574  0.2972449860550154810
                           z value             Pr(>|z|)    
(Intercept) -32.496000000000002217 < 0.0000000000000002 ***
SNV_2N        2.448819999999999997             0.014332 *  
---
Signif. codes:  
  0 ‘***’ 0.001000000000000000020817 ‘**’ 0.01000000000000000020817 ‘*’
  0.05000000000000000277556 ‘.’ 0.1000000000000000055511 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 506.60640306394611798  on 17477  degrees of freedom
Residual deviance: 503.79217682117155164  on 17476  degrees of freedom
AIC: 591.79217682117155164

Number of Fisher Scoring iterations: 8
A data.frame: 2 × 3
genenamenonfs_countnonfs_rate_2N
<fct><dbl><dbl>
1A1BG00.002328933656623771893124
2A1CF00.002619080598964510127324

EM

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
gama= 20 ### initial gamma
N = 2179 ### sample size
fs_fraction = 128/42 ### num of frameshift / num of nonfs in siblings

repeat{
mydata3 = mg2$prior ### posterior from SNV model

mydata4 = mg2$fs_count ### number of observed frameshift in probands

mydata5 = fs_fraction * mg2$nonfs_rate_2N ### frameshift rate * 2N

p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式

tes=gama
gama=sum(p*mydata4)/sum(p*mydata5)
if(abs(gama-tes)/tes < 1e-8) break

}

q0 = 1-p ### risk probability of non-risk genes

mg2$post = p
mg2$q0 = q0

mg2 = mg2[order(mg2$q0),]

FDR = c()
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR

mg3 = mg2[mg2$FDR<0.1,]
nrow(mg3)

gama
log(gama)

172

22.4947944826454

3.1132839261165