inframe rate

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# all4 = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/125inframe_inDel_affected_allinfo.txt")
# inframe_count = c()
# for(i in unique(all4$GENE_NAME)){
# inframe_count = c(inframe_count, nrow(all4[all4$GENE_NAME == i,]))
# }
# inframe = data.frame(genename = unique(all4$GENE_NAME),inframe_count=inframe_count)
# fwrite(inframe,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/inframe_gene_count",sep="\t")
# sum(inframe$inframe_count)

library(data.table)
options(digits=22)
N = 6430 ############## sample size of the affected
scale = 0.870705990962538 ############## scale of SNV rate
df = read.table("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/gene_level_GC_SNV_rate_indel_ct_len.txt",header=T)[,-4]

df = merge(df,inframe,by="genename",all=T)
df = df[,c(1:3,5,4)]
df[is.na(df$inframe_count),]$inframe_count = 0

df$SNV_2N = 2 * N * df$SNV_rate * scale ###### 每个window中SNV rate的总和 * 2N

res=glm(inframe_count~SNV_2N+GC_content,family=poisson(link="log"),data=df) ######### glm

print(summary(res))
saveRDS(res,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/inframe_rate_glm.rds")

coef=as.numeric(coef(res))

df$inframe_rate_2N= exp(coef[1] + coef[2]*df$SNV_2N + coef[2]*df$GC_content) ##### inframe rate * 2N

df$inframe_rate = df$inframe_rate_2N / (2*N)

df2 = df[,c(1,4,8)]
head(df2,2)

fwrite(df,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/inframe_aff_mutrate",sep="\t")

125

Call:
glm(formula = inframe_count ~ SNV_2N + GC_content, family = poisson(link = "log"), 
    data = df)

Deviance Residuals: 
                   Min                      1Q                  Median  
-1.9683925830598831830  -0.1272313332874516756  -0.1143798048596960104  
                    3Q                     Max  
-0.1042068278839288209   5.2929133246399411306  

Coefficients:
                           Estimate              Std. Error
(Intercept) -6.53030413476315008126  0.55631549988878514768
SNV_2N       0.36068760606554856052  0.04312987669668819773
GC_content   2.47619352457614327889  1.05085533264888542071
                           z value Pr(>|z|)    
(Intercept) -11.738490000000000535  < 2e-16 ***
SNV_2N        8.362819999999999254  < 2e-16 ***
GC_content    2.356360000000000010 0.018455 *  
---
Signif. codes:  
  0 ‘***’ 0.001000000000000000020817 ‘**’ 0.01000000000000000020817 ‘*’
  0.05000000000000000277556 ‘.’ 0.1000000000000000055511 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1247.2329733977853721  on 17477  degrees of freedom
Residual deviance: 1217.6875697627915542  on 17475  degrees of freedom
AIC: 1467.9068262469991168

Number of Fisher Scoring iterations: 7
A data.frame: 2 × 3
genenameinframe_countinframe_rate
<fct><dbl><dbl>
1A1BG01.844887059681646876202e-07
2A1CF01.992030509391888446402e-07

EM

1
2
456/125 ### ratio of fs to inframe in the affected
128/35 ### ratio of fs to inframe in the unaffected

3.648

3.65714285714286

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
gama= 20 ### initial gamma
N = 6430 ### sample size of the affected
fs_fraction = 128/35 ### (framshift / inframe) from the siblings

repeat{
mydata3 = mg2$prior #### posterior from SNV data

mydata4 = mg2$fs_count #### count of frameshift in the affected

mydata5 = fs_fraction * mg2$inframe_rate * 2 * N ##### 2*N*frameshift rate

p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式

tes=gama
gama=sum(p*mydata4)/sum(p*mydata5)
if(abs(gama-tes)/tes < 1e-8) break

}

q0 = 1-p ### risk probability of non-risk genes

mg2$post = p
mg2$q0 = q0

mg2 = mg2[order(mg2$q0),]

FDR = c()
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR

mg3 = mg2[mg2$FDR<0.1,]
nrow(mg3)

gama
log(gama)

158

17.6733618110954

2.87205852353222