indel rate

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
library(data.table)
options(digits=22)
sp = 6430 ### sample size
df = read.table("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/gene_level_GC_SNV_rate_indel_ct_len.txt",header=T)[,-4]
df = merge(df,id,by="genename")
df = df[,c(1:3,5,4)]

N=rep(sp,nrow(df))
df$SNV_2N = 2 * N * df$SNV_rate

res=glm(observed_indel_count~SNV_2N,family=poisson(link="log"),data=df)
print(summary(res))

coef=as.numeric(coef(res))
df$INDEL_RATE=exp(coef[1] + coef[2]*df$SNV_2N) ##### b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/")

options(scipen=200)
options(digits=22)

obs_indel = fread("gene_indel_ct_obs.txt")

Mydata = g2[,1:2]
IR=read.table('/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/INDEL')
colnames(IR) = c("genename","sum_indel_rate","length")

mg = merge(Mydata,IR,by="genename")

mg2 = merge(mg,obs_indel,by="genename")
colnames(mg2)[5] = "observed_frameshift_count"

EM

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
gama= 20 ### initial gamma
N = 6430 ### sample size
fs_fraction = 0.744186 ### fraction of frameshift (count framshifts in unaffected / count indels in unaffected)

repeat{
mydata3 = mg2$prior

mydata4 = mg2$observed_frameshift_count

mydata5 = fs_fraction * mg2$sum_indel_rate

p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式

tes=gama
gama=sum(p*mydata4)/sum(p*mydata5)
if(abs(gama-tes)/tes < 1e-8) break

}

q0 = 1-p ### risk probability of non-risk genes

mg2$post = p
mg2$q0 = q0

mg2 = mg2[order(mg2$q0),]

FDR = c()
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR

mg3 = mg2[mg2$FDR<0.1,]
nrow(mg3)

gama
log(gama)