num of nonframeshift mutations (inframe_deletion, inframe_insertion) from the affected

siblings中nonfs数量太少(35个),glm得到的结果不显著,因此用affected中的nonfs

1
2
3
4
5
6
library(data.table)
options(digits=22)
options(scipen=200)
nonfs = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/nonfs_gene_count")
nrow(nonfs)
head(nonfs,2)

121

A data.table: 2 × 2
genenamenonfs_count
<chr><int>
PRKAA1 1
CDC42BPB1
1
2
456/125 ### fs / nonfs from the affected
128/35 ### fs / nonfs from the siblings

3.648

3.65714285714286

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
library(data.table)
options(digits=22)
N = 6430 #2179 ### sample size
df = read.table("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/gene_level_GC_SNV_rate_indel_ct_len.txt",header=T)[,-4]

df = merge(df,nonfs,by="genename",all=T)
df = df[,c(1:3,5,4)]
df[is.na(df$nonfs_count),]$nonfs_count = 0

df$SNV_2N = 2 * N * df$SNV_rate

res=glm(nonfs_count~SNV_2N+GC_content,family=poisson(link="log"),data=df)

print(summary(res))

coef=as.numeric(coef(res))
df$nonfs_rate_2N=exp(coef[1] + coef[2]*df$SNV_2N + coef[2]*df$GC_content) ##### b
df$nonfs_rate = df$nonfs_rate_2N / (2*N)
df2 = df[,c(1,4,8)]
head(df2,2)

fwrite(df,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/nonfs_unaff_mutrate",sep="\t")
Call:
glm(formula = nonfs_count ~ SNV_2N + GC_content, family = poisson(link = "log"), 
    data = df)

Deviance Residuals: 
                   Min                      1Q                  Median  
-1.9683925830598842932  -0.1272313332874517033  -0.1143798048596960659  
                    3Q                     Max  
-0.1042068278839288625   5.2929133246399411306  

Coefficients:
                           Estimate              Std. Error
(Intercept) -6.53030413476315008126  0.55631549988878503665
SNV_2N       0.31405285946720901125  0.03755344202928198594
GC_content   2.47619352457614372298  1.05085533264888542071
                           z value             Pr(>|z|)    
(Intercept) -11.738490000000000535 < 0.0000000000000002 ***
SNV_2N        8.362819999999999254 < 0.0000000000000002 ***
GC_content    2.356360000000000010             0.018455 *  
---
Signif. codes:  
  0 ‘***’ 0.001000000000000000020817 ‘**’ 0.01000000000000000020817 ‘*’
  0.05000000000000000277556 ‘.’ 0.1000000000000000055511 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1247.2329733977853721  on 17477  degrees of freedom
Residual deviance: 1217.6875697627915542  on 17475  degrees of freedom
AIC: 1467.9068262469991168

Number of Fisher Scoring iterations: 7
A data.frame: 2 × 3
genenamenonfs_countnonfs_rate
<fct><dbl><dbl>
1A1BG00.0000001796438737878756431065
2A1CF00.0000001958049289737441285481
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/")
# df2 = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/nonfs_unaff_mutrate")[,c(1,4,8)]
# genelst = readRDS("/storage11_7T/fuy/TADA-A/cell_WES/DNM/2021-05-21_selected14_estim_pi_all_genes.rds")[,1:2]
# mg2 = merge(genelst,df2,by="genename")

# fs = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/456in_wd.frameshift_aff.bed")[,c(1,2,5)]
# fs$mg = paste(fs$chr,fs$end,sep="_")

# fs_count = c()
# for(i in unique(fs$GENE_NAME)){

# fs_count = c(fs_count, nrow(fs[fs$GENE_NAME == i,]))
# }

# df.fs = data.frame(genename = unique(fs$GENE_NAME),fs_count = fs_count )
# fwrite(df.fs,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/count_456in_wd_frameshift_gene",sep="\t")

# mg2 = merge(mg2,df.fs,by="genename",all=T)
# mg2[is.na(mg2$fs_count),]$fs_count = 0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
gama= 30 ### initial gamma
N = 6430 ### sample size
fs_fraction = 456/125 ### ratio of fs to nonfs

repeat{
mydata3 = mg2$prior ### posterior probability from SNV model

mydata4 = mg2$fs_count ### count of fs from the affected

mydata5 = fs_fraction * mg2$nonfs_rate * 2 * N ### frameshift rate * 2 * N

p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式

tes=gama
gama=sum(p*mydata4)/sum(p*mydata5)
if(abs(gama-tes)/tes < 1e-8) break

}

q0 = 1-p ### risk probability of non-risk genes

mg2$post = p
mg2$q0 = q0

mg2 = mg2[order(mg2$q0),]

FDR = c()
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR

mg3 = mg2[mg2$FDR<0.1,]
cat("num of risk genes:",nrow(mg3),"\n")

cat("RR:" ,gama,"\n")
cat("logRR:",log(gama))
num of risk genes: 160 
RR: 18.26156781374253412764 
logRR: 2.904798732036466990536