inframe rate
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
| # all4 = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/125inframe_inDel_affected_allinfo.txt") # inframe_count = c() # for(i in unique(all4$GENE_NAME)){ # inframe_count = c(inframe_count, nrow(all4[all4$GENE_NAME == i,])) # } # inframe = data.frame(genename = unique(all4$GENE_NAME),inframe_count=inframe_count) # fwrite(inframe,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/inframe_gene_count",sep="\t") # sum(inframe$inframe_count)
library(data.table) options(digits=22) N = 6430 ############## sample size of the affected scale = 0.870705990962538 ############## scale of SNV rate df = read.table("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/gene_level_GC_SNV_rate_indel_ct_len.txt",header=T)[,-4]
df = merge(df,inframe,by="genename",all=T) df = df[,c(1:3,5,4)] df[is.na(df$inframe_count),]$inframe_count = 0
df$SNV_2N = 2 * N * df$SNV_rate * scale ###### 每个window中SNV rate的总和 * 2N
res=glm(inframe_count~SNV_2N+GC_content,family=poisson(link="log"),data=df) ######### glm
print(summary(res)) saveRDS(res,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/inframe_rate_glm.rds")
coef=as.numeric(coef(res))
df$inframe_rate_2N= exp(coef[1] + coef[2]*df$SNV_2N + coef[2]*df$GC_content) ##### inframe rate * 2N
df$inframe_rate = df$inframe_rate_2N / (2*N)
df2 = df[,c(1,4,8)] head(df2,2)
fwrite(df,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/inframe_aff_mutrate",sep="\t")
|
125
Call:
glm(formula = inframe_count ~ SNV_2N + GC_content, family = poisson(link = "log"),
data = df)
Deviance Residuals:
Min 1Q Median
-1.9683925830598831830 -0.1272313332874516756 -0.1143798048596960104
3Q Max
-0.1042068278839288209 5.2929133246399411306
Coefficients:
Estimate Std. Error
(Intercept) -6.53030413476315008126 0.55631549988878514768
SNV_2N 0.36068760606554856052 0.04312987669668819773
GC_content 2.47619352457614327889 1.05085533264888542071
z value Pr(>|z|)
(Intercept) -11.738490000000000535 < 2e-16 ***
SNV_2N 8.362819999999999254 < 2e-16 ***
GC_content 2.356360000000000010 0.018455 *
---
Signif. codes:
0 ‘***’ 0.001000000000000000020817 ‘**’ 0.01000000000000000020817 ‘*’
0.05000000000000000277556 ‘.’ 0.1000000000000000055511 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 1247.2329733977853721 on 17477 degrees of freedom
Residual deviance: 1217.6875697627915542 on 17475 degrees of freedom
AIC: 1467.9068262469991168
Number of Fisher Scoring iterations: 7
A data.frame: 2 × 3
| genename | inframe_count | inframe_rate |
| <fct> | <dbl> | <dbl> |
| 1 | A1BG | 0 | 1.844887059681646876202e-07 |
| 2 | A1CF | 0 | 1.992030509391888446402e-07 |
EM
1 2
| 456/125 ### ratio of fs to inframe in the affected 128/35 ### ratio of fs to inframe in the unaffected
|
3.648
3.65714285714286
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
| gama= 20 ### initial gamma N = 6430 ### sample size of the affected fs_fraction = 128/35 ### (framshift / inframe) from the siblings
repeat{ mydata3 = mg2$prior #### posterior from SNV data
mydata4 = mg2$fs_count #### count of frameshift in the affected
mydata5 = fs_fraction * mg2$inframe_rate * 2 * N ##### 2*N*frameshift rate p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+ mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式
tes=gama gama=sum(p*mydata4)/sum(p*mydata5) if(abs(gama-tes)/tes < 1e-8) break }
q0 = 1-p ### risk probability of non-risk genes
mg2$post = p mg2$q0 = q0
mg2 = mg2[order(mg2$q0),]
FDR = c() for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i mg2$FDR = FDR
mg3 = mg2[mg2$FDR<0.1,] nrow(mg3)
gama log(gama)
|
158
17.6733618110954
2.87205852353222