num of nonframeshift mutations (inframe_deletion, inframe_insertion) from the affected
siblings中nonfs数量太少(35个),glm得到的结果不显著,因此用affected中的nonfs
1 2 3 4 5 6
| library(data.table) options(digits=22) options(scipen=200) nonfs = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/nonfs_gene_count") nrow(nonfs) head(nonfs,2)
|
121
A data.table: 2 × 2
| genename | nonfs_count |
| <chr> | <int> |
| PRKAA1 | 1 |
| CDC42BPB | 1 |
1 2
| 456/125 ### fs / nonfs from the affected 128/35 ### fs / nonfs from the siblings
|
3.648
3.65714285714286
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
| library(data.table) options(digits=22) N = 6430 #2179 ### sample size df = read.table("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/gene_level_GC_SNV_rate_indel_ct_len.txt",header=T)[,-4]
df = merge(df,nonfs,by="genename",all=T) df = df[,c(1:3,5,4)] df[is.na(df$nonfs_count),]$nonfs_count = 0
df$SNV_2N = 2 * N * df$SNV_rate
res=glm(nonfs_count~SNV_2N+GC_content,family=poisson(link="log"),data=df)
print(summary(res))
coef=as.numeric(coef(res)) df$nonfs_rate_2N=exp(coef[1] + coef[2]*df$SNV_2N + coef[2]*df$GC_content) ##### b df$nonfs_rate = df$nonfs_rate_2N / (2*N) df2 = df[,c(1,4,8)] head(df2,2)
fwrite(df,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/nonfs_unaff_mutrate",sep="\t")
|
Call:
glm(formula = nonfs_count ~ SNV_2N + GC_content, family = poisson(link = "log"),
data = df)
Deviance Residuals:
Min 1Q Median
-1.9683925830598842932 -0.1272313332874517033 -0.1143798048596960659
3Q Max
-0.1042068278839288625 5.2929133246399411306
Coefficients:
Estimate Std. Error
(Intercept) -6.53030413476315008126 0.55631549988878503665
SNV_2N 0.31405285946720901125 0.03755344202928198594
GC_content 2.47619352457614372298 1.05085533264888542071
z value Pr(>|z|)
(Intercept) -11.738490000000000535 < 0.0000000000000002 ***
SNV_2N 8.362819999999999254 < 0.0000000000000002 ***
GC_content 2.356360000000000010 0.018455 *
---
Signif. codes:
0 ‘***’ 0.001000000000000000020817 ‘**’ 0.01000000000000000020817 ‘*’
0.05000000000000000277556 ‘.’ 0.1000000000000000055511 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 1247.2329733977853721 on 17477 degrees of freedom
Residual deviance: 1217.6875697627915542 on 17475 degrees of freedom
AIC: 1467.9068262469991168
Number of Fisher Scoring iterations: 7
A data.frame: 2 × 3
| genename | nonfs_count | nonfs_rate |
| <fct> | <dbl> | <dbl> |
| 1 | A1BG | 0 | 0.0000001796438737878756431065 |
| 2 | A1CF | 0 | 0.0000001958049289737441285481 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
| # setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/") # df2 = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/nonfs_unaff_mutrate")[,c(1,4,8)] # genelst = readRDS("/storage11_7T/fuy/TADA-A/cell_WES/DNM/2021-05-21_selected14_estim_pi_all_genes.rds")[,1:2] # mg2 = merge(genelst,df2,by="genename")
# fs = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/456in_wd.frameshift_aff.bed")[,c(1,2,5)] # fs$mg = paste(fs$chr,fs$end,sep="_")
# fs_count = c() # for(i in unique(fs$GENE_NAME)){ # fs_count = c(fs_count, nrow(fs[fs$GENE_NAME == i,])) # } # df.fs = data.frame(genename = unique(fs$GENE_NAME),fs_count = fs_count ) # fwrite(df.fs,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/count_456in_wd_frameshift_gene",sep="\t")
# mg2 = merge(mg2,df.fs,by="genename",all=T) # mg2[is.na(mg2$fs_count),]$fs_count = 0
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
| gama= 30 ### initial gamma N = 6430 ### sample size fs_fraction = 456/125 ### ratio of fs to nonfs
repeat{ mydata3 = mg2$prior ### posterior probability from SNV model
mydata4 = mg2$fs_count ### count of fs from the affected
mydata5 = fs_fraction * mg2$nonfs_rate * 2 * N ### frameshift rate * 2 * N p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+ mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式
tes=gama gama=sum(p*mydata4)/sum(p*mydata5) if(abs(gama-tes)/tes < 1e-8) break }
q0 = 1-p ### risk probability of non-risk genes
mg2$post = p mg2$q0 = q0
mg2 = mg2[order(mg2$q0),]
FDR = c() for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i mg2$FDR = FDR
mg3 = mg2[mg2$FDR<0.1,] cat("num of risk genes:",nrow(mg3),"\n")
cat("RR:" ,gama,"\n") cat("logRR:",log(gama))
|
num of risk genes: 160
RR: 18.26156781374253412764
logRR: 2.904798732036466990536