2021-06-15-ASD-fs-rate-gama-report

num of nonframeshift mutations (inframe_deletion, inframe_insertion) from the affected

siblings中nonfs数量太少（35个），glm得到的结果不显著，因此用affected中的nonfs

library(data.table)
options(digits=22)
options(scipen=200)
nonfs = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/nonfs_gene_count") 
nrow(nonfs)
head(nonfs,2)

121

A data.table: 2 × 2
genename	nonfs_count
<chr>	<int>
PRKAA1	1
CDC42BPB	1

1 2	456/125 ### fs / nonfs from the affected 128/35 ### fs / nonfs from the siblings

3.648

3.65714285714286

library(data.table)
options(digits=22)
N = 6430 #2179 ### sample size
df = read.table("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/gene_level_GC_SNV_rate_indel_ct_len.txt",header=T)[,-4]

df = merge(df,nonfs,by="genename",all=T)
df = df[,c(1:3,5,4)]
df[is.na(df$nonfs_count),]$nonfs_count = 0

df$SNV_2N = 2 * N * df$SNV_rate

res=glm(nonfs_count~SNV_2N+GC_content,family=poisson(link="log"),data=df)  

print(summary(res))

coef=as.numeric(coef(res)) 
df$nonfs_rate_2N=exp(coef[1] + coef[2]*df$SNV_2N + coef[2]*df$GC_content) ##### b
df$nonfs_rate = df$nonfs_rate_2N / (2*N)
df2 = df[,c(1,4,8)]
head(df2,2)

fwrite(df,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/nonfs_unaff_mutrate",sep="\t")

Call:
glm(formula = nonfs_count ~ SNV_2N + GC_content, family = poisson(link = "log"), 
    data = df)

Deviance Residuals: 
                   Min                      1Q                  Median  
-1.9683925830598842932  -0.1272313332874517033  -0.1143798048596960659  
                    3Q                     Max  
-0.1042068278839288625   5.2929133246399411306  

Coefficients:
                           Estimate              Std. Error
(Intercept) -6.53030413476315008126  0.55631549988878503665
SNV_2N       0.31405285946720901125  0.03755344202928198594
GC_content   2.47619352457614372298  1.05085533264888542071
                           z value             Pr(>|z|)    
(Intercept) -11.738490000000000535 < 0.0000000000000002 ***
SNV_2N        8.362819999999999254 < 0.0000000000000002 ***
GC_content    2.356360000000000010             0.018455 *  
---
Signif. codes:  
  0 ‘***’ 0.001000000000000000020817 ‘**’ 0.01000000000000000020817 ‘*’
  0.05000000000000000277556 ‘.’ 0.1000000000000000055511 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1247.2329733977853721  on 17477  degrees of freedom
Residual deviance: 1217.6875697627915542  on 17475  degrees of freedom
AIC: 1467.9068262469991168

Number of Fisher Scoring iterations: 7

A data.frame: 2 × 3
	genename	nonfs_count	nonfs_rate
	<fct>	<dbl>	<dbl>
1	A1BG	0	0.0000001796438737878756431065
2	A1CF	0	0.0000001958049289737441285481

# setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/")
# df2 = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/nonfs_unaff_mutrate")[,c(1,4,8)]
# genelst = readRDS("/storage11_7T/fuy/TADA-A/cell_WES/DNM/2021-05-21_selected14_estim_pi_all_genes.rds")[,1:2]
# mg2 = merge(genelst,df2,by="genename")

# fs = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/456in_wd.frameshift_aff.bed")[,c(1,2,5)]
# fs$mg = paste(fs$chr,fs$end,sep="_")

# fs_count = c()
# for(i in unique(fs$GENE_NAME)){
    
#     fs_count = c(fs_count, nrow(fs[fs$GENE_NAME == i,]))
# }
                 
# df.fs = data.frame(genename = unique(fs$GENE_NAME),fs_count = fs_count )
# fwrite(df.fs,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/count_456in_wd_frameshift_gene",sep="\t")

# mg2 = merge(mg2,df.fs,by="genename",all=T)
# mg2[is.na(mg2$fs_count),]$fs_count = 0

gama= 30 ### initial gamma
N = 6430 ### sample size
fs_fraction = 456/125  ### ratio of fs to nonfs

repeat{
  mydata3 = mg2$prior ### posterior probability from SNV model

  mydata4 = mg2$fs_count ### count of fs from the affected         

  mydata5 = fs_fraction * mg2$nonfs_rate * 2 * N ### frameshift rate * 2 * N
    
  p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
                                                          mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式

  tes=gama
  gama=sum(p*mydata4)/sum(p*mydata5)
  if(abs(gama-tes)/tes < 1e-8) break
  
}

q0 = 1-p  ### risk probability of non-risk genes

mg2$post = p
mg2$q0 = q0

mg2 = mg2[order(mg2$q0),]

FDR = c()
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR

mg3 = mg2[mg2$FDR<0.1,]
cat("num of risk genes:",nrow(mg3),"\n")

cat("RR:" ,gama,"\n")
cat("logRR:",log(gama))

num of risk genes: 160 
RR: 18.26156781374253412764 
logRR: 2.904798732036466990536