Notes:

Data structure array

EM rectify

Optim

Annotation 占总数的比值,是否过于conserved, overlap heatmap

Estimation of indel rate

SNV_rate: 2N * summation over all possible mutation rates of bases in window i

Y ~ possion($\mu$)

Y ~ possion($e^{\beta_0 + \beta_1 * SNV_rate + \beta_2 * GC_content}$)

log(obs indel count) ~ $\beta_0$ + $\beta_1$ * SNV_rate + $\beta_2$ * GC_content

$\mu$ indel rates over 2N individuals.

1
2
3
4
5
6
library(data.table)
options(digits=22)

df = read.table("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/gene_level_GC_SNV_rate_indel_ct_len.txt",header=T)
nrow(df)
head(df,3)

17478

A data.frame: 3 × 5
genenameGC_contentSNV_rateindel_countlength
<fct><dbl><dbl><int><int>
1OR4F5 0.42810500000000001330490.000011560339999999999502430 918
2SAMD110.68268325834633403204070.0000779538648999999984542903205
3NOC2L 0.58277674038635096653850.0000999350169999999976252005539
1
2
3
4
5
6
7
8
9
10
11
N=rep(6430,nrow(df))

df$SNV_2N = 2 * N * df$SNV_rate

res=glm(indel_count~SNV_2N + GC_content,family=poisson(link="log"),data=df)
print(summary(res))

coef=as.numeric(coef(res))
df$INDEL_RATE=exp(coef[1] + coef[2]*df$SNV_2N + coef[3]*df$GC_content) ##### b

# fwrite(df[,c(1,7,5)],'/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/INDEL',sep="\t",col.names=F)
Call:
glm(formula = indel_count ~ SNV_2N + GC_content, family = poisson(link = "log"), 
    data = df)

Deviance Residuals: 
                   Min                      1Q                  Median  
-5.3747723046756075149  -0.2721079831257477433  -0.2543498238112971155  
                    3Q                     Max  
-0.2414373113538021975   7.6654372768663812110  

Coefficients:
                           Estimate              Std. Error
(Intercept) -3.55875256114883642411  0.24395625412901122964
SNV_2N       0.32595843840840044159  0.01404195081335555921
GC_content  -0.26301045582160398340  0.47999476989334100008
                            z value Pr(>|z|)    
(Intercept) -14.5876699999999992485  < 2e-16 ***
SNV_2N       23.2131900000000008788  < 2e-16 ***
GC_content   -0.5479399999999999826  0.58373    
---
Signif. codes:  
  0 ‘***’ 0.001000000000000000020817 ‘**’ 0.01000000000000000020817 ‘*’
  0.05000000000000000277556 ‘.’ 0.1000000000000000055511 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 4441.4194643153414290  on 17477  degrees of freedom
Residual deviance: 4263.1231826492557957  on 17475  degrees of freedom
AIC: 5387.1954605381806687

Number of Fisher Scoring iterations: 14

frameshift rate

$\frac{the , number , of ,frameshift}{the , number , of , indel}$
in unaffected siblings.

fs_fraction = 0.744186

Estimation of frameshift effect size

  • EM
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/test")
options(scipen=200)
options(digits=22)
gama= 2 ### initial gamma
N = 6430 ### sample size
fs_fraction = 0.744186 ### fraction of frameshift

Mydata=read.table('17478_gene_obs_count_risk_prior.txt',header=T)
Mydata1=read.table('prior',header=T)
IR=read.table('/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/INDEL')
colnames(IR) = c("genename","sum_indel_rate","length")

mg = merge(Mydata,IR,by="genename")

mg2 = merge(mg,Mydata1,by="genename")[,-3]
colnames(mg2)[2] = "observed_frameshift_count"

# p = mg2$risk
# p = rep(.05,nrow(mg2))
# p = runif(nrow(mg2),min=1e-10,max=1)

repeat{
mydata3 = p

mydata4 = mg2$observed_frameshift_count
# mydata5 = fs_fraction*2*N*mg2$sum_indel_rate
mydata5 = fs_fraction * mg2$sum_indel_rate

p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式

tes=gama
gama=sum(p*mydata4)/sum(p*mydata5)
if(abs(gama-tes)/tes < 1e-8) break

}

q0 = 1-p ### risk probability of non-risk genes

mg2$post = p
mg2$q0 = q0

mg2 = mg2[order(mg2$q0),]

FDR = c()
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR

mg2$FDR2 = mg2$q0 * nrow(mg2) / rank(mg2$q0) # mg2$FDR3 = p.adjust(mg2$q0,method="fdr",n=nrow(mg2))

nrow(mg2[mg2$FDR<0.1,])

gama
log(gama)
1
readRDS("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/indel_EM.rds")
A data.frame: 2 × 4
n_genesgamalog_gmdesc
<dbl><dbl><dbl><fct>
136 3.7423539989335701250181.319714824909679951048update gama
43536.8931935018404999482303.608027076087319873210update gama and pi0
  • Q:

    • difference between FDR and FDR2

    • 若更新pi0,则初始pi0几乎无影响

  • validate gamma by simulation
1
2
3
4
5
mg2[1:n_risk_genes,]$sim_fs_ct = rpois(n_risk_genes,
fs_fraction * mg2[1:n_risk_genes,]$sum_indel_rate * sim_gama)

mg2[(n_risk_genes + 1):nrow(mg2),]$sim_fs_ct = rpois(nrow(mg2) - n_risk_genes,
fs_fraction * mg2[(n_risk_genes + 1):nrow(mg2),]$sum_indel_rate)
1
2
3
100 rounds 
n risk genes = all genes
res_gama(74) > sim_gama (50)

SNV model

1
2
pi0 = rep(0.05,n_genes)
update gama only(NaN)

New annotations

  • spliceAI
    • DS_AG Delta score (acceptor gain)
    • DS_AL Delta score (acceptor loss)
    • DS_DG Delta score (donor gain)
    • DS_DL Delta score (donor loss)
  • PPI
    • Interactome INSIDER (probably damaging) & MPC > 2
  • trap score (no responding)
  • deepsea clustering
1
2
setwd("/storage11_7T/fuy/TADA-A/annotation/spliceai")
readRDS("spliceai_sep_RR4_annota.rds")
A data.frame: 4 × 4
logRRlower_boundupper_boundannota
<dbl><dbl><dbl><chr>
2.0294776266614920068321 1.1561569801227173925895 2.902798273200266621075DS_AG (acceptor gain)
-0.9999999671218416930074-12.306234357864692441353310.306234423621010165562DS_AL (acceptor loss)
2.0682421448743815162175 1.2534159904270909535740 2.883068299321672078861DS_DG (donor gain)
0.3651119660030998637090 -0.5284987397323961388906 1.258722671738595977331DS_DL (donor loss)
1
readRDS("/storage11_7T/fuy/TADA-A/annotation/PPI/PPI_rr.rds")
A data.frame: 1 × 4
logRRlower_boundupper_boundannota
<dbl><dbl><dbl><chr>
2.9395150000000001000442.5703409999999999868693.308688000000000073442Interactome INSIDER (probably damaging) & MPC > 2

CHD data