2021-03-18-tadaA-report

Notes:

Data structure array

EM rectify

Optim

Annotation 占总数的比值，是否过于conserved, overlap heatmap

Estimation of indel rate

SNV_rate: 2N * summation over all possible mutation rates of bases in window i

Y ~ possion($\mu$)

Y ~ possion($e^{\beta_0 + \beta_1 * SNV_rate + \beta_2 * GC_content}$)

log(obs indel count) ~ $\beta_0$ + $\beta_1$ * SNV_rate + $\beta_2$ * GC_content

$\mu$ indel rates over 2N individuals.

library(data.table)
options(digits=22)

df = read.table("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/gene_level_GC_SNV_rate_indel_ct_len.txt",header=T)
nrow(df)
head(df,3)

17478

A data.frame: 3 × 5
	genename	GC_content	SNV_rate	indel_count	length
	<fct>	<dbl>	<dbl>	<int>	<int>
1	OR4F5	0.4281050000000000133049	0.00001156033999999999950243	0	918
2	SAMD11	0.6826832583463340320407	0.00007795386489999999845429	0	3205
3	NOC2L	0.5827767403863509665385	0.00009993501699999999762520	0	5539

N=rep(6430,nrow(df))

df$SNV_2N = 2 * N * df$SNV_rate

res=glm(indel_count~SNV_2N + GC_content,family=poisson(link="log"),data=df) 
print(summary(res))

coef=as.numeric(coef(res)) 
df$INDEL_RATE=exp(coef[1] + coef[2]*df$SNV_2N + coef[3]*df$GC_content) ##### b

# fwrite(df[,c(1,7,5)],'/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/INDEL',sep="\t",col.names=F)

Call:
glm(formula = indel_count ~ SNV_2N + GC_content, family = poisson(link = "log"), 
    data = df)

Deviance Residuals: 
                   Min                      1Q                  Median  
-5.3747723046756075149  -0.2721079831257477433  -0.2543498238112971155  
                    3Q                     Max  
-0.2414373113538021975   7.6654372768663812110  

Coefficients:
                           Estimate              Std. Error
(Intercept) -3.55875256114883642411  0.24395625412901122964
SNV_2N       0.32595843840840044159  0.01404195081335555921
GC_content  -0.26301045582160398340  0.47999476989334100008
                            z value Pr(>|z|)    
(Intercept) -14.5876699999999992485  < 2e-16 ***
SNV_2N       23.2131900000000008788  < 2e-16 ***
GC_content   -0.5479399999999999826  0.58373    
---
Signif. codes:  
  0 ‘***’ 0.001000000000000000020817 ‘**’ 0.01000000000000000020817 ‘*’
  0.05000000000000000277556 ‘.’ 0.1000000000000000055511 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 4441.4194643153414290  on 17477  degrees of freedom
Residual deviance: 4263.1231826492557957  on 17475  degrees of freedom
AIC: 5387.1954605381806687

Number of Fisher Scoring iterations: 14

frameshift rate

$\frac{the , number , of ,frameshift}{the , number , of , indel}$
in unaffected siblings.

fs_fraction = 0.744186

Estimation of frameshift effect size

setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/test")
options(scipen=200)
options(digits=22)
gama= 2 ### initial gamma
N = 6430 ### sample size
fs_fraction = 0.744186 ### fraction of frameshift

Mydata=read.table('17478_gene_obs_count_risk_prior.txt',header=T)
Mydata1=read.table('prior',header=T)
IR=read.table('/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/INDEL')
colnames(IR) = c("genename","sum_indel_rate","length")

mg = merge(Mydata,IR,by="genename")

mg2 = merge(mg,Mydata1,by="genename")[,-3]
colnames(mg2)[2] = "observed_frameshift_count"

# p = mg2$risk
# p = rep(.05,nrow(mg2))
# p = runif(nrow(mg2),min=1e-10,max=1)

repeat{
  mydata3 = p

  mydata4 = mg2$observed_frameshift_count          
#   mydata5 = fs_fraction*2*N*mg2$sum_indel_rate
  mydata5 = fs_fraction * mg2$sum_indel_rate
    
  p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
                                                          mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式

  tes=gama
  gama=sum(p*mydata4)/sum(p*mydata5)
  if(abs(gama-tes)/tes < 1e-8) break
  
}

q0 = 1-p  ### risk probability of non-risk genes

mg2$post = p
mg2$q0 = q0

mg2 = mg2[order(mg2$q0),]

FDR = c()
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR

mg2$FDR2 =  mg2$q0 * nrow(mg2) / rank(mg2$q0)  # mg2$FDR3 = p.adjust(mg2$q0,method="fdr",n=nrow(mg2))

nrow(mg2[mg2$FDR<0.1,])

gama
log(gama)

1	readRDS("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/indel_EM.rds")

A data.frame: 2 × 4
n_genes	gama	log_gm	desc
<dbl>	<dbl>	<dbl>	<fct>
136	3.742353998933570125018	1.319714824909679951048	update gama
435	36.893193501840499948230	3.608027076087319873210	update gama and pi0

Q:
- difference between FDR and FDR2
- 若更新pi0，则初始pi0几乎无影响

validate gamma by simulation

mg2[1:n_risk_genes,]$sim_fs_ct = rpois(n_risk_genes,
                                       fs_fraction * mg2[1:n_risk_genes,]$sum_indel_rate * sim_gama)
    
mg2[(n_risk_genes + 1):nrow(mg2),]$sim_fs_ct = rpois(nrow(mg2) - n_risk_genes, 
                                                     fs_fraction * mg2[(n_risk_genes + 1):nrow(mg2),]$sum_indel_rate)

1
2
3

100 rounds 
n risk genes = all genes
res_gama(74) > sim_gama (50)

SNV model

1 2	pi0 = rep(0.05,n_genes) update gama only(NaN)

New annotations

spliceAI
- DS_AG Delta score (acceptor gain)
- DS_AL Delta score (acceptor loss)
- DS_DG Delta score (donor gain)
- DS_DL Delta score (donor loss)

PPI
- Interactome INSIDER (probably damaging) & MPC > 2
trap score (no responding)
deepsea clustering

1 2	setwd("/storage11_7T/fuy/TADA-A/annotation/spliceai") readRDS("spliceai_sep_RR4_annota.rds")

A data.frame: 4 × 4
logRR	lower_bound	upper_bound	annota
<dbl>	<dbl>	<dbl>	<chr>
2.0294776266614920068321	1.1561569801227173925895	2.902798273200266621075	DS_AG (acceptor gain)
-0.9999999671218416930074	-12.3062343578646924413533	10.306234423621010165562	DS_AL (acceptor loss)
2.0682421448743815162175	1.2534159904270909535740	2.883068299321672078861	DS_DG (donor gain)
0.3651119660030998637090	-0.5284987397323961388906	1.258722671738595977331	DS_DL (donor loss)

1	readRDS("/storage11_7T/fuy/TADA-A/annotation/PPI/PPI_rr.rds")

A data.frame: 1 × 4
logRR	lower_bound	upper_bound	annota
<dbl>	<dbl>	<dbl>	<chr>
2.939515000000000100044	2.570340999999999986869	3.308688000000000073442	Interactome INSIDER (probably damaging) & MPC > 2

CHD data

2871 CHD probands from Contribution of rare inherited and de novo variants in 2,871 congenital heart disease probands