1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
library(data.table)

all = fread("/storage11_7T/fuy/TADA-A/cell_WES/denovo_all.csv")[,c(2,8,13,14,16,17,32,33,34)]
all2 = all[all$Affected_Status == 2,] #### unaff
nrow(all2)
all2 = all2[!is.na(all2$pLI),] ### pLI
nrow(all2)
all2 = all2[all2$Coding,] ### Coding
nrow(all2)

all2 = all2[all2$Variant_type == 'Indel',] ### indel
nrow(all2)

library(tidyr)
all2 = separate(all2,Variant,sep=":",into=c("chr","end","ref","alt"))

all2 = all2[all2$chr %in% as.character(seq(1,22)),] ### autosome
nrow(all2)

all2$chr = paste0("chr",all2$chr)
all2$end = as.numeric(all2$end)
all2$start = all2$end - 1
# tmp = unique(all2[,c(1,13,2,3,4,5,6,12,8)])
# head(tmp,2)
head(all2,2)
# fwrite(all2,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected/172indel_unaffected.cd.auto.no_pli_rm.allinfo.txt",sep="\t")

all3 = all2[all2$VEP_functional_class_canonical_simplified == 'frameshift_variant',]
nrow(all3)
head(all3,2)

# fwrite(all3,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected/128frameshift_unaff_allinfo.txt",sep="\t")

rm nonfs-unaff not in wd

1
2
3
4
5
6
setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected")
unaff.nonfs = fread("44nonframeshift_unaff_allinfo.txt")
unaff.nonfs$start = unaff.nonfs$end - 1
unaff.nonfs2 = unique(unaff.nonfs[,c(1,13,2)])
fwrite(unaff.nonfs2,"44nonframeshift_unaff_unique.bed",sep="\t",col.names=F)
head(unaff.nonfs2)
1
2
3
cd /storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected
bedtools subtract -a 44nonframeshift_unaff_unique.bed -b \
/storage11_7T/fuy/TADA-A/db/MS_data/windows_partition/uq.mg.st.17478gene_hg19.87.auto.exon.wd.bed > not_in_wd.44nonframeshift_unaff_unique.bed
1
2
3
4
5
6
7
8
9
10
notin = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected/not_in_wd.44nonframeshift_unaff_unique.bed")
notin$mg = paste(notin$V1,notin$V3,sep="_")
nrow(notin)
unaff.nonfs$mg = paste(unaff.nonfs$chr,unaff.nonfs$end,sep="_")
nrow(unaff.nonfs)
unaff.nonfs.wd = unaff.nonfs[!(unaff.nonfs$mg %in% notin$mg),]
nrow(unaff.nonfs.wd)
head(unaff.nonfs.wd,2)

fwrite(unaff.nonfs.wd, "/storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected/42in_wd.nonframeshift_unaff.bed",sep="\t")
1
2
3
4
all4 = unaff.nonfs.wd[,c(1,2,6)]
nrow(all4)
all4$mg = paste(all4$chr,all4$end,sep="_")
head(all4,2)

42

A data.table: 2 × 4
chrendGENE_NAMEmg
<chr><int><chr><chr>
chr4 48523229FRYLchr4_48523229
chr1211420400PRB3chr12_11420400
1
2
3
4
5
6
7
8
nonfs_count = c()
for(i in unique(all4$GENE_NAME)){
nonfs_count = c(nonfs_count, nrow(all4[all4$GENE_NAME == i,]))
}

nonfs = data.frame(genename = unique(all4$GENE_NAME),nonfs_count=nonfs_count)
fwrite(nonfs,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected/nonfs_gene_count",sep="\t")
nrow(nonfs)

42

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
library(data.table)
options(digits=22)
N = 2179 ### sample size
df = read.table("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/gene_level_GC_SNV_rate_indel_ct_len.txt",header=T)[,-4]

df = merge(df,nonfs,by="genename",all=T)
df = df[,c(1:3,5,4)]
df[is.na(df$nonfs_count),]$nonfs_count = 0
# df$nonfs_count = df$nonfs_count / (2*N)

df$SNV_2N = 2 * N * df$SNV_rate

res=glm(nonfs_count~SNV_2N,family=poisson(link="log"),data=df)

print(summary(res))

coef=as.numeric(coef(res))
df$nonfs_rate_2N=exp(coef[1] + coef[2]*df$SNV_2N) ##### b
df2 = df[,c(1,4,7)]
head(df2,2)
Call:
glm(formula = nonfs_count ~ SNV_2N, family = poisson(link = "log"), 
    data = df)

Deviance Residuals: 
                    Min                       1Q                   Median  
-0.67401072475467482814  -0.07119659925706947612  -0.06765228295833836114  
                     3Q                      Max  
-0.06511511531568234257   3.23975915589543239648  

Coefficients:
                          Estimate             Std. Error
(Intercept) -6.2828026906051723799  0.1933407844586273738
SNV_2N       0.7279000218031188574  0.2972449860550154810
                           z value             Pr(>|z|)    
(Intercept) -32.496000000000002217 < 0.0000000000000002 ***
SNV_2N        2.448819999999999997             0.014332 *  
---
Signif. codes:  
  0 ‘***’ 0.001000000000000000020817 ‘**’ 0.01000000000000000020817 ‘*’
  0.05000000000000000277556 ‘.’ 0.1000000000000000055511 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 506.60640306394611798  on 17477  degrees of freedom
Residual deviance: 503.79217682117155164  on 17476  degrees of freedom
AIC: 591.79217682117155164

Number of Fisher Scoring iterations: 8
A data.frame: 2 × 3
genenamenonfs_countnonfs_rate_2N
<fct><dbl><dbl>
1A1BG00.002328933656623771893124
2A1CF00.002619080598964510127324
1
head(df,2)
A data.frame: 2 × 7
genenameGC_contentSNV_ratenonfs_countlengthSNV_2Nnonfs_rate_2N
<fct><dbl><dbl><dbl><int><dbl><dbl>
1A1BG0.57064393235147303773400.0000694971091999999968699040060.30286840189359998110060.002328933656623771893124
2A1CF0.36894716828074597403390.0001065103076999999960400096030.46417192095659998152260.002619080598964510127324
1
2
mean(df$SNV_2N)
median(df$SNV_2N)

0.321189451590518

0.2794803710137

1
fwrite(df,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected/nonfs_unaff_mutrate",sep="\t")

rm not in wd for frameshift of unaff

1
2
3
4
5
6
7
setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected")
unaff.fs = fread("128frameshift_unaff_allinfo.txt")
unaff.fs$start = unaff.fs$end - 1
unaff.fs2 = unique(unaff.fs[,c(1,13,2)])
fwrite(unaff.fs2,"127frameshift_unaff_unique.bed",sep="\t",col.names=F)
nrow(unaff.fs2)
head(unaff.fs2,2)

127

A data.table: 2 × 3
chrstartend
<chr><dbl><int>
chr12120135792120135793
chr12 51470314 51470315
1
2
3
cd /storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected
bedtools subtract -a 127frameshift_unaff_unique.bed -b \
/storage11_7T/fuy/TADA-A/db/MS_data/windows_partition/uq.mg.st.17478gene_hg19.87.auto.exon.wd.bed > not_in_wd.127frameshift_unaff_unique.bed
1
2
3
4
5
6
7
8
9
10
notin = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected/not_in_wd.127frameshift_unaff_unique.bed")
notin$mg = paste(notin$V1,notin$V3,sep="_")
nrow(notin)
unaff.fs$mg = paste(unaff.fs$chr,unaff.fs$end,sep="_")
nrow(unaff.fs)
unaff.fs.wd = unaff.fs[!(unaff.fs$mg %in% notin$mg),]
nrow(unaff.fs.wd)
head(unaff.fs.wd,2)

fwrite(unaff.fs.wd, "/storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected/128in_wd.frameshift_unaff.bed",sep="\t")

rm not in wd for frameshift of aff

1
2
3
4
5
6
7
8
9
10
setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel")
aff.fs = fread("458frameshift_623indel_affected.cd.auto.no_pli_rm.allele.txt")
aff.fs$chr = paste0("chr",aff.fs$chr)
aff.fs$start = aff.fs$pos - 1
nrow(aff.fs)
aff.fs2 = unique(aff.fs[,c(1,6,2)])

nrow(aff.fs2)
head(aff.fs2,2)
fwrite(aff.fs2,"458frameshift_aff_unique.bed",sep="\t",col.names=F)

458

458

A data.table: 2 × 3
chrstartpos
<chr><dbl><int>
chr1 110584429110584430
chr15 69080206 69080207
1
2
3
4
cd /storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel
bedtools subtract -a 458frameshift_aff_unique.bed -b \
/storage11_7T/fuy/TADA-A/db/MS_data/windows_partition/uq.mg.st.17478gene_hg19.87.auto.exon.wd.bed > not_in_wd.458frameshift_aff_unique.bed
wc -l not_in_wd.458frameshift_aff_unique.bed
2 not_in_wd.458frameshift_aff_unique.bed
1
2
3
4
5
6
7
8
9
10
notin = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/not_in_wd.458frameshift_aff_unique.bed")
notin$mg = paste(notin$V1,notin$V3,sep="_")
nrow(notin)
aff.fs$mg = paste(aff.fs$chr,aff.fs$pos,sep="_")
nrow(aff.fs)
aff.fs.wd = aff.fs[!(aff.fs$mg %in% notin$mg),]
nrow(aff.fs.wd)
head(aff.fs.wd,2)

fwrite(aff.fs.wd, "/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/456in_wd.frameshift_aff.bed",sep="\t")

2

458

456

A data.table: 2 × 7
chrposrefaltGENE_NAMEstartmg
<chr><int><chr><chr><chr><dbl><chr>
chr1 110584430GC GSTRIP1110584429chr1_110584430
chr15 69080207CTTCGAGTTTGCCTTCACANP32A 69080206chr15_69080207

EM

1
2
df2 = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/unaffected/nonfs_unaff_mutrate")[,c(1,4,7)]
head(df2,2)
A data.table: 2 × 3
genenamenonfs_countnonfs_rate_2N
<chr><int><dbl>
A1BG00.002328933656623770158400
A1CF00.002619080598964510127324
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
setwd("/storage11_7T/fuy/TADA-A/cell_WES/DNM/indel_rate/")

options(scipen=200)
options(digits=22)


genelst = readRDS("/storage11_7T/fuy/TADA-A/cell_WES/DNM/2021-05-21_selected14_estim_pi_all_genes.rds")[,1:2]
mg2 = merge(genelst,df2,by="genename")

fs = fread("/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/456in_wd.frameshift_aff.bed")[,c(1,2,5)]
fs$mg = paste(fs$chr,fs$end,sep="_")

fs_count = c()
for(i in unique(fs$GENE_NAME)){

fs_count = c(fs_count, nrow(fs[fs$GENE_NAME == i,]))
}

df.fs = data.frame(genename = unique(fs$GENE_NAME),fs_count = fs_count )
fwrite(df.fs,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/count_456in_wd_frameshift_gene",sep="\t")

mg2 = merge(mg2,df.fs,by="genename",all=T)
mg2[is.na(mg2$fs_count),]$fs_count = 0
nrow(mg2)

17478

1
sum(mg2$fs_count)

456

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
gama= 20 ### initial gamma
N = 2179 ### sample size
fs_fraction = 128/42 ### fraction of frameshift (count framshifts in unaffected / count indels in unaffected)

repeat{
mydata3 = mg2$prior

mydata4 = mg2$fs_count

mydata5 = fs_fraction * mg2$nonfs_rate_2N

p=(mydata3*dpois(mydata4,gama*mydata5,log=FALSE))/((1-mydata3)*dpois(mydata4,mydata5,log=FALSE)+
mydata3*dpois(mydata4,gama*mydata5,log=FALSE)) #代入公式

tes=gama
gama=sum(p*mydata4)/sum(p*mydata5)
if(abs(gama-tes)/tes < 1e-8) break

}

q0 = 1-p ### risk probability of non-risk genes

mg2$post = p
mg2$q0 = q0

mg2 = mg2[order(mg2$q0),]

FDR = c()
for (i in 1:nrow(mg2)) FDR[i] <- sum(mg2$q0[1:i]) / i
mg2$FDR = FDR

mg3 = mg2[mg2$FDR<0.1,]
nrow(mg3)

gama
log(gama)

172

22.4947944826454

3.1132839261165

1
2
3
annota = c("baseline","SNV","SNV+Indel")
num_risk_genes = c(54,81,172)
data.frame(annota,num_risk_genes)
A data.frame: 3 × 2
annotanum_risk_genes
<fct><dbl>
baseline 54
SNV 81
SNV+Indel172
1
fwrite(mg3,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/res/2021-05-24_indel+snv_risk_ASD_genes.txt",sep="\t")
1
2
3
baseline = readRDS("/storage11_7T/fuy/TADA-A/cell_WES/DNM/2021-05-24_selected14_PP_estim_pi_risk_genes.rds")
nrow(baseline)
fwrite(baseline[,1],"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/2021-05-24_snv_selected14_PP_risk_ASD_genes.txt",sep="\t")

54

1
2
3
4
novel = mg3[!(mg3$genename %in% baseline$genename),]
fwrite(novel,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/res/2021-05-24_novel_indel+snv-baseline_risk_ASD_genes.txt",sep="\t")
fwrite(novel[,1],"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/res/2021-05-24_novel_indel+snv-baseline_risk_ASD_genes.lst",sep="\t")
nrow(novel)

126

1
2
snv$idx = 1:nrow(snv)
snv[30:40]
A data.table: 11 × 5
genenamepriorq0FDRidx
<chr><dbl><dbl><dbl><int>
GFAP 0.94482475540426447135900.055175244595735528641000.0227916257075449456281930
PTK7 0.94472063733701094889740.055279362662989051102610.0238396172222366913018331
UGT1A3 0.94021296896729023018220.059787031032709769817760.0249629739038139744633532
UGT1A4 0.94001027114125512706980.059989728858744872930230.0260243907206300639778433
UGT1A5 0.93976363480025304486530.060236365199746955134690.0270306252641334988917634
UGT1A7 0.93969000013936188064890.060309999860638119351110.0279814645383193449923535
UGT1A100.93958289496167457510720.060417105038325424892780.0288824545522084058513136
UGT1A9 0.93957678958663903845410.060423210413360961545950.0297349074133206395242937
UGT1A1 0.93925217678159012013590.060747823218409879864050.0305510367766124585342538
UGT1A6 0.93777350256242053561100.062226497437579464389050.0313632280756116141318139
PRKAR1B0.92992403882167906647990.070075961178320933520070.0323310464031793393102640
1
2


1
2
3
4
5
6
7
snv = readRDS("/storage11_7T/fuy/TADA-A/cell_WES/DNM/2021-05-21_selected14_estim_pi_risk_genes.rds")
nrow(snv)
nrow(mg3)
indel_snv = mg3[!(mg3$genename %in% snv$genename),]
fwrite(indel_snv,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/res/2021-05-24_93novel_indel_risk_ASD_genes.txt",sep="\t")
fwrite(indel_snv[,1],"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/indel/res/2021-05-24_93novel_indel_risk_ASD_genes.lst",sep="\t")
nrow(indel_snv)

81

172

93

1
2
3
4
snv_pp = snv[!(snv$genename %in% baseline$genename),]
# snv_pp
fwrite(snv_pp,"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/2021-05-24_33snv-baseline_risk_ASD_genes.txt",sep="\t")
fwrite(snv_pp[,1],"/storage11_7T/fuy/TADA-A/cell_WES/DNM/affected/2021-05-24_33snv-baseline_risk_ASD_genes.lst",sep="\t")
1
2