6166 DNM

  • 7131 DNM in coding windows
  • ALT should be SNV while REF could be whatever.
  • duplicated
1
2
3
4
f = fread("/storage11_7T/fuy/TADA-A/cell_WES/cd_uniform_scaling_factors.txt")
colnames(f)[2] = "z"
f$scaling_factor = f$z * (1441/1068)
fwrite(f[,c(1,3)],"/storage11_7T/fuy/TADA-A/cell_WES/cd_uniform_scaling_factors_1441_d_1068.txt",sep="\t")
1
2
library(tadaA)
setwd("/storage11_7T/fuy/TADA-A")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
system.time(compact_data <- TADA_A_read_info_by_chunks(
mut_files = "/storage11_7T/fuy/TADA-A/cell_WES/DNM/st.6166SNV.cd.window.cd_mis_pli_syn.7131DNM.allele.bed",

# c("db/MS_data/DNM/Jiang_cases_DNM_with_allele_info.txt",
# "db/MS_data/DNM/Kong_cases_DNM_with_allele_info.txt",
# "db/MS_data/DNM/Michaelson_cases_DNM_with_allele_info.txt",
# "db/MS_data/DNM/Yuen_NM2015_cases_DNM_with_allele_info.txt",
# "db/MS_data/DNM/Wu_cases_DNM_with_allele_info.txt"),

window_file = 'db/MS_data/windows_partition/cd_windows_with_div_score.bed',

# mutrate_scaling_files = c("/storage11_7T/fuy/TADA-A/cell_WES/cd_uniform_scaling_factors_1441_d_1692.txt"),
# mutrate_scaling_files = c("/storage11_7T/fuy/TADA-A/cell_WES/cd_uniform_scaling_factors.txt"),
mutrate_scaling_files = c("/storage11_7T/fuy/TADA-A/cell_WES/cd_uniform_scaling_factors_1441_d_1068.txt"),

# sample_sizes = 6430,
sample_sizes = 4059,

# mutrate_scaling_files = c("db/MS_data/results/Jiang_windows_mutrate_with_div_score_scaling_file_for_test_DNM.txt",
# "db/MS_data/results/Kong_windows_mutrate_with_div_score_scaling_file_for_test_DNM.txt",
# "db/MS_data/results/Michaelson_windows_mutrate_with_div_score_scaling_file_for_test_DNM.txt",
# "db/MS_data/results/Yuen_NM2015_windows_mutrate_with_div_score_scaling_file_for_test_DNM.txt",
# "db/MS_data/results/Wu_windows_mutrate_with_div_score_scaling_file_for_test_DNM.txt"),

# sample_sizes = c(32,78,10,162,32),


gene_prior_file = "db/MS_data/prior/uniform_gene_prior.txt",

nonAS_noncoding_annotations = c(
"/storage11_7T/fuy/TADA-A/annotation/ccr/ccrs.allchrom.gt90.bed"),
#"/storage11_7T/fuy/TADA-A/annotation/ccr/chr_ccr_gt95_syn_rm.bed"),

AS_noncoding_annotations = list(

# c("/storage11_7T/fuy/TADA-A/annotation/driverMAPS/cd.window.all_cd_syn_alt_A.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/cd.window.all_cd_syn_alt_C.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/cd.window.all_cd_syn_alt_G.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/cd.window.all_cd_syn_alt_T.bed")

c("/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli995_altA.bed",
"/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli995_altC.bed",
"/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli995_altG.bed",
"/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli995_altT.bed"),

c("/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli05-995_altA.bed",
"/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli05-995_altC.bed",
"/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli05-995_altG.bed",
"/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli05-995_altT.bed") ,

c("/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli0-05_altA.bed",
"/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli0-05_altC.bed",
"/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli0-05_altG.bed",
"/storage11_7T/fuy/TADA-A/annotation/pLI/annovar_cd_window_pli0-05_altT.bed") ,

c("/storage11_7T/fuy/TADA-A/db/MS_data/annota/spidex_public_noncommercial_v1_0.tab_alt_A_lower10pct.bed",
"/storage11_7T/fuy/TADA-A/db/MS_data/annota/spidex_public_noncommercial_v1_0.tab_alt_C_lower10pct.bed",
"/storage11_7T/fuy/TADA-A/db/MS_data/annota/spidex_public_noncommercial_v1_0.tab_alt_G_lower10pct.bed",
"/storage11_7T/fuy/TADA-A/db/MS_data/annota/spidex_public_noncommercial_v1_0.tab_alt_T_lower10pct.bed") ,

c("/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/A.cd.MPC2.annovar.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/C.cd.MPC2.annovar.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/G.cd.MPC2.annovar.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/T.cd.MPC2.annovar.bed") ,

c("/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/A.cd.MPC12.annovar.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/C.cd.MPC12.annovar.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/G.cd.MPC12.annovar.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/T.cd.MPC12.annovar.bed") ,

c("/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/A.cd.MPC01.annovar.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/C.cd.MPC01.annovar.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/G.cd.MPC01.annovar.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/annovar/T.cd.MPC01.annovar.bed")

),

report_proportion = 18665/18665,
#chunk_partition_num =1,
chunk = 2,
node_n = 2,
mutrate_ref_files = c("db/MS_data/mutrate/Example_windows_extended_1bp_for_getting_base_level_mutrate.bed.fasta.tri.alt_A.mutrate.bw",
"db/MS_data/mutrate/Example_windows_extended_1bp_for_getting_base_level_mutrate.bed.fasta.tri.alt_C.mutrate.bw",
"db/MS_data/mutrate/Example_windows_extended_1bp_for_getting_base_level_mutrate.bed.fasta.tri.alt_G.mutrate.bw",
"db/MS_data/mutrate/Example_windows_extended_1bp_for_getting_base_level_mutrate.bed.fasta.tri.alt_T.mutrate.bw")

# mutrate_ref_files = c("/storage11_7T/data.for.yuwen/new_mutrate/window.hg19.genome.tri2.alt_A.uq.bedGraph.sort.bw" ,
# "/storage11_7T/data.for.yuwen/new_mutrate/window.hg19.genome.tri2.alt_C.uq.bedGraph.sort.bw" ,
# "/storage11_7T/data.for.yuwen/new_mutrate/window.hg19.genome.tri2.alt_G.uq.bedGraph.sort.bw" ,
# "/storage11_7T/data.for.yuwen/new_mutrate/window.hg19.genome.tri2.alt_T.uq.bedGraph.sort.bw" )


))

saveRDS(compact_data,paste0("/storage11_7T/fuy/TADA-A/cell_WES/DNM/",Sys.Date(),"_SNV_6166_4059_fam_ccr_pli_splidex_MPC_sf_1441_d_1068_compact.rds"))
 user    system   elapsed 
3.680     4.317 12555.745 
1
compact_data = readRDS("/storage11_7T/fuy/TADA-A/cell_WES/DNM/2020-11-13_SNV_6166_4059_fam_ccr_pli_splidex_MPC_sf_1441_d_1068_compact.rds")
  • obs
1
2
3
4
5
6
7
8
9
10
11
12
13
s = 0
for(j in 1:length(compact_data$base_info)){

lg = length(compact_data$base_info[[j]])
for(i in 1:lg){
if(compact_data$base_info[[j]][[i]]$feature_vector[1] == 1){
s = s + compact_data$base_info[[j]][[i]]$sum_mut_count
}
}
}
s

# sum_mut_rate

280

  • exp
1
2
3
4
5
6
7
8
9
10
11
s = 0
for(j in 1:length(compact_data$base_info)){

lg = length(compact_data$base_info[[j]])
for(i in 1:lg){
if(compact_data$base_info[[j]][[i]]$feature_vector[1] == 1){
s = s + compact_data$base_info[[j]][[i]]$sum_mut_rate
}
}
}
s

181.259045494564

MPC0-2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
tm = proc.time()
dfn = c()
for(i in 7:8){

RR = TADA_A_RR_estimate(data = compact_data$base_info, selected_annotations = c(i),
# gene_prior_file = "db/MS_data/prior/Example_gene_prior.txt",
gene_prior_file = "db/MS_data/prior/uniform_gene_prior.txt",
optimization_iteration = 2000)

dfn = rbind(dfn,RR$rr_report)
}
proc.time() - tm
# saveRDS(RR,paste0("/storage11_7T/fuy/TADA-A/annotation/results/",Sys.Date(),"_wes_deepsea_brain_RR.rds"))

dfn
1
dfn
A data.frame: 2 × 3
logRRlower_boundupper_bound
<dbl><dbl><dbl>
1.4654351.2002041.730666
1.2471261.1075931.386660

MPC2-calibrated

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
tm = proc.time()
dfn = c()
for(i in 1){

RR = TADA_A_RR_estimate(data = compact_data$base_info, selected_annotations = c(i),
# gene_prior_file = "db/MS_data/prior/Example_gene_prior.txt",
gene_prior_file = "db/MS_data/prior/uniform_gene_prior.txt",
optimization_iteration = 2000)

dfn = rbind(dfn,RR$rr_report)
}
proc.time() - tm
# saveRDS(RR,paste0("/storage11_7T/fuy/TADA-A/annotation/results/",Sys.Date(),"_wes_deepsea_brain_RR.rds"))

dfn

syn-calibrated

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
tm = proc.time()
dfn = c()
for(i in 1){

RR = TADA_A_RR_estimate(data = compact_data$base_info, selected_annotations = c(i),
# gene_prior_file = "db/MS_data/prior/Example_gene_prior.txt",
gene_prior_file = "db/MS_data/prior/uniform_gene_prior.txt",
optimization_iteration = 2000)

dfn = rbind(dfn,RR$rr_report)
}
proc.time() - tm
# saveRDS(RR,paste0("/storage11_7T/fuy/TADA-A/annotation/results/",Sys.Date(),"_wes_deepsea_brain_RR.rds"))

dfn
[1] "Read in gene prior file. Time consumed: 0.003s."
[1] "Finished optimization. Time consumed: 543.98.s"
[1] "Got confidence intervals of RR estimates."
[1] "Finished RR estimation!"



    user   system  elapsed 
3646.580    4.260  543.993 
A data.frame: 1 × 3
logRRlower_boundupper_bound
<dbl><dbl><dbl>
0.1557098-2.7656213.077041
1
exp(0.155)

1.16765796110512

syn-non-calibrated

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
tm = proc.time()
dfn = c()
for(i in 1){

RR = TADA_A_RR_estimate(data = compact_data$base_info, selected_annotations = c(i),
# gene_prior_file = "db/MS_data/prior/Example_gene_prior.txt",
gene_prior_file = "db/MS_data/prior/uniform_gene_prior.txt",
optimization_iteration = 2000)

dfn = rbind(dfn,RR$rr_report)
}
proc.time() - tm
# saveRDS(RR,paste0("/storage11_7T/fuy/TADA-A/annotation/results/",Sys.Date(),"_wes_deepsea_brain_RR.rds"))

dfn
[1] "Read in gene prior file. Time consumed: 0.004s."
[1] "Finished optimization. Time consumed: 339.683.s"
[1] "Got confidence intervals of RR estimates."
[1] "Finished RR estimation!"



   user  system elapsed 
 2279.4     3.3   339.7 
A data.frame: 1 × 3
logRRlower_boundupper_bound
<dbl><dbl><dbl>
1.7101321.5379641.882301
1
2
3
4
RR = TADA_A_RR_estimate(data = compact_data$base_info, selected_annotations = c(1,2,3,4,5), 
# gene_prior_file = "db/MS_data/prior/Example_gene_prior.txt",
gene_prior_file = "db/MS_data/prior/uniform_gene_prior.txt",
optimization_iteration = 2000)
[1] "Read in gene prior file. Time consumed: 0.004s."
[1] "Finished optimization. Time consumed: 244.262.s"
[1] "Got confidence intervals of RR estimates."
[1] "Finished RR estimation!"



    user   system  elapsed 
1578.043    1.948  244.275 
A data.frame: 1 × 3
logRRlower_boundupper_bound
<dbl><dbl><dbl>
2.3475492.1134992.581599
[1] "Read in gene prior file. Time consumed: 0.005s."
[1] "Finished optimization. Time consumed: 8611.198.s"
[1] "Got confidence intervals of RR estimates."
[1] "Finished RR estimation!"
1
2
3
4
RR = TADA_A_RR_estimate(data = compact_data$base_info, selected_annotations = c(1,2,3,4,5,6,7,8), 
# gene_prior_file = "db/MS_data/prior/Example_gene_prior.txt",
gene_prior_file = "db/MS_data/prior/uniform_gene_prior.txt",
optimization_iteration = 2000)
[1] "Read in gene prior file. Time consumed: 0.108s."
[1] "Finished optimization. Time consumed: 36277.807.s"


Warning message in sqrt(c(diag(fisher_info))):
“NaNs produced”


[1] "Got confidence intervals of RR estimates."
[1] "Finished RR estimation!"
1
RR$rr_report
A data.frame: 8 × 3
logRRlower_boundupper_bound
<dbl><dbl><dbl>
0.31703160.056388710.5776745
2.54554132.210312952.8807697
1.58011720.885081912.2751525
-4.8069632 NaN NaN
0.56667290.309492780.8238530
2.02106321.759706352.2824201
1.05223790.728224841.3762510
1.02625450.866854501.1856545
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
options(scipen=200)
tm = proc.time()
BF = TADA_A_get_BFs(data = compact_data$base_info,
selected_annotations = c(1,2,3,4,5,6,7,8),
rr = RR$rr_report$logRR ,
# additional_BF_file = c("/storage11_7T/fuy/TADA-A/annotation/cd_BF/Example_gene_coding_BF.txt"),
additional_BF_file = c("/storage11_7T/fuy/TADA-A/annotation/cd_BF/uniform_cd_BFs.txt"),
TADA_p0 = 0.94)

g_BF = BF$gene_BF_table
g_BF = g_BF[order(g_BF$FDR_all),]
g_BF2 = g_BF[g_BF$FDR_all <= 0.1,]
proc.time() - tm
nrow(g_BF2)

g_BF2
[1] "Read in additional BF file /storage11_7T/fuy/TADA-A/annotation/cd_BF/uniform_cd_BFs.txt."
[1] "Flagged genes that don't have any bases with any informative annotation."
[1] "Got genenames without bases that have informative rr features."
[1] "Got logBF_noncoding."
[1] "Added coding and non-coding logBF. Time consumed: 2.516s."
[1] "Finished!"



   user  system elapsed 
 13.426   0.000  12.597 

42

A data.table: 42 × 9
genenamelogBF_noncodinglogBF_codinglogBF_allBF_noncodingBF_codingBF_allFDR_codingFDR_all
<chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
SCN2A 39.453126039.453126136230617540700944.000001136230617540700944.000000.940.0000000000000001110223
CHD8 18.177104018.177104 78381916.101801 78381916.101800.940.0000000999379979882420
SLC6A1 16.738907016.738907 18604365.271311 18604365.271310.940.0000003473238534557055
SYNGAP1 16.553519016.553519 15456179.562421 15456179.562420.940.0000005138972116736085
FOXP1 12.746716012.746716 343422.115171 343422.115170.940.0000095345558388526541
ARID1B 12.240544012.240544 207014.480561 207014.480560.940.0000205576897087667021
GRIN2B 10.840328010.840328 51038.094551 51038.094550.940.0000614588857867914840
CHD2 10.603100010.603100 40259.432301 40259.432300.940.0001024004486186447238
ANK2 9.1387770 9.138777 9309.370321 9309.370320.940.0002776964908019177356
SETD5 8.2508860 8.250886 3831.020341 3831.020340.940.0006572037176549195159
DEAF1 7.3383930 7.338393 1538.237591 1538.237590.940.0015140153144113704855
TRIP12 6.8572450 6.857245 950.743861 950.743860.940.0027387800415773488893
DYNC1H1 6.7107680 6.710768 821.201441 821.201440.940.0039681502175744108943
RELN 6.6552500 6.655250 776.851731 776.851730.940.0050967256116610558728
KCNQ3 6.5015310 6.501531 666.161071 666.161070.940.0062887742805260915188
NRXN1 5.8345340 5.834534 341.905491 341.905490.940.0086341009253602732376
PPP2R5D 5.6437660 5.643766 282.524701 282.524700.940.0112167401238590144080
ATP1B1 5.6262650 5.626265 277.623261 277.623260.940.0135611987591693714666
WDFY3 5.2512900 5.251290 190.812281 190.812280.940.0168408920802590039711
BRD4 5.2022780 5.202278 181.685571 181.685570.940.0199680618038671972569
ADCY5 5.0041340 5.004134 149.027931 149.027930.940.0235469905335579827999
NAA15 4.9439690 4.943969 140.326061 140.326060.940.0270417652801548946684
MYT1L 4.8406050 4.840605 126.545921 126.545920.940.0306557631241672033651
DPYSL2 4.7640160 4.764016 117.215741 117.215740.940.0342908867757513863284
ADNP 4.7624960 4.762496 117.037721 117.037720.940.0376415267434866696039
TRAF7 4.6547180 4.654718 105.079561 105.079560.940.0411841104891946183097
STXBP1 4.6315230 4.631523 102.670331 102.670330.940.0445621163077793355778
SUV420H1 4.4942500 4.494250 89.501011 89.501010.940.0482909140365002306772
GRIA2 4.4279340 4.427934 83.758181 83.758180.940.0520592604193321623063
RORB 4.3623390 4.362339 78.440371 78.440370.940.0558731884686136937024
AP2S1 4.3556200 4.355620 77.915121 77.915120.940.0594711985512391336228
ASH1L 4.3142510 4.314251 74.757631 74.757630.940.0630270133653622055547
NCOA1 4.2199910 4.219991 68.032881 68.032880.940.0667891474762082881789
GABRB2 4.1914770 4.191477 66.120371 66.120370.940.0704587136889861104239
ATP2B4 4.1409490 4.140949 62.862461 62.862460.940.0741456464692032135577
NF1 4.1380420 4.138042 62.679941 62.679940.940.0776406592740676582132
MKX 4.1071800 4.107180 60.775091 60.775090.940.0810814268997331194511
CELF4 4.0952960 4.095296 60.057111 60.057110.940.0843922391864393717231
NOL6 4.0278490 4.027849 56.140041 56.140040.940.0878226519139242434164
MYOCD 3.9843110 3.984311 53.748251 53.748250.940.0912694843664599575472
GNAI1 3.8743150 3.874315 48.149721 48.149720.940.0950311066772963064508
CNTNAP4 3.8427970 3.842797 46.655781 46.655780.940.0987537184562236741714
1
2