1
2
library(tadaA)
setwd("/storage11_7T/fuy/TADA-A")
1
2
3
4
5
6
7
TADA_A_adjust_mutrate(window_file = 'db/MS_data/windows_partition/cd_windows_with_div_score.bed',
mut_file = '/storage11_7T/fuy/TADA-A/TADA-A/lib/2.bed',
genes = "all",
sample_size = 6430,
mutrate_mode = "regular",
scaling_file_name = "/storage11_7T/fuy/TADA-A/TADA-A/lib/2bed_sf.txt",
scale_features = c("GC_content", "div_score"))
[1] "Made a tmp folder for tmp files."
[1] "Finished calculating mutation count within per genomic window. Time consumed: 1.955s"
[1] "Finished fitting mutation rate model and calculating scaling factors. Time consumed: 3.976s"
[1] "Removed temporary files."
A matrix: 3 × 4 of type dbl
EstimateStd. Errorz valuePr(>|z|)
(Intercept) 0.0018433270.01345905 0.13695820.89106386
GC_content 0.0034250120.01363202 0.25124750.80162276
div_score-0.0223681650.01276638-1.75211450.07975413
1
2
3
4
f = fread("/storage11_7T/fuy/TADA-A/cell_WES/cd_uniform_scaling_factors.txt")
colnames(f)[2] = "z"
f$scaling_factor = f$z * (1465/1692)
fwrite(f[,c(1,3)],"/storage11_7T/fuy/TADA-A/cell_WES/cd_uniform_scaling_factors_1465_d_1692.txt",sep="\t")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
system.time(compact_data <- TADA_A_read_info_by_chunks(
mut_files = "/storage11_7T/fuy/TADA-A/cell_WES/DNM/6166SNV.cd.window.cd_mis_pli_syn.7131DNM.allele.bed",

# c("db/MS_data/DNM/Jiang_cases_DNM_with_allele_info.txt",
# "db/MS_data/DNM/Kong_cases_DNM_with_allele_info.txt",
# "db/MS_data/DNM/Michaelson_cases_DNM_with_allele_info.txt",
# "db/MS_data/DNM/Yuen_NM2015_cases_DNM_with_allele_info.txt",
# "db/MS_data/DNM/Wu_cases_DNM_with_allele_info.txt"),

window_file = 'db/MS_data/windows_partition/cd_windows_with_div_score.bed',

mutrate_scaling_files = c("/storage11_7T/fuy/TADA-A/cell_WES/cd_uniform_scaling_factors_1465_d_1692.txt"),
# mutrate_scaling_files = c("/storage11_7T/fuy/TADA-A/cell_WES/cd_uniform_scaling_factors.txt"),

# sample_sizes = 4265,
sample_sizes = 6430,

# mutrate_scaling_files = c("db/MS_data/results/Jiang_windows_mutrate_with_div_score_scaling_file_for_test_DNM.txt",
# "db/MS_data/results/Kong_windows_mutrate_with_div_score_scaling_file_for_test_DNM.txt",
# "db/MS_data/results/Michaelson_windows_mutrate_with_div_score_scaling_file_for_test_DNM.txt",
# "db/MS_data/results/Yuen_NM2015_windows_mutrate_with_div_score_scaling_file_for_test_DNM.txt",
# "db/MS_data/results/Wu_windows_mutrate_with_div_score_scaling_file_for_test_DNM.txt"),

# sample_sizes = c(32,78,10,162,32),


gene_prior_file = "db/MS_data/prior/uniform_gene_prior.txt",

nonAS_noncoding_annotations = NA, #c(
# "/storage11_7T/fuy/TADA-A/annotation/ccr/ccrs.allchrom.gt90.bed"),
#"/storage11_7T/fuy/TADA-A/annotation/ccr/chr_ccr_gt95_syn_rm.bed"),

AS_noncoding_annotations = list(
# c("/storage11_7T/fuy/TADA-A/annotation/pLI/pLI_nlt_995_gnomad.v2.1.1.all_lofs_snv_alt_A.bed",
# "/storage11_7T/fuy/TADA-A/annotation/pLI/pLI_nlt_995_gnomad.v2.1.1.all_lofs_snv_alt_C.bed",
# "/storage11_7T/fuy/TADA-A/annotation/pLI/pLI_nlt_995_gnomad.v2.1.1.all_lofs_snv_alt_G.bed",
# "/storage11_7T/fuy/TADA-A/annotation/pLI/pLI_nlt_995_gnomad.v2.1.1.all_lofs_snv_alt_T.bed"),


# c("/storage11_7T/fuy/TADA-A/annotation/driverMAPS/all_lof_alt_A.05-995.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/all_lof_alt_C.05-995.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/all_lof_alt_G.05-995.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/all_lof_alt_T.05-995.bed") ,

# c("/storage11_7T/fuy/TADA-A/annotation/driverMAPS/all_lof_alt_A.ls05.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/all_lof_alt_C.ls05.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/all_lof_alt_G.ls05.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/all_lof_alt_T.ls05.bed") ,

# c("/storage11_7T/fuy/TADA-A/annotation/MPC_score/A.cd.annovar.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/C.cd.annovar.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/G.cd.annovar.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/T.cd.annovar.bed")

c("/storage11_7T/fuy/TADA-A/annotation/MPC_score/new/ct.cd.window.MPC2.altA.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/new/ct.cd.window.MPC2.altC.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/new/ct.cd.window.MPC2.altG.bed",
"/storage11_7T/fuy/TADA-A/annotation/MPC_score/new/ct.cd.window.MPC2.altT.bed")

# c("/storage11_7T/fuy/TADA-A/annotation/MPC_score/fordist_constraint_official_mpc_values_v2_MPC_gt2_altA.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/fordist_constraint_official_mpc_values_v2_MPC_gt2_altC.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/fordist_constraint_official_mpc_values_v2_MPC_gt2_altG.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/fordist_constraint_official_mpc_values_v2_MPC_gt2_altT.bed"),

# c("/storage11_7T/fuy/TADA-A/annotation/MPC_score/chr.MPC12_alt_A.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/chr.MPC12_alt_C.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/chr.MPC12_alt_G.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/chr.MPC12_alt_T.bed"),

# c("/storage11_7T/fuy/TADA-A/annotation/MPC_score/chr.MPC01_alt_A.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/chr.MPC01_alt_C.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/chr.MPC01_alt_G.bed",
# "/storage11_7T/fuy/TADA-A/annotation/MPC_score/chr.MPC01_alt_T.bed")

# c("/storage11_7T/fuy/TADA-A/annotation/driverMAPS/cd.window.all_cd_syn_alt_A.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/cd.window.all_cd_syn_alt_C.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/cd.window.all_cd_syn_alt_G.bed",
# "/storage11_7T/fuy/TADA-A/annotation/driverMAPS/cd.window.all_cd_syn_alt_T.bed")
),

report_proportion = 18665/18665,
#chunk_partition_num =1,
chunk = 2,
node_n = 2,
mutrate_ref_files = c("db/MS_data/mutrate/Example_windows_extended_1bp_for_getting_base_level_mutrate.bed.fasta.tri.alt_A.mutrate.bw",
"db/MS_data/mutrate/Example_windows_extended_1bp_for_getting_base_level_mutrate.bed.fasta.tri.alt_C.mutrate.bw",
"db/MS_data/mutrate/Example_windows_extended_1bp_for_getting_base_level_mutrate.bed.fasta.tri.alt_G.mutrate.bw",
"db/MS_data/mutrate/Example_windows_extended_1bp_for_getting_base_level_mutrate.bed.fasta.tri.alt_T.mutrate.bw")

# mutrate_ref_files = c("/storage11_7T/data.for.yuwen/new_mutrate/window.hg19.genome.tri2.alt_A.uq.bedGraph.sort.bw" ,
# "/storage11_7T/data.for.yuwen/new_mutrate/window.hg19.genome.tri2.alt_C.uq.bedGraph.sort.bw" ,
# "/storage11_7T/data.for.yuwen/new_mutrate/window.hg19.genome.tri2.alt_G.uq.bedGraph.sort.bw" ,
# "/storage11_7T/data.for.yuwen/new_mutrate/window.hg19.genome.tri2.alt_T.uq.bedGraph.sort.bw" )


))

saveRDS(compact_data,paste0("/storage11_7T/fuy/TADA-A/annotation/MPC_score/new/",Sys.Date(),"_SNV_6166_sf1465_d_1692_new_MPC2_compact.rds"))
 user   system  elapsed 
1.761    0.420 1231.811 

un-calibrated

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
tm = proc.time()
dfn = c()
for(i in 1){

RR = TADA_A_RR_estimate(data = compact_data$base_info, selected_annotations = c(i),
# gene_prior_file = "db/MS_data/prior/Example_gene_prior.txt",
gene_prior_file = "db/MS_data/prior/uniform_gene_prior.txt",
optimization_iteration = 2000)

dfn = rbind(dfn,RR$rr_report)
}
proc.time() - tm
# saveRDS(RR,paste0("/storage11_7T/fuy/TADA-A/annotation/results/",Sys.Date(),"_wes_deepsea_brain_RR.rds"))

dfn
[1] "Read in gene prior file. Time consumed: 0.004s."
[1] "Finished optimization. Time consumed: 85.106.s"
[1] "Got confidence intervals of RR estimates."
[1] "Finished RR estimation!"



   user  system elapsed 
578.210   0.658  85.120 
A data.frame: 1 × 3
logRRlower_boundupper_bound
<dbl><dbl><dbl>
2.3003812.0630522.537711
  • old version overlapped with annovar
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
tm = proc.time()
dfn = c()
for(i in 1){

RR = TADA_A_RR_estimate(data = compact_data$base_info, selected_annotations = c(i),
# gene_prior_file = "db/MS_data/prior/Example_gene_prior.txt",
gene_prior_file = "db/MS_data/prior/uniform_gene_prior.txt",
optimization_iteration = 2000)

dfn = rbind(dfn,RR$rr_report)
}
proc.time() - tm
# saveRDS(RR,paste0("/storage11_7T/fuy/TADA-A/annotation/results/",Sys.Date(),"_wes_deepsea_brain_RR.rds"))

dfn
[1] "Read in gene prior file. Time consumed: 0.005s."
[1] "Finished optimization. Time consumed: 84.904.s"
[1] "Got confidence intervals of RR estimates."
[1] "Finished RR estimation!"



   user  system elapsed 
575.445   0.815  84.924 
A data.frame: 1 × 3
logRRlower_boundupper_bound
<dbl><dbl><dbl>
2.3270582.0915252.562591
1
log(22)

3.09104245335832

1
2
3
4
RR = TADA_A_RR_estimate(data = compact_data$base_info, selected_annotations = seq(1,6), 
# gene_prior_file = "db/MS_data/prior/Example_gene_prior.txt",
gene_prior_file = "db/MS_data/prior/uniform_gene_prior.txt",
optimization_iteration = 2000)
[1] "Read in gene prior file. Time consumed: 0.004s."
[1] "Finished optimization. Time consumed: 10344.042.s"
[1] "Got confidence intervals of RR estimates."
[1] "Finished RR estimation!"
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
options(scipen=200)
tm = proc.time()
BF = TADA_A_get_BFs(data = compact_data$base_info,
selected_annotations = seq(1),
rr = RR$rr_report$logRR ,
# additional_BF_file = c("/storage11_7T/fuy/TADA-A/annotation/cd_BF/Example_gene_coding_BF.txt"),
additional_BF_file = c("/storage11_7T/fuy/TADA-A/annotation/cd_BF/uniform_cd_BFs.txt"),
TADA_p0 = 0.94)

g_BF = BF$gene_BF_table
g_BF = g_BF[order(g_BF$FDR_all),]
g_BF2 = g_BF[g_BF$FDR_all <= 0.1,]
proc.time() - tm
nrow(g_BF2)

g_BF2
[1] "Read in additional BF file /storage11_7T/fuy/TADA-A/annotation/cd_BF/uniform_cd_BFs.txt."
[1] "Flagged genes that don't have any bases with any informative annotation."
[1] "Got genenames without bases that have informative rr features."
[1] "Got logBF_noncoding."
[1] "Added coding and non-coding logBF. Time consumed: 2.76s."
[1] "Finished!"



   user  system elapsed 
  5.021   0.001   4.311 

13

A data.table: 13 × 9
genenamelogBF_noncodinglogBF_codinglogBF_allBF_noncodingBF_codingBF_allFDR_codingFDR_all
<chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
SCN2A 22.885877022.8858778693807382.1178618693807382.117860.940.000000001802049
SLC6A115.102642015.102642 3622381.993561 3622381.993560.940.000002163372975
CHD8 8.8587630 8.858763 7035.773881 7035.773880.940.000742031662654
KCNQ3 8.1589500 8.158950 3494.515161 3494.515160.940.001672325394328
FOXP1 6.5930360 6.593036 729.993681 729.993680.940.005539952235090
DPYSL2 6.0995190 6.099519 445.643461 445.643460.940.010276834526270
STXBP1 5.7097400 5.709740 301.792441 301.792440.940.015858742102373
DEAF1 4.5598690 4.559869 95.570981 95.570980.940.031481350902192
KDM5B 4.4620010 4.462001 86.660711 86.660710.940.044994908495381
MKX 4.3055380 4.305538 74.109071 74.109070.940.057946308659227
GNAI1 4.2594650 4.259465 70.772081 70.772080.940.069155360478303
AP2S1 4.1705140 4.170514 64.748751 64.748750.940.079627554612711
ELAVL3 3.9159920 3.915992 50.198841 50.198840.940.091799160133585
1
2
3
4
5
6
7
8
l = length(compact_data$base_info)
l

s = 0
for(i in 1:l){
s = s + compact_data$base_info[[i]]$`1`$sum_mut_rate
}
s

17450

190.511983813861

1
2
3
4
5
6
7
8
9
l = length(compact_data$base_info)
l

s = 0
for(i in 1:l){
s = s + compact_data$base_info[[i]]$`1`$sum_mut_count

}
s

17450

282

1
2