This vignette provides a step-by-step guide to simulating clonal-level cancer genomes using the cancerSimCraft simulator. The simulation involves generating clonal and subclonal mutations, constructing synthetic genomes, and performing checks to ensure the accuracy of the simulation.
Load the necessary libraries and the simulation data.
# Load libraries
library(igraph)
library(Biostrings)
library(tictoc)
library(cancerSimCraft)
# Load simulation settings
load(file = "./tutorials_data/advanced_4_simulation_setting.RData")
load(file = "./large_tutorials_data/advanced_5_backbone_genome_with_snp.RData")
The update_sim_from_event_table function updates the
simulation based on the provided event table. This step generates
updated segment lists and chromosome lengths for each node in the
tree.
sim_updates <- update_sim_from_event_table(
tree = chisel_tree,
event_table = transformed_event_table,
initial_chr_arm_seg_list = initial_seg_list,
initial_sub_seg_list = initial_sub_seg_list,
initial_chr_lengths = initial_chr_lengths)
## [1] "WGD updates: No remaining segments to process for paternal chr13 in I_MRCA."
Perform a sanity check to ensure that the segment lengths match the expected chromosome lengths for each clone.
clone_check <- list()
for(clone in clone_names){
clone_check[[clone]] <- segments_sanity_check(
segment_list = sim_updates$all_node_segments[[clone]],
chr_lengths = sim_updates$all_node_chr_lengths[[clone]])
}
# An empty list indicates that all segment lengths match expectations.
Calculate the number of single nucleotide variants (SNVs) for each
edge in the tree using the calc_clone_snv_num function.
edge_events_num <- sapply(sim_updates$all_node_events,nrow)
clone_genome_length <- sapply(sim_updates$all_node_chr_lengths, function(x) return(sum(unlist(x))))
mutation_rate = 10^-8
edge_snv_num <- vector()
for(edge in names(edge_events_num)){
time <- edge_events_num[edge]
parent_node <- strsplit(edge, split = "_")[[1]][1]
edge_snv_num[edge] <- calc_clone_snv_num(
time = time,
genome_length = clone_genome_length[parent_node],
mutation_rate = mutation_rate)
}
# Introduce random variation of ±10%
varied_edge_snv_num <- sapply(edge_snv_num, function(snv) {
round(snv * runif(1, min = 0.85, max = 1.15))
})
In this step, we simulate the positions and nucleotide changes of clonal mutations in the genome. By default, the mutations are randomly simulated, which means running this code multiple times will produce different results. However, for reproducibility or to skip this step, we provide a pre-computed example dataset of clonal mutations.
If you want to generate random positions and the nucleotide changes for clonal mutations, run the following code:
# Simulate clonal mutation positions
clonal_mut_locs <- sim_clonal_mutation_pos(
tree = chisel_tree,
chr_lengths = sim_updates$all_node_chr_lengths,
mutation_number = varied_edge_snv_num
)
# Generate the K2P transition matrix
alpha <- 1.21
beta <- 1
K2P_transition_matrix <- generate_K2P_matrix(alpha, beta)
# Simulation clonal mutation nucleotides
clonal_mut_nt <- sim_clonal_mutation_nt(
genome_sequence = genome_with_snp$sim_genome,
seg_list = sim_updates$all_node_segments,
mutation_info = clonal_mut_locs,
nt_transition_matrix = K2P_transition_matrix,
tree = chisel_tree
)
If you want to use a pre-computed example dataset for reproducibility or to skip the random generation step, load the example dataset as follows:
# Load the pre-computed example dataset
clonal_mut_nt <- read.table("./tutorials_data/clonal_mut_nt.tsv", sep = "\t", header = TRUE)
The pre-computed dataset (clonal_mut_nt.tsv) contains the information of clonal mutations for all edges in the tree. It was generated using the same parameters as the random generation step, ensuring reproducibility across different runs.
Generate synthetic genomes for all clones in the dataset.
The genome_with_snp object
(advanced_5_backbone_genome_with_snp) is a pre-processed
data structure that contains a synthetic genome sequence with embedded
single nucleotide polymorphisms (SNPs). It serves as the starting point
for simulating clonal and subclonal mutations in the cancer genome.
For the purpose of reducing computation time and memory usage, the
genome_with_snp object only includes chromosomes 1, 2, and
3. For users interested in performing whole-genome simulations, the
chr_indices in
advanced_5_backbone_genome_with_snp can be modified to
include all 22 autosomes (or other desired chromosomes).
Use synth_tree_genomes to synthesize all clone genomes
at once and stores them in a single R list object. It is simpler and
more convenient for downstream analysis but requires more memory (100 GB
or more for whole genome level analysis with all 22 chromosomes for this
simulation setting).
# Generate synthetic genomes for all nodes in the tree
tic()
all_node_genome <- synth_tree_genomes(
tree = chisel_tree,
root_genome = genome_with_snp$sim_genome,
seg_list = sim_updates$all_node_segments,
mut_table = clonal_mut_nt
)
time_taken <- toc(log = TRUE)
print(time_taken)
# Check chr_lengths
all_chr_length_check <- list()
for(clone in clone_names){
all_chr_length_check[[clone]] <- check_genome_chr_length(
clone_genome = all_node_genome[[clone]],
expected_chr_lengths = sim_updates$all_node_chr_lengths[[clone]]
)
}
# Check inserted mutations
clone_mutations <- acquire_clone_mutations(
mutation_table = clonal_mut_nt,
tree = chisel_tree
)
# remember to exclude the 'I' clone (normal genome)
all_mutation_check <- list()
for(clone in clone_names[-1]){
all_mutation_check[[clone]] <- check_genome_mutations(
clone_genome = all_node_genome[[clone]],
clone_mutation_table = clone_mutations[[clone]],
clone_seg_list = sim_updates$all_node_segments[[clone]])
}
# Check loss segments
all_loss_segment_check <- list()
for(clone in clone_names[-1]){
all_loss_segment_check[[clone]] <- check_loss_segments(
clone_genome = all_node_genome[[clone]],
clone_segments = sim_updates$all_node_segments[[clone]])
}
Process one clone at a time using synth_clone_genome
function. Following codes synthesize genomes, check the simulation
correctness, write the genome to disk, and clear memory before moving to
the next clone.
# Get the root node of the tree
root_name <- names(V(chisel_tree))[degree(chisel_tree, mode = "in") == 0]
# Write the backbone genome into a fasta file format
for(haplotype in c("maternal", "paternal")){
writing_genome <- DNAStringSet(unlist(genome_with_snp$sim_genome[[haplotype]]))
writeXStringSet(
writing_genome,
paste0("./large_tutorials_data/clone_", root_name, "_", haplotype, ".fa")
)
rm(writing_genome)
}
# Freeing Up Memory
rm(genome_with_snp)
rm(ref_genome)
rm(TN28N_snp_list)
rm(TN28N_vcf_list)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3484926 186.2 9568668 511.1 NA 7580140 404.9
## Vcells 7991166 61.0 315482264 2407.0 16384 341807735 2607.8
# Initialize a list to store genome
all_node_genomes <- list()
# Get the order of nodes in the tree using depth-first search (DFS)
dfs_order <- dfs(chisel_tree, root = root_name, mode = "out")$order
# Generate clone mutations (only needs to be done once)
clone_mutations <- acquire_clone_mutations(
mutation_table = clonal_mut_nt,
tree = chisel_tree
)
# Initialize lists to store check results
all_chr_length_check <- list()
all_mutation_check <- list()
all_loss_segment_check <- list()
# Process each clone one at a time
for (i in 2:length(dfs_order)) {
child_node_index <- dfs_order[i]
parent_node_index <- get.adjlist(chisel_tree, mode = "in")[[child_node_index]]
parent_node <- V(chisel_tree)[parent_node_index]$name
child_node <- V(chisel_tree)[child_node_index]$name
# If the parent genome is not in memory, read it from disk
if (is.null(all_node_genomes[[parent_node]])) {
parent_genome <- list()
for (haplotype in c("maternal", "paternal")) {
parent_genome[[haplotype]] <- readDNAStringSet(
paste0("./large_tutorials_data/clone_", parent_node, "_", haplotype, ".fa")
)
}
all_node_genomes[[parent_node]] <- parent_genome
rm(parent_genome)
gc()
}
# Synthesize the genome for the current clone
all_node_genomes[[child_node]] <- synth_clone_genome(
target_clone = child_node,
nearest_genome = all_node_genomes[[parent_node]],
nearest_clone = parent_node,
tree = chisel_tree,
seg_list = sim_updates$all_node_segments,
mut_table = clonal_mut_nt,
verbose = FALSE
)
print(paste0("Synthesize of ", child_node, " genome is finished!"))
if (child_node != "I") { # Exclude the normal genome
# Perform sanity checks for the current clone
all_chr_length_check[[child_node]] <- check_genome_chr_length(
clone_genome = all_node_genomes[[child_node]],
expected_chr_lengths = sim_updates$all_node_chr_lengths[[child_node]]
)
all_mutation_check[[child_node]] <- check_genome_mutations(
clone_genome = all_node_genomes[[child_node]],
clone_mutation_table = clone_mutations[[child_node]],
clone_seg_list = sim_updates$all_node_segments[[child_node]]
)
all_loss_segment_check[[child_node]] <- check_loss_segments(
clone_genome = all_node_genomes[[child_node]],
clone_segments = sim_updates$all_node_segments[[child_node]],
verbose = FALSE
)
}
# Write the genome to disk
for (haplotype in c("maternal", "paternal")) {
writing_genome <- DNAStringSet(unlist(all_node_genomes[[child_node]][[haplotype]]))
writeXStringSet(
writing_genome,
paste0("./large_tutorials_data/clone_", child_node, "_", haplotype, ".fa")
)
rm(writing_genome)
}
# Clear memory
# Get the names of the children of the parent node
children_of_parent <- names(V(chisel_tree))[get.adjlist(chisel_tree, mode = "out")[[parent_node_index]]]
# Check if any of the remaining nodes are children of the parent node
if (i < length(dfs_order)) { # Ensure (i + 1) is within bounds
if (!any(V(chisel_tree)[dfs_order[(i + 1):length(dfs_order)]]$name %in% children_of_parent)) {
all_node_genomes[[parent_node]] <- NULL
cat(paste0("Cleared memory for parent clone ", parent_node, ".\n"))
}
} else {
# If i == length(dfs_order), there are no remaining nodes to process
all_node_genomes[[parent_node]] <- NULL
cat(paste0("Cleared memory for parent clone ", parent_node, " (no remaining nodes).\n"))
}
# Check if the child_node has no further children to synthesize
children_of_child <- names(V(chisel_tree))[get.adjlist(chisel_tree, mode = "out")[[child_node_index]]]
if (length(children_of_child) == 0) {
all_node_genomes[[child_node]] <- NULL
cat(paste0("Cleared memory for child clone ", child_node, " (no further children).\n"))
}
}
## [1] "Synthesize of MRCA genome is finished!"
## Cleared memory for parent clone I.
## [1] "Synthesize of II genome is finished!"
## Cleared memory for child clone II (no further children).
## [1] "Synthesize of IV genome is finished!"
## Cleared memory for parent clone MRCA.
## [1] "Synthesize of V genome is finished!"
## Cleared memory for parent clone IV.
## [1] "Synthesize of VI genome is finished!"
## Cleared memory for child clone VI (no further children).
## [1] "Synthesize of III genome is finished!"
## Cleared memory for parent clone V (no remaining nodes).
## Cleared memory for child clone III (no further children).
Save the final simulation data for future analysis.
save.image(file = "./large_tutorials_data/advanced_6_clonal_level_cancer_genome_simulation.Rmd")