diff --git a/DESCRIPTION b/DESCRIPTION index cedcb51..df8713d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,7 +22,9 @@ Imports: utils, gh, curl, - cli + cli, + magrittr, + bnpsd Suggests: devtools, ggplot2, @@ -37,8 +39,7 @@ Suggests: rmarkdown, pandoc, dplyr, - rlang, - magrittr + rlang VignetteBuilder: knitr Config/Needs/website: rmarkdown Remotes: diff --git a/NAMESPACE b/NAMESPACE index 17a3902..b3d5959 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,20 +1,25 @@ # Generated by roxygen2: do not edit by hand -export(tp_TreePPL_json) export(tp_compile) export(tp_data) export(tp_fp_fetch) +export(tp_json_to_phylo) export(tp_list) +export(tp_map_tree) export(tp_model) export(tp_model_names) export(tp_parse) export(tp_parse_host_rep) -export(tp_phylo_2_TreePPL) +export(tp_phylo_to_tpjson) export(tp_run) +export(tp_smc_convergence) export(tp_stored_compiled) export(tp_stored_data) export(tp_stored_model) export(tp_tempdir) export(tp_treeppl) +export(tp_treeppl_json) export(tp_write) +importFrom(bnpsd,tree_reorder) +importFrom(magrittr,"%>%") importFrom(methods,is) diff --git a/R/getter.R b/R/getter.R index 1367883..daa1961 100644 --- a/R/getter.R +++ b/R/getter.R @@ -66,10 +66,9 @@ tp_data <- function(data_input) { # If path exists, import data from file if (!is(res, "try-error") && res) { data <- tp_list(jsonlite::fromJSON(data_input)) - # If path doens't exist + # If path doesn't exist } else if (assertthat::is.string(data_input)) { - res <- - try(get(data_input, treepplr::tp_model_names()), silent = TRUE) + res <- try(get(data_input, treepplr::tp_model_names()), silent = TRUE) # data_input has the name of a known model if (!is(res, "try-error")) { data <- tp_list(find_file(res, "json")) diff --git a/R/post_treatment.R b/R/post_treatment.R index dc1c2a6..3637fde 100644 --- a/R/post_treatment.R +++ b/R/post_treatment.R @@ -5,29 +5,40 @@ #' #' @param treeppl_out a character vector giving the TreePPL json output #' produced by [tp_treeppl]. -#' @param n_runs a [base::integer] giving the number of runs (MCMC) or sweeps (SMC). #' -#' -#' @return A list (n = n_runs) of data frames with the output from inference -#' in TreePPL. +#' @return A data frame with the output from inference in TreePPL. #' @export -tp_parse <- function(treeppl_out, n_runs = 1) { - - if (n_runs == 1) { - treeppl_out <- list(treeppl_out) - } +tp_parse <- function(treeppl_out) { result_df <- list() - for (index in seq_along(treeppl_out)) { - result_df <- - rbind(result_df, data.frame(samples=unlist(treeppl_out[[1]]$samples), - log_weight=unlist(treeppl_out[[1]]$weights))) + for (i in seq_along(treeppl_out)) { + + samples_c <- unlist(treeppl_out[[i]]$samples) + log_weight_c <- unlist(treeppl_out[[i]]$weights) + + if(is.null(names(samples_c))){ + result_df <- rbind(result_df, + data.frame(run = i, + samples = samples_c, + log_weight = log_weight_c, + norm_const = treeppl_out[[i]]$normConst) + ) + } else { + result_df <- rbind(result_df, + data.frame(run = i, + parameter = names(samples_c), + samples = samples_c, + log_weight = log_weight_c, + norm_const = treeppl_out[[i]]$normConst) + ) + } } return(result_df) } + #' Parse TreePPL json output for host repertoire model #' #' @description @@ -37,17 +48,11 @@ tp_parse <- function(treeppl_out, n_runs = 1) { #' @param treeppl_out a character vector giving the TreePPL json output #' produced by [tp_treeppl]. #' -#' @param n_runs a [base::integer] giving the number of runs (MCMC) or sweeps (SMC). 
-#' -#' #' @return A list (n = n_runs) of data frames with the output from inference #' in TreePPL under the host repertoire evolution model. #' @export -tp_parse_host_rep <- function(treeppl_out, n_runs = 1) { - if (n_runs == 1) { - treeppl_out <- list(treeppl_out) - } +tp_parse_host_rep <- function(treeppl_out) { result_list <- list() @@ -82,7 +87,8 @@ tp_parse_host_rep <- function(treeppl_out, n_runs = 1) { "child2_index" ) - for (i in seq_along(output_trppl[1][[1]])) { + #for (i in seq_along(output_trppl[1][[1]])) { + for (i in seq_along(output_trppl$samples)) { res <- data.frame(matrix(ncol = nbr_col, nrow = 0)) colnames(res) <- c( "iteration", @@ -221,3 +227,98 @@ peel_tree <- function(subtree, } result } + + +#' Check for convergence across multiple SMC sweeps/runs +#' +#' @param treeppl_out a character vector giving the TreePPL json output +#' produced by [tp_treeppl]. +#' +#' @returns Variance in the normalizing constants across SMC sweeps. +#' @export +#' +tp_smc_convergence <- function(treeppl_out) { + + output <- tp_parse(treeppl_out) + zs <- output %>% + dplyr::slice_head(n = 1, by = run) %>% + dplyr::pull(norm_const) + + return(var(zs)) +} + + + +#' Find the Maximum A Posteriori (MAP) Tree from weighted samples +#' +#' @param trees_out The list returned by [treepplr::tp_json_to_phylo] +#' (containing $trees and $weights) +#' +#' @returns The MAP tree as a phylo object +#' @export +#' +tp_map_tree <- function(trees_out) { + + trees <- trees_out$trees + weights <- trees_out$weights + + # Handle Log-Weights (Optional Check) + # If weights are negative (log-scale), convert them to probabilities first + if (any(weights < 0)) { + message("Log-weights detected. converting to relative probabilities...") + # Subtract max to avoid underflow/overflow issues + weights <- exp(weights - max(weights)) + } + + # Identify unique topologies + trees_ready <- lapply(trees, function(tree) { + + # normalize edge lengths for the tip reordering + tree$edge.length <- tree$edge.length/max(tree$edge.length) + # Ladderize to fix edge indices + tree_lad <- ladderize_tree(tree) + # Order tip labels as similarly as possible + tree_ord <- bnpsd::tree_reorder(tree_lad, sort(tree_lad$tip.label)) + # Remove edge lengths to only focus on topology + tree_ord$edge.length <- NULL + return(tree_ord) + + }) + + # This compresses the list into unique tree topologies + unique_topologies <- ape::unique.multiPhylo(trees_ready, use.edge.length = FALSE) + + # Map every original tree to a unique topology index + #match_indices <- match(trees_ready, unique_topologies) + match_indices <- attr(unique_topologies, "old.index") + + # Sum weights for each unique topology + # tapply splits the weights by the index and sums them + topology_probs <- tapply(weights, match_indices, sum) + + # Identify the Best Topology + best_index <- as.numeric(names(which.max(topology_probs))) + + # Calculate posterior probability of this MAP topology + map_prob <- max(topology_probs) / sum(topology_probs) + + # Compute Mean Branch Lengths for the MAP Topology + # We take all samples that matched the MAP topology... + matching_indices <- which(match_indices == best_index) + matching_trees <- trees[matching_indices] + matching_weights <- weights[matching_indices] + + # ...and compute a consensus to average their branch lengths. + # Ideally, we should do a weighted average of the lengths, + # but ape::consensus uses simple mean. For most purposes, this is sufficient. 
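+  # Concretely, the code below uses phangorn::allCompat() to get a consensus
+  # topology of the matching samples and then sets each edge length to the
+  # weighted mean of the corresponding branch lengths, using the (normalized)
+  # sample weights computed above.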
+ final_map <- map <- phangorn::allCompat(matching_trees, rooted=TRUE) |> + phangorn::add_edge_length(matching_trees, + fun = function(x) weighted.mean(x, matching_weights)) + + print(paste("MAP Topology found")) + print(paste("Posterior Probability:", round(map_prob, 4))) + print(paste("Based on the topology of", length(matching_indices), + "samples out of", length(trees))) + + return(final_map) +} diff --git a/R/runner.R b/R/runner.R index be7a5e2..a83da11 100644 --- a/R/runner.R +++ b/R/runner.R @@ -105,7 +105,7 @@ #' and manual (sample only at manually defined resampling locations). #' Use 'NULL' to ignore. #' -#' @return TreePPL output in JSON format. +#' @return A list of TreePPL output in parsed JSON format. #' @export tp_treeppl <- @@ -127,8 +127,11 @@ tp_treeppl <- prune = FALSE, subsample = NULL, resample = NULL) { + tp_write(model, model_file_name, data, data_file_name) + if (compile_model) { + tp_compile( model_file_name, samples, @@ -145,6 +148,7 @@ tp_treeppl <- resample ) } + return(tp_run(model_file_name, data_file_name, n_runs)) } @@ -360,7 +364,7 @@ tp_compile <- function(model_file_name = "tmp_model_file", #' #' @param model_file_name a character vector giving a model name. #' @param data_file_name a character vector giving a data name. -#' @param n_runs a [base::integer] giving the number of run (mcmc)/sweap (smc). +#' @param n_runs a [base::integer] giving the number of runs (mcmc)/sweaps (smc). #' #' @details #' @@ -372,16 +376,17 @@ tp_compile <- function(model_file_name = "tmp_model_file", #' a data name. Use a [treepplr::tp_stored_data] name if you have already write #' your data with [treepplr::tp_treeppl]. #' -#' `samples` : The number of samples (mcmc) / particules (smc) during inference. -#' #' `n_runs` : The number of run (mcmc) / sweap (smc) used for the inference. #' -#' @return TreePPL output in JSON format. +#' @return A list of TreePPL output in parsed JSON format. #' @export tp_run <- function(model_file_name = "tmp_model_file", data_file_name = "tmp_data_file", - n_runs = "1") { + n_runs = 1) { + + n_runs_string <- paste0("--sweeps ", n_runs, " ") + # if dir_path = NULL return temp_dir, if not return dir dir_path <- tp_tempdir() @@ -389,10 +394,13 @@ tp_run <- function(model_file_name = "tmp_model_file", # Empty LD_LIBRARY_PATH from R_env for this command specifically # due to conflict with internal env from treeppl self container command <- paste0("LD_LIBRARY_PATH= MCORE_LIBS= ", dir_path, model_file_name, ".exe ", - dir_path, data_file_name, ".json ", n_runs, + dir_path, data_file_name, ".json ", n_runs_string, paste(">", paste0(dir_path, model_file_name, "_out.json") )) system(command) - return(jsonlite::fromJSON(paste0(dir_path, model_file_name, "_out.json"), simplifyVector = FALSE)) + json_out <- readLines(paste0(dir_path, model_file_name, "_out.json")) %>% + lapply(jsonlite::fromJSON, simplifyVector = FALSE) + + return(json_out) } diff --git a/R/tree_convertion.R b/R/tree_convertion.R index e770a6d..f77ecfd 100644 --- a/R/tree_convertion.R +++ b/R/tree_convertion.R @@ -1,25 +1,25 @@ #' Convert phylo obj to TreePPL tree #' #' @description -#' `tp_phylo_2_TreePPL` takes an object of class `phylo` and returns +#' `tp_phylo_to_tpjson` takes an object of class `phylo` and returns #' a TreePPL json str ready to print in a data file for TreePPL use. #' #' @param phylo_tree an object of class [ape::phylo]. 
-#' @param age a string that determine the way the age of the node are
+#' @param age a string that determines the way the age of the nodes are
 #' calculated (default "branch-length").
 #'
 #' * "branch-length" : Root's age = NA, branch-length from the parent branch
-#' * "top-down" : Root's age = 0.0, cumulative branch-length from root
-#' * "down-top" : Leaf's age = 0.0, cumulative branch-length from leaf
+#' * "root-to-tip" : Root's age = 0.0, cumulative branch-length from root
+#' * "tip-to-root" : Leaf's age = 0.0, cumulative branch-length from leaf
 #'
 #' @return A TreePPL json str
 #'
 #' @export
-tp_phylo_2_TreePPL <- function(phylo_tree, age = "") {
+tp_phylo_to_tpjson <- function(phylo_tree, age = "") {
   name <- "tree"
-  root_tree <- tp_phylo_2_tppl_tree(phylo_tree)
+  root_tree <- tp_phylo_to_tppl_tree(phylo_tree)
   if (age != "branch-length") {
     tree <- tree_age_cumul(root_tree[[2]], root_tree[[1]], age)
@@ -29,13 +29,13 @@ tp_phylo_2_TreePPL <- function(phylo_tree, age = "") {
     tree = tree,
     root_index = root_tree[[1]])
-  tp_TreePPL_json(json_df)
+  tp_treeppl_json(json_df)
 }
 #' Convert phylo to a tppl_tree
 #'
 #' @description
-#' `tp_phylo_2_tppl_tree` takes an object of class `phylo` and returns
+#' `tp_phylo_to_tppl_tree` takes an object of class `phylo` and returns
 #' a tppl_tree.
 #'
 #' @param phylo_tree an object of class [ape::phylo].
@@ -43,7 +43,7 @@ tp_phylo_2_TreePPL <- function(phylo_tree, age = "") {
 #' @return A pair (root index, tppl_tree)
 #'
-tp_phylo_2_tppl_tree <- function(phylo_tree) {
+tp_phylo_to_tppl_tree <- function(phylo_tree) {
   name <- deparse(substitute(phylo_tree))
   df_ <- tidytree::as_tibble(phylo_tree)
@@ -88,13 +88,13 @@ tp_phylo_2_tppl_tree <- function(phylo_tree) {
 #' `tree_age_cumul` takes an tppl_tree and returns a json with
 #' branch length accumulation as node's age.
 #'
-#' @param tree an tppl_tree create with [treepplr::tp_phylo_2_tppl_tree].
+#' @param tree a tppl_tree created with [treepplr::tp_phylo_to_tppl_tree].
 #' @param root_index the index of the root in the tppl_tree.
 #' @param age a string that determine the way the age of the node are
 #' calculated (default "branch-length"). 
#' -#' * "top-down" : Root's age = 0.0, cumulative branch-length from root -#' * "down-top" : Leaf's age = 0.0, cumulative branch-length from leaf +#' * "root-to-tip" : Root's age = 0.0, cumulative branch-length from root +#' * "tip-to-root" : Leaf's age = 0.0, cumulative branch-length from leaf #' #' @return A tppl_tree @@ -103,7 +103,7 @@ tree_age_cumul <- function(tree, root_index, age = "branch-length") { down <- TRUE going_left <- c(TRUE, NA) i <- root_index - if (age == "top-down") { + if (age == "root-to-tip") { tree[i, "Age"] <- 0.0 age_cumul[i] <- 0.0 } @@ -118,13 +118,13 @@ tree_age_cumul <- function(tree, root_index, age = "branch-length") { i <- tree[i, "Right"] } going_left <- c(TRUE, going_left) - if (age == "top-down") { + if (age == "root-to-tip") { age_cumul[i] <- age_cumul[parent[1]] + tree[i, "Age"] } } else { down <- FALSE going_left <- going_left[-1] - if (age == "down-top") { + if (age == "tip-to-root") { age_cumul[i] <- 0.0 } } @@ -134,7 +134,7 @@ tree_age_cumul <- function(tree, root_index, age = "branch-length") { if (going_left[1]) { down <- TRUE going_left[1] <- FALSE - if (age == "down-top") { + if (age == "tip-to-root") { age_cumul[i] <- age_cumul[tree[i, "Left"]] + tree[tree[i, "Left"], "Age"] } } else { @@ -150,7 +150,7 @@ tree_age_cumul <- function(tree, root_index, age = "branch-length") { #' Convert a tppl_tree to TreePPL json str #' #' @description -#' `tp_TreePPL_json` takes an tppl_tree create with [treepplr::tp_phylo_2_tppl_tree] +#' `tp_treeppl_json` takes an tppl_tree create with [treepplr::tp_phylo_to_tppl_tree] #' and returns a list ready to be exported with JSON #' #' @param tree an object of class json. @@ -158,7 +158,7 @@ tree_age_cumul <- function(tree, root_index, age = "branch-length") { #' @return A TreePPL json str #' @export #' -tp_TreePPL_json <- function(tree) { +tp_treeppl_json <- function(tree) { pjs_list <- list(rec_tree_list(tree$tree, tree$root_index)) names(pjs_list) <- tree$name @@ -181,3 +181,144 @@ rec_tree_list <- function(tree, row_index) { pjs_list } + + + +#' Convert TreePPL multi-line JSON to R phylo/multiPhylo object with associated +#' weights +#' +#' @description +#' `tp_json_to_phylo` takes the path to a TreePPL json output and returns an +#' object of class `phylo`. +#' +#' @param json_out One of two options: +#' * A list of TreePPL output in parsed JSON format (output from +#' [treepplr::tp_treeppl()]), OR +#' * The full path of the json file containing the TreePPL output. +#' +#' @return A list with two elements: +#' $trees: A 'phylo' object (if one tree) or 'multiPhylo' object (if multiple). +#' $weights: A numeric vector of sample weights. 
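+#'
+#' @examples
+#' \dontrun{
+#' # Minimal usage sketch; the file path is a placeholder, not a file shipped
+#' # with the package.
+#' res <- tp_json_to_phylo("path/to/model_out.json")
+#' res$trees    # 'phylo' or 'multiPhylo' object(s)
+#' res$weights  # normalized sample weights
+#'
+#' # The parsed list returned by tp_treeppl() can be passed instead of a path.
+#' }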
+#' @export +tp_json_to_phylo <- function(json_out) { + + res <- try(file.exists(json_out), silent = TRUE) + # If path exists, import output from file + if (!is(res, "try-error") && res) { + + ## Read lines and parse each line as a separate JSON object + raw_lines <- readLines(json_out, warn = FALSE) + + # Filter out empty lines just in case + raw_lines <- raw_lines[raw_lines != ""] + + json_list <- lapply(raw_lines, function(x) { + jsonlite::fromJSON(x, simplifyVector = FALSE)}) + + # If path doesn't exist, then it should be a list + } else if (is.list(json_out)) { + json_list <- json_out + } else { + stop("Incorrect input format") + } + + ## Process each tree in the list + newick_strings <- sapply(json_list, function(sweep) { + + # loop over all particles within the sweep + samples <- sweep$samples + + sweep_string <- sapply(samples, function(particle) { + + # Extract the root age + root_data <- particle[[1]]$`__data__` + root_age <- root_data$age + + # Start recursion from the children of the root. + # The root itself does not have a branch length in standard Newick, + # but it provides the 'parent_age' for its immediate children. + + left_str <- build_newick_node(root_data$left, root_age) + right_str <- build_newick_node(root_data$right, root_age) + + # Construct final Newick string: (ChildL, ChildR); + paste0("(", left_str, ",", right_str, ");") + }) + + }) + + # 4. Convert to phylo + trees <- ape::read.tree(text = newick_strings) + + # 5. Get weight for each tree + nweight_matrix <- sapply(json_list, function(sweep) { + + nconst <- sweep$normConst + logweights <- unlist(sweep$weights) + log_nw <- nconst + logweights + + }) + + nweights <- as.vector(nweight_matrix) + norm_weights <- exp(nweights - max(nweights)) + + return(list(trees = trees, weights = norm_weights)) +} + + +# 2. Recursive function to build Newick string +# 'parent_age' is passed down from the caller +build_newick_node <- function(node, parent_age) { + + type <- node[["__constructor__"]] + data <- node[["__data__"]] + + # Ensure ages are numeric + parent_age <- as.numeric(parent_age) + + if (type == "Leaf") { + # Rule: Leaf branch length is the age of its parent node + len <- parent_age + + if (is.null(data$label)){ + label <- data$index + } else { + label <- data$label + } + + # Return "Label:Length" + return(paste0(label, ":", len)) + + } else if (type == "Node") { + # Rule: Internal node branch length = Parent Age - Own Age + current_age <- as.numeric(data$age) + len <- parent_age - current_age + + # Recursively process children, passing CURRENT age as their 'parent_age' + left_str <- build_newick_node(data$left, current_age) + right_str <- build_newick_node(data$right, current_age) + + # Return "(Left,Right):Length" + return(paste0("(", left_str, ",", right_str, "):", len)) + } +} + + + +# Function to ladderize tree and correct tip label sequence +ladderize_tree <- function(tree, temp_file = "temp", orientation = "left"){ + if(file.exists(paste0("./", temp_file))){ + stop("The chosen temporary file exists! 
Please choose an other temp_file name") + } + if(orientation == "left"){ + right <- FALSE + }else{ + right <- TRUE + } + tree_temp <- ape::ladderize(tree, right = right) + ape::write.tree(tree_temp, file = paste0("./", temp_file, ".tre")) + tree_lad <- ape::read.tree(paste0("./", temp_file, ".tre")) + file.remove(paste0("./", temp_file, ".tre")) + return(tree_lad) +} + diff --git a/R/treepplr-package.R b/R/treepplr-package.R index fb0505e..c53e54c 100644 --- a/R/treepplr-package.R +++ b/R/treepplr-package.R @@ -2,6 +2,8 @@ "_PACKAGE" ## usethis namespace: start +#' @importFrom bnpsd tree_reorder +#' @importFrom magrittr %>% #' @importFrom methods is ## usethis namespace: end NULL diff --git a/R/utils.R b/R/utils.R index 111133b..37f4dab 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,43 +1,3 @@ -#' Temporary directory for running treeppl -#' -#' @description -#' `tp_tempdir` returns a normalized path for a temporary directory where the -#' executables can read and write temporary files. -#' -#' @param temp_dir NULL, or a path to be used; if NULL, R's [base::tempdir] -#' is used. -#' @param sep Better ignored; non-default values are passed to -#' [base::normalizePath]. -#' @param sub Extension for defining a sub-directory within the directory -#' defined by [base::tempdir]. -#' -#' @return Normalized path with system-dependent terminal separator. -#' @export - -tp_tempdir <- function(temp_dir = NULL, - sep = NULL, - sub = NULL) { - if (is.null(sep)) { - sep <- sep() - if (is.null(temp_dir)) - temp_dir <- tempdir() - temp_dir <- normalizePath(temp_dir, sep) - if (substring(temp_dir, nchar(temp_dir)) != sep) { - temp_dir <- paste0(temp_dir, sep) - } - if (!is.null(sub)) - temp_dir <- paste0(temp_dir, sub, sep) - if (!dir.exists(temp_dir)) - dir.create(temp_dir) - temp_dir - } -} - -# Platform-dependent separator character -sep <- function() { - .Platform$file.sep -} - #' Fetch the latest version of treeppl #' @export @@ -64,8 +24,11 @@ tp_fp_fetch <- function() { file_name <- list.files(path = system.file(folder_name, package = "treepplr"), full.names = TRUE) # download file if file_name is empty if (length(file_name) == 0) { + # create destination folder + dest_folder <- paste(system.file(package = "treepplr"), folder_name, sep = "/") + system(paste("mkdir", dest_folder)) # download - fn <- paste(system.file(folder_name, package = "treepplr"), asset$name, sep = "/") + fn <- paste(dest_folder, asset$name, sep = "/") curl::curl_download( asset$browser_download_url, destfile = fn, @@ -78,7 +41,7 @@ tp_fp_fetch <- function() { # remove old file file.remove(file_name) # download - fn <- paste(system.file(folder_name, package = "treepplr"), asset$name, sep = "/") + fn <- paste(system.file(package = "treepplr"), folder_name, asset$name, sep = "/") curl::curl_download( asset$browser_download_url, destfile = fn, @@ -112,6 +75,70 @@ installing_treeppl <- function() { tpplc_path } + + +#' Temporary directory for running treeppl +#' +#' @description +#' `tp_tempdir` returns a normalized path for a temporary directory where the +#' executables can read and write temporary files. +#' +#' @param temp_dir NULL, or a path to be used; if NULL, R's [base::tempdir] +#' is used. +#' @param sep Better ignored; non-default values are passed to +#' [base::normalizePath]. +#' @param sub Extension for defining a sub-directory within the directory +#' defined by [base::tempdir]. +#' +#' @return Normalized path with system-dependent terminal separator. 
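+#'
+#' @examples
+#' # Simple sketch of the documented interface; "my_run" is an arbitrary
+#' # sub-directory name.
+#' tp_tempdir()
+#' tp_tempdir(sub = "my_run")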
+#' @export + +tp_tempdir <- function(temp_dir = NULL, + sep = NULL, + sub = NULL) { + if (is.null(sep)) { + sep <- sep() + if (is.null(temp_dir)) + temp_dir <- tempdir() + temp_dir <- normalizePath(temp_dir, sep) + if (substring(temp_dir, nchar(temp_dir)) != sep) { + temp_dir <- paste0(temp_dir, sep) + } + if (!is.null(sub)) + temp_dir <- paste0(temp_dir, sub, sep) + if (!dir.exists(temp_dir)) + dir.create(temp_dir) + temp_dir + } +} + +# Platform-dependent separator character +sep <- function() { + .Platform$file.sep +} + + +#' Model names supported by treepplr +#' +#' @description Provides a list of all model names supported by treepplr. +#' The names can also be used to find data for these models +#' (see [treepplr::tp_data]). +#' +#' @return A list of model names. +#' @export +tp_model_names <- function() { + list( + custom = "custom", + coin = "coin", + hostrep3states = "hostrep3states", + hostrep2states = "hostrep2states", + tree_inference = "tree_inference", + crbd = "crbd", + clads = "clads" + ) +} + + # Find model and data files for model_name find_file <- function(model_name, exten) { if (exten == "tppl") { @@ -185,25 +212,6 @@ stored_files <- function(exten) { list } -#' Model names supported by treepplr -#' -#' @description Provides a list of all model names supported by treepplr. -#' The names can also be used to find data for these models -#' (see [treepplr::tp_data]). -#' -#' @return A list of model names. -#' @export -tp_model_names <- function() { - list( - custom = "custom", - coin = "coin", - hostrep3states = "hostrep3states", - hostrep2states = "hostrep2states", - tree_inference = "tree_inference", - crbd = "crbd", - clads = "clads" - ) -} #' Create a flat list #' @@ -229,3 +237,6 @@ tp_list <- function(...) { dotlist } + + + diff --git a/inst/extdata/README.txt b/inst/extdata/README.txt new file mode 100644 index 0000000..1f7c7e6 --- /dev/null +++ b/inst/extdata/README.txt @@ -0,0 +1,17 @@ +### The files included in this folder come from the treeppl repository. 
+ + +## Tree inference + +model from https://github.com/treeppl/treeppl/blob/main/models/tree-inference/tree_inference_pruning.tppl + +data from https://github.com/treeppl/treeppl/blob/main/models/tree-inference/data/testdata_tree_inference_pruning.json + + +## Host repertoire + +model from https://github.com/treeppl/treeppl/blob/main/models/host-repertoire-evolution/flat-root-prior-HRM.tppl + +data from https://github.com/treeppl/treeppl/blob/main/models/host-repertoire-evolution/data/testdata_flat-root-prior-HRM.json + + diff --git a/inst/extdata/hostrep3states.json b/inst/extdata/hostrep3states.json index fb63080..b9f5c88 100644 --- a/inst/extdata/hostrep3states.json +++ b/inst/extdata/hostrep3states.json @@ -2,25 +2,53 @@ "symbiont_tree": { "__constructor__": "Node", "__data__": { - "label": 7, - "age": 2.0, + "label": 11, + "age": 10.0, "left": { "__constructor__": "Node", "__data__": { - "label": 6, - "age": 0.735218, + "label": 9, + "age": 3.360348, "left": { - "__constructor__": "Leaf", + "__constructor__": "Node", "__data__": { - "label": 4, - "age": 0.0 + "label": 7, + "age": 1.279190, + "left": { + "__constructor__": "Leaf", + "__data__": { + "label": 1, + "age": 0.0 + } + }, + "right": { + "__constructor__":"Leaf", + "__data__": { + "label":2, + "age":0.0 + } + } } }, "right": { - "__constructor__":"Leaf", + "__constructor__": "Node", "__data__": { - "label":3, - "age":0.0 + "label": 8, + "age": 0.153814, + "left": { + "__constructor__": "Leaf", + "__data__": { + "label": 3, + "age": 0.0 + } + }, + "right": { + "__constructor__": "Leaf", + "__data__": { + "label": 4, + "age": 0.0 + } + } } } } @@ -28,35 +56,38 @@ "right": { "__constructor__": "Node", "__data__": { - "label": 5, - "age": 0.428200, + "label": 10, + "age": 1.079144, "left": { "__constructor__": "Leaf", "__data__": { - "label": 2, + "label": 5, "age": 0.0 } }, "right": { "__constructor__": "Leaf", "__data__": { - "label": 1, + "label": 6, "age": 0.0 } - } + } } - } + } } }, - "ntips":4, - "nhosts":3, - "interactions":[2,0,2, - 2,0,0, - 2,2,0, - 0,2,0], - "host_distances":[0.0000000, 0.2563668, 3.15567, - 0.2563668, 0.0000000, 3.15567, - 3.1556704, 3.1556704, 0.00000], - "dMean":2.189236, - "tune":0.9 + "ntips":6, + "nhosts":5, + "interactions":[2,2,0,0,0, + 2,2,0,0,0, + 0,0,2,2,0, + 0,0,2,2,0, + 0,0,0,0,2, + 0,0,0,0,2], + "host_distances":[0.0,0.8630075756,2.6699063134,2.6699063134,2.6699063134, + 0.8630075756,0.0,2.6699063134,2.6699063134,2.6699063134, + 2.6699063134,2.6699063134,0.0,1.2256551506,1.9598979474, + 2.6699063134,2.6699063134,1.2256551506,0.0,1.9598979474, + 2.6699063134,2.6699063134,1.9598979474,1.9598979474,0.0], + "dMean":4.405579 } diff --git a/inst/extdata/hostrep3states.tppl b/inst/extdata/hostrep3states.tppl index 167ff60..fde6526 100644 --- a/inst/extdata/hostrep3states.tppl +++ b/inst/extdata/hostrep3states.tppl @@ -1,93 +1,29 @@ -/* - * Host repertoire model - * - * This file implements the host repertoire model described by Braga et al - * 2020 (https://doi.org/10.1093/sysbio/syaa019). 
- * - * This version explicitly repays weight associated with proposal - * - * Compile - * tpplc host_repertoire.tppl -m smc-apf - * - * Run - * ./out treeppl/models/data/hostrep.json 10 1 - */ - -/*---------------------------- TYPES --------------------------------*/ - -// TreeLabeled is like the built in Tree type, but with leaf and node labels -type TreeLabeled = - | Leaf{age: Real, label: Int} - | Node{age: Real, left: TreeLabeled, right: TreeLabeled, label: Int} - -/*-------------------- Model-specific types -------------------------*/ - -type ModelParams = - ModelParams{qMatrix: Matrix[Real], dMatrix: Matrix[Real], dMean: Real, beta: Real} - -// Tree decorated with postorder messages from belief propagation. -// Postorder = backwards in time -type MsgTree = - | MsgLeaf{age: Real, label: Int, out_msg: Matrix[Real][]} - | MsgNode{age: Real, label: Int, left: MsgTree, right: MsgTree, - left_in_msg: Matrix[Real][], right_in_msg: Matrix[Real][], out_msg: Matrix[Real][]} - -// Tree decorated with final probabilities from belief propagation, -// tuned so that they are useful for proposals. -type ProbsTree = - | ProbsLeaf{age: Real, label: Int, probs: Matrix[Real][]} - | ProbsNode{age: Real, label: Int, probs: Matrix[Real][], - left: ProbsTree, right: ProbsTree} - -// Event type used to store information about one host-repertoire change -// We keep the from_state here just in case we want to use it for debugging -type Event = Event{age: Real, host: Int, from_state: Int, to_state: Int} - -// Type used to hold a proposed history of state changes for a branch -type ProposedHistory = ProposedHistory{log_debt: Real, events: Event[]} - -// History point type used to store repertoire snapshots at the repertoire -// start ages, which is the time of the event that led to that combination -type HistoryPoint = HistoryPoint{age: Real, repertoire: Int[]} - -// HistoryScore type used to store a vector of history points and a log -// score associated with this history. -type HistoryScore = HistoryScore{log_score: Real, history: HistoryPoint[]} - -// Tree decorated with ancestral host repertoires and branch repertoire histories -type HistoryTree = - | HistoryLeaf{age: Real, label: Int, repertoire: Int[], history: HistoryPoint[], log_rep_debt: Real} - | HistoryNode{age: Real, label: Int, repertoire: Int[], history: HistoryPoint[], log_rep_debt: Real, - left: HistoryTree, right: HistoryTree} - -// Type used to hold the character histories and parameter values to be outputed -type ReturnType = ReturnType{tree: HistoryTree, lambda: Real[], mu: Real, beta: Real} - -/* - * Model - * - * @param lambda: vector of transition rates - * lambda[1]:0->1, lambda[2]:1->0, lambda[3]:1->2, lambda[4]:2->1 - * Transition rates are given as proportions of the total - * off-diagonal rate, that is, the sum of lambdas is 1.0 - * - * @param mu: rescale all rates in lambda with mu. Referred to as the maximum - * rate (or rate, for short) of host repertoire evolution, since - * 0≤mu*lambda_i≤mu for all i (all lambdas <= 1.0). - * - * @param beta: phylogenetic distance parameter, determining strength of the - * host distance effect on host repertoire evolution. 
- */ - -model function mymodel(symbiont_tree: TreeLabeled, ntips: Int, nhosts: Int, interactions: Int[], - host_distances: Real[], dMean: Real, tune: Real) => ReturnType { - - // Set priors for model parameters - assume lambda ~ Dirichlet([1.0,1.0,1.0,1.0]); - assume mu ~ Exponential(10.0); - assume beta ~ Exponential(1.0); - - // Set transition rate matrix for three-state independence model +import "host-rep-lib/lib.tppl" +import "host-rep-lib/container-types.tppl" +import "host-rep-lib/helpers.tppl" +import "host-rep-lib/belief-propagation.tppl" +import "host-rep-lib/dist-helpers.tppl" +import "host-rep-lib/full-model.tppl" +import "host-rep-lib/rb-drift-kernels.tppl" +import "host-rep-lib/independence-model.tppl" + +type ReturnType = ReturnType{ + lambda: Real[], mu: Real, beta: Real, tree: HistoryTree +} + + +model function rejectAccept( + symbiont_tree: TreeLabeled, + ntips: Int, + nhosts: Int, + interactions: Int[], + host_distances: Real[], + dMean: Real +) => ReturnType { + assume lambda ~ Dirichlet([1., 1., 1., 1.]) drift rbLambdaMove(lambda); + assume mu ~ Exponential(10.) drift rbMuMove(mu); + assume beta ~ Exponential(1.) drift rbBetaMove(beta); + let r = mtxCreate(3,3, [0.-lambda[1], lambda[1], 0.0, lambda[2], 0.-(lambda[2]+lambda[3]), lambda[3], @@ -95,535 +31,384 @@ model function mymodel(symbiont_tree: TreeLabeled, ntips: Int, nhosts: Int, inte ); let qMatrix = mtxSclrMul(mu, r); - // Calculate stationary probabilities for the rate matrix q - let stationary_probs = stationaryProbs(lambda); - - let dMatrix = mtxCreate(nhosts, nhosts, host_distances); - - // Package model parameters into an object for convenient calling - let mp = ModelParams{qMatrix = qMatrix, dMatrix = dMatrix, dMean = dMean, beta = beta}; - - let interactions_reals = sint2real(interactions); - let iMatrix = mtxCreate(ntips, nhosts, interactions_reals); - - // Derive params for ancestral repertoire proposals from independence model - let probs_tree = get_proposal_params(symbiont_tree, iMatrix, qMatrix, stationary_probs, tune); - - // Propose a repertoire for the root of the symbiont tree - // based on the state probabilities from the belief propagation - let probRecordSequence = sapply(probs_tree.probs, createProbRecord); - let root_rep = sapply(probRecordSequence, categorical); - - let log_root_rep_debt = seqSumReal(zipWith(categoricalLogScore, root_rep, probRecordSequence)); - // Note: categorical works here because root_rep id 0-based - - let valid_root = any(is2, root_rep); - - if !valid_root { - weight 0.0; - resample; - } - - let log_score_root = - ( log( (3.0 ^ Real(nhosts)) - (2.0 ^ Real(nhosts)) ) ); - - - // Simulate repertoire history on the symbiont tree (left and right daughters) - let left = simulate(probs_tree.left, HistoryPoint{age = probs_tree.age, repertoire = root_rep}, mp, iMatrix); - let right = simulate(probs_tree.right, HistoryPoint{age = probs_tree.age, repertoire = root_rep}, mp, iMatrix); - - // Root node doesn't have any history (no subroot branch) - let historyTree = HistoryNode{age = probs_tree.age, label = probs_tree.label, - repertoire = root_rep, history = [], log_rep_debt = log_root_rep_debt-log_score_root, - left = left, right = right}; - - // Factor in the net debt of the proposed ancestral repertoires - // at nodes and leaves in the historyTree - logWeight( -get_rep_debt(historyTree) ); - - // Return symbiont tree with simulated character history and parameter values - return ReturnType{tree = historyTree, lambda = lambda, mu = mu, beta = beta}; -} - - 
-/*---------------------------- FUNCTIONS --------------------------------*/ - - -/*---------------- Proposal distribution parameters ---------------------*/ - -/* - * get_proposal_params - * - * This functions uses belief propagation under the independence - * model, and an anneling parameter 'tune', to obtain parameters - * for the proposals of ancestral repertoires. - * - * Parameters: - * @param symbiont_tree: symbiont tree - * @param iMatrix: interaction matrix - * @param qMatrix: rate matrix for the independence model - * @param stationary_probs: stationary probabilities for the independence model - * @param tune: parameter in (0,1) determining how much - * we should trust the independence model - * - * Return value: - * A ProbsTree containing the final repertoire state proposal - * probabilities for all nodes in the symbiont tree. - */ -function get_proposal_params(symbiont_tree: TreeLabeled, iMatrix: Matrix[Real], qMatrix: Matrix[Real], - stationary_probs: Matrix[Real], tune: Real) => ProbsTree { - - let msgTree = postorder_msgs(symbiont_tree, iMatrix, qMatrix); - let n = length(getMsgTreeMsg(msgTree)); - let pis = rep(n, stationary_probs); - return final_probs(msgTree, pis, qMatrix, tune); -} - -// Compute postorder messages on the observed tree -function postorder_msgs(tree: TreeLabeled, iMatrix: Matrix[Real], qMatrix: Matrix[Real]) => MsgTree { + let nestedInteractions = nestSeq(interactions, ntips, nhosts); + let postorderTree = postorderTraverse(symbiont_tree, qMatrix, nestedInteractions, nhosts); + + // Sample the root. We assume in both the independence and full model + // that we have a flat prior on the root + let rootPrior = mtxCreate(nhosts, 3, ones(3 * nhosts)); + let rootSamplingProb = mtxElemMul(postorderTree.outMsg, rootPrior); + let initRootRep = suggestRepAligned(rootSamplingProb, 1, nhosts); + let rootRep = suggestRepRS(rootSamplingProb, nhosts, initRootRep, 0); + + // Calculate the debt and excess + let rootLogDebt = getRepertoireDebt(rootRep, rootSamplingProb, nhosts); + let rootLogExcess = -log((3.^(Real(nhosts))) - (2.^Real(nhosts))); + + // This is an aligned point so we can weight by the importance ratio + logWeight rootLogExcess - rootLogDebt; + + // Compute messages to pass to pass to the children of the root + let newMsg = mtxCreate(nhosts, 3, observationMessage(rootRep, 1, nhosts)); + let leftMsg = mtxMul(newMsg, postorderTree.leftKernel); + let rightMsg = mtxMul(newMsg, postorderTree.rightKernel); + + // Construct an object containing the rate matrix in a suitable format for sampling + // from the embedded Markov chain + + let embeddedQMatrix = rateMatrixToEmbeddedMarkovChain(qMatrix); + let modelParams = ModelParams { + beta = beta, + hostMetric = mtxCreate(nhosts, nhosts, host_distances), + embeddedQMatrix = embeddedQMatrix, + meanDist = dMean + }; + let rootAge = postorderTree.age; + let leftRepertoireTree = sampleTreeHistory( + postorderTree.left, nhosts, leftMsg, rootRep, rootAge, modelParams, postorderTree.leftKernel + ); + let rightRepertoireTree = sampleTreeHistory( + postorderTree.right, nhosts, rightMsg, rootRep, rootAge, modelParams, postorderTree.rightKernel + ); - if tree is Leaf { - return MsgLeaf{age = 0.0, label = tree.label, out_msg = observationMessage(mtxGetRow(tree.label, iMatrix))}; + // Construct the root node of the repertoire tree + let historyTree = HistoryNode { + age = symbiont_tree.age, label = symbiont_tree.label, + left = leftRepertoireTree, right = rightRepertoireTree, + repertoire = rootRep, history = [] + }; + return 
ReturnType { + lambda = lambda, + mu = mu, + beta = beta, + tree = historyTree + }; +} + +function suggestRepAligned(msg: Matrix[Real], i: Int, max: Int) => Int[] { + if i <= max { + let param = mtx3ToSeq(msg, i); + assume x ~ Categorical(param) drift categoricalMove(x, param); + return cons(x, suggestRepAligned(msg, i + 1, max)); + } else { + return []; } - - let left = postorder_msgs(tree.left ,iMatrix, qMatrix); - let right = postorder_msgs(tree.right,iMatrix, qMatrix); - - let tMatrixLeft = mtxTrans(mtxExp(mtxSclrMul(tree.age-left.age, qMatrix))); - let tMatrixRight = mtxTrans(mtxExp(mtxSclrMul(tree.age-right.age, qMatrix))); - - let left_in_msg = sapply1(left.out_msg, mtxMul, tMatrixLeft); - let right_in_msg = sapply1(right.out_msg, mtxMul, tMatrixRight); - - let out_msg = messageNormalize(messageElemMul(left_in_msg, right_in_msg)); - - return MsgNode{age= tree.age, label= tree.label, left= left, right= right, - left_in_msg = left_in_msg, right_in_msg = right_in_msg, out_msg = out_msg}; - } -// Compute leaf message from observed interactions -function observationMessage(obs_repertoire: Matrix[Real]) => Matrix[Real][] { - return tapply(obs_repertoire, makeStateMessage); -} - -// Takes a single interaction and creates the message -// Interactions are of type Real because of limitations in boolean comparisons -function makeStateMessage(interaction: Real) => Matrix[Real] { - // NOTE: we do not have a switch statement, perhaps we need in the future? - - if (interaction == 0.0) { - return rvecCreate(3, [1., 0., 0.]); +function suggestRepRS(msg: Matrix[Real], max: Int, initialRep: Int[], depth: Int) => Int[] { + let _REP_REJECTION_DEPTH = 10; + if any(is2, initialRep) { + return initialRep; } else { - if (interaction == 1.0) { - return rvecCreate(3, [0., 1., 0.]); - } else { - if (interaction == 2.0) { - return rvecCreate(3, [0., 0., 1.]); - } else { - return rvecCreate(3, [1., 1., 1.]); - } - } - } -} - -// Compute final probabilities from belief propagation on the observed symbiont tree -function final_probs(tree: MsgTree, parent_msg: Matrix[Real][], qMatrix: Matrix[Real], tune: Real) => ProbsTree { - - let probs = messageNormalize( - messageElemPow(messageElemMul(tree.out_msg, parent_msg), tune) - ); - - if tree is MsgLeaf { - return ProbsLeaf{age= 0.0, label= tree.label, probs = probs}; + if depth < _REP_REJECTION_DEPTH { + let newRep = suggestRepUnaligned(msg, 1, max); + return suggestRepRS(msg, max, newRep, depth + 1); + } else { + // Set weight to zero and return dummy value + weight 0.0; + return initialRep; + } } - - let left_branch = tree.age - getMsgTreeAge(tree.left); - let right_branch = tree.age - getMsgTreeAge(tree.right); - - let tMatrixLeft = mtxExp(mtxSclrMul(left_branch, qMatrix)); - let tMatrixRight = mtxExp(mtxSclrMul(right_branch, qMatrix)); - - let left_parent_msg = sapply1(messageElemMul(parent_msg,tree.left_in_msg), mtxMul, tMatrixLeft); - let right_parent_msg = sapply1(messageElemMul(parent_msg,tree.right_in_msg), mtxMul, tMatrixRight); - - let left = final_probs(tree.left, left_parent_msg, qMatrix, tune); - let right = final_probs(tree.right, right_parent_msg, qMatrix, tune); - - return ProbsNode{age = tree.age, label = tree.label, left = left, right = right, probs = probs}; } - -/*---------------- Model functions -------------------------*/ - -// Simulate an evolutionary history for the host repertoires, from root to leaves -function simulate (tree: ProbsTree, start: HistoryPoint, mp: ModelParams, iMatrix: Matrix[Real]) => HistoryTree { - - // Propose a 
repertoire. The propose mechanism means that the simulation - // should be "penalized" for the probability associated with the draw. - let probs = getProbs(tree); // TODO: is this needed? - let probRecordSequence = sapply(tree.probs, createProbRecord); - let repertoire = sapply(probRecordSequence, categorical); // Note: assume is within function categorical - - let log_rep_debt = seqSumReal(zipWith(categoricalLogScore, repertoire, probRecordSequence)); - // Note: categorical works here because repertoire id 0-based - - let valid_rep = any(is2, repertoire); - if !valid_rep { - weight 0.0; - resample; +function suggestRepUnaligned(msg: Matrix[Real], i: Int, max: Int) => Int[] { + if i <= max { + let param = mtx3ToSeq(msg, i); + assume x ~ Categorical(param); + return paste0([[x], suggestRepUnaligned(msg, i + 1, max)]); + } else { + return []; } - - let stop = HistoryPoint{age = getProbsTreeAge(tree), repertoire = repertoire}; - let history = simulate_history(start, stop, mp); - - if tree is ProbsLeaf { - - let obs_msg = observationMessage(mtxGetRow(tree.label, iMatrix)); - let obsRecordSequence = sapply(obs_msg, createProbRecord); - let log_score = seqSumReal(zipWith(categoricalLogScore, repertoire, obsRecordSequence)); - // Note: categorical works here because repertoire id 0-based - - return HistoryLeaf{age = tree.age, label = tree.label, repertoire = repertoire, - history = history, log_rep_debt = log_rep_debt - log_score}; - } - - let left = simulate(tree.left, stop, mp, iMatrix); - let right = simulate(tree.right, stop, mp, iMatrix); - - return HistoryNode{age = tree.age, label = tree.label, repertoire = repertoire, - history = history, log_rep_debt = log_rep_debt, - left = left, right = right}; - -} - - -// Simulate history conditional on initial repertoire, start time, and end -// time. We first propose events from the independence model and then -// condition the simulation on those events. 
-function simulate_history (from_rep: HistoryPoint, to_rep: HistoryPoint, mp: ModelParams) => HistoryPoint[] { - - let proposed_unordered_events = propose_events(1, from_rep, to_rep, mp.qMatrix); - // order events by decreasing age - let eventSeq = qSort(subAge, proposed_unordered_events.events); - // Simulate repertoire evolution from full model, one event at a time - let history_score = simulate_by_event(from_rep.repertoire, eventSeq, 1, from_rep.age, - to_rep.age, mp, proposed_unordered_events.log_debt); - - // Now we have both debt and score at an alignment point - logWeight(history_score.log_score - proposed_unordered_events.log_debt); - - return history_score.history; } -// Propose events from independence model -function propose_events (host_index: Int, from_rep: HistoryPoint, to_rep: HistoryPoint, qMatrix: Matrix[Real]) => ProposedHistory { - - // End case, have already done all hosts ins the repertoire - if (host_index > length(from_rep.repertoire)) { - return ProposedHistory{log_debt = 0.0, events = []}; - } - let propHist1 = propose_events_for_host(host_index, from_rep.age, to_rep.age, - from_rep.repertoire[host_index], to_rep.repertoire[host_index], qMatrix); - - let propHist2 = propose_events(host_index + 1, from_rep, to_rep, qMatrix); - - return ProposedHistory{log_debt = propHist1.log_debt + propHist2.log_debt, - events = concat(propHist1.events, propHist2.events)}; - - } - -// Propose events for one host -function propose_events_for_host (host_index: Int, from_age: Real, end_age: Real, - from_state: Int, end_state: Int, qMatrix: Matrix[Real]) => ProposedHistory { +function sampleTreeHistory( + tree: MsgTree, + nhosts: Int, + preorderMsg: Matrix[Real], + parentRep: Int[], + parentAge: Real, + modelParams: ModelParams, + branchKernel: Matrix[Real] +) => HistoryTree { + if tree is MsgLeaf { + // We are at a leaf, so we don't need to sample the interactions + let rep = tree.interactions; + // Sample the incident branch + let branchSample = sampleBranch( + parentRep, + rep, + parentAge, + tree.age, + nhosts, + modelParams, + branchKernel, + 0 + ); + + // We are aligned so we can weight the program here (shouldn't matter for MCMC) + if branchSample.success { + logWeight branchSample.logExcess - branchSample.logDebt; + } else { + // We failed to sample a correct branch, set program weight to zero + weight 0.0; + } - let rate = -(mtxGet(from_state + 1, from_state + 1, qMatrix)); + return HistoryLeaf { + age = tree.age, + label = tree.label, + repertoire = rep, + history = branchSample.history + }; + } else { + let samplingProb = mtxElemMul(tree.outMsg, preorderMsg); + let initRep = suggestRepAligned(samplingProb, 1, nhosts); + let rep = suggestRepRS(samplingProb, nhosts, initRep, 0); + let nodeLogDebt = getRepertoireDebt(rep, samplingProb, nhosts); + + // Sample the incident branch + let branchSample = sampleBranch( + parentRep, + rep, + parentAge, + tree.age, + nhosts, + modelParams, + branchKernel, + 0 + ); + + // We are aligned so we can weight the program here (shouldn't matter for MCMC) + if branchSample.success { + logWeight branchSample.logExcess - branchSample.logDebt - nodeLogDebt; + } else { + // We failed to sample a correct branch, set program weight to zero + weight 0.0; + } - // Kill the particle if we are trying to subdivide a time segment that - // is too short. This will come with a big penalty in either case but - // continuing to run the code will result in numerical problems like - // sampling waiting times that are -0. 
and getting debts that are inf, - // both of which are difficult to handle. - if (rate * (from_age - end_age) < 1E-15 && from_state != end_state) { - weight 0.0; - resample; + // Since we sampled a repertoire at the node we should propagate this information to the child nodes + let newMsg = mtxCreate(nhosts, 3, observationMessage(rep, 1, nhosts)); + let leftMsg = mtxMul(newMsg, tree.leftKernel); + let rightMsg = mtxMul(newMsg, tree.rightKernel); + + // Sample the two subtrees + let left = sampleTreeHistory( + tree.left, nhosts, leftMsg, rep, tree.age, modelParams, tree.leftKernel + ); + let right = sampleTreeHistory( + tree.right, nhosts, rightMsg, rep, tree.age, modelParams, tree.rightKernel + ); + + return HistoryNode { + age = tree.age, + label = tree.label, + repertoire = rep, + history = branchSample.history, + left = left, + right = right + }; } - - let t = getWaitingTime(from_state, end_state, rate, from_age, end_age); - let new_age = from_age - t; - - // Calculate debt. Note the special case for the last time segment - // with no events. We use the average weight of all particles with - // this outcome, which should be advantageous for inference - // efficiency. It also saves us from storing the actual draw. - if (new_age < end_age) { - let log_wait_debt = poissonLogScore(0, PoissonParam{rate = rate*(from_age - end_age)}); - return ProposedHistory{log_debt = log_wait_debt, events = []}; - } - - let log_wait_debt = getLogWaitDebt(from_state, end_state, rate, from_age, end_age, t); - let to_states = getToStates(from_state); - let state_probs = seqNormalize([mtxGet(from_state + 1, to_states[1] + 1, qMatrix), - mtxGet(from_state + 1, to_states[2] + 1, qMatrix)]); - - assume new_state_pos ~ Categorical(state_probs); - let new_state = to_states[new_state_pos + 1]; - // Note: Categorical is 0-based, so we need to add 1 do get the correct position - let log_event_debt = categoricalLogScore(new_state_pos, CategoricalParam{probs = state_probs}); - // Note: Use position instead of value - - let event = Event{age = new_age, host = host_index, from_state = from_state, to_state = new_state}; - let propHist = propose_events_for_host(host_index, new_age, end_age, new_state, end_state, qMatrix); - - return ProposedHistory{log_debt = log_wait_debt+log_event_debt+propHist.log_debt, events = concat([event], propHist.events)}; - } -function getLogWaitDebt(from_state: Int, end_state: Int, rate: Real, from_age: Real, end_age: Real, t: Real) => Real { - if (from_state != end_state) { - return log_pdf_exponential_max_t(t, rate, from_age - end_age); - } else { - return exponentialLogScore(t, ExponentialParam{rate = rate}); - } -} - -function getWaitingTime(from_state: Int, end_state: Int, rate: Real, from_age: Real, end_age: Real) => Real { - if (from_state != end_state) { - return sample_exponential_max_t(rate, from_age - end_age); - } else { - assume t ~ Exponential(rate); - return t; - } -} - -function getToStates(state: Int) => Int[] { - if (Real(state) == 0.0) { - return [1,2]; - } else { - if (Real(state) == 1.0) { - return [0,2]; - } else { - if (Real(state) == 2.0) { - return [0,1]; - } else { - return error("Invalid state"); +function sampleBranch( + startRep: Int[], + finalRep: Int[], + startAge: Real, + finalAge: Real, + nhosts: Int, + modelParams: ModelParams, + branchKernel: Matrix[Real], + rejectionDepth: Int +) => CorrectedBranchSample { + let _MAX_REJECTION_DEPTH_BRANCH = 100; + if rejectionDepth <= _MAX_REJECTION_DEPTH_BRANCH { + // Sample the events per host + let unorderedBranch = 
sampleUnorderedBranch( + startRep, finalRep, startAge, finalAge, 1, nhosts, modelParams.embeddedQMatrix, 1 + ); + if unorderedBranch.success { + + // All host histories were sampled correctly, + // compile the events for all hosts and sort them + let allHostEvents = paste0(unorderedBranch.history); + let orderedEvents = qSort(compAge, allHostEvents); + let nEvents = length(orderedEvents); + if allTimesValidBranch(startRep, orderedEvents, 1, nEvents, nhosts) { + // The branch is correct along the full history, + // accept and calculate the true weight of the sample + + // Calculate the debt -- this is the density of the endpoint conditioned + // (parallel) chains at the current sample. + let logDebt = independenceLikelihoodEndCond( + startRep, finalRep, startAge, finalAge, unorderedBranch.history, modelParams, branchKernel + ); + // Calculate the likelihood of the path under the full model + let logExcess = fullModelWeight( + 1, startRep, finalRep, startAge, finalAge, orderedEvents, nEvents, nhosts, modelParams + ); + // Return the branch sample with both the debt and the excess + return CorrectedBranchSample { + history = orderedEvents, + logDebt = logDebt, + logExcess = logExcess, + success = true + }; } - } - } -} - -// Sample from truncated exponential distribution with rate 'rate' -// but with support only on (0, max_t). -function sample_exponential_max_t(rate: Real, max_t: Real) => Real { - let u_min = exp (-(rate * max_t)); - assume u ~ Uniform (u_min, 1.0); - return ((-(log(u))) / rate); -} - -// Compute log PDF for a value from the truncated exponential -// The truncation multiplies the pdf with 1/(1-u_min). -function log_pdf_exponential_max_t(value : Real, rate: Real, max_t: Real) => Real { - let u_min = exp(-(rate * max_t)); - return ( (-(rate * value * log(rate))) - log(1.0-u_min)); -} - -// Simulate repertoire evolution from full model, one event -// at a time. We return the sequence of repertoires. We -// accumulate scores here rather than introducing observe -// statements, as those observes would not be aligned. -function simulate_by_event (repertoire: Int[], eventSeq: Event[], event_index: Int, - from_age: Real, end_age: Real, mp: ModelParams, log_debt: Real) => HistoryScore { - - // End case, deal with last time segment - if event_index > length(eventSeq) { - let change_rate = total_rate(repertoire, mp); - return HistoryScore{log_score = poissonLogScore(0, - PoissonParam{rate = change_rate*(from_age-end_age)}), history = []}; - } - - // Typical case - let the_event = eventSeq[event_index]; - let rate = getRate(repertoire, the_event.host, the_event.to_state, mp); - let change_rate = total_rate(repertoire, mp); - - // Note that the first statement will capture the cases where the proposed - // history is impossible because the repertoire we are supposed to go to is - // empty (the rate is zero). 
- if (rate == 0.0) { - weight 0.0; - resample; + } + // Either one of the hosts failed to be sampled correctly + // or the history was invalid somewhere along the branch + return sampleBranch( + startRep, + finalRep, + startAge, + finalAge, + nhosts, + modelParams, + branchKernel, + rejectionDepth + 1 + ); + } else { + // We ran out of rejection depth for the branch + return CorrectedBranchSample { + history = [], + logDebt = -log(0.0), + logExcess = log(0.0), + success = false + }; } - let exp_log_prob = exponentialLogScore(from_age - the_event.age, ExponentialParam{rate=change_rate}); - let logScore = exp_log_prob + log(rate/change_rate); - - // Update repertoire with the change in the_event - let new_repertoire = sapplyi1(repertoire, getNewState, the_event); - let hp = HistoryPoint{age = the_event.age, repertoire = new_repertoire}; - // Simulate next event - let simHist = simulate_by_event(new_repertoire, eventSeq, event_index + 1, the_event.age, end_age, mp, log_debt); - - return HistoryScore{log_score = simHist.log_score + logScore, history = concat(simHist.history, [hp])}; } -// Update host states -function getNewState(i: Int, state: Int, event: Event) => Int { - if (i == event.host) { - return event.to_state; - } else { - return state; - } +function compAge(left: Event, right: Event) => Int { + if (isNaN(right.eventTime)) { + return -1; + } + if (isNaN(left.eventTime)) { + return 1; + } + if (right.eventTime >= left.eventTime) { + return 1; + } else { + return -1; + } } -// Compute rate for a proposed event -function getRate(repertoire: Int[], host_index: Int, to_state: Int, mp: ModelParams) => Real { - - let from_state = repertoire[host_index]; - let base_rate = mtxGet(from_state + 1, to_state + 1, mp.qMatrix); - - // Losses are easy, no cross-repertoire modification used here - if (from_state > to_state) { - - let n2s = fold(count2s, 0, repertoire); - if (from_state == 2 && n2s == 1) { - return 0.0; - } else { - return base_rate; - } - } else { - - // We have a gain rate, so we need to factor in beta-dependent cross-repertoire effect - if (from_state == 0) { - let current_hosts = whichTrue(sapply(repertoire, is1or2)); - let dist = mtxMean(mtxRowCols(mp.dMatrix, host_index, current_hosts)); - return base_rate * (exp(-(mp.beta*(dist/mp.dMean)))); - +function sampleUnorderedBranch( + startRep: Int[], + finalRep: Int[], + startAge: Real, + finalAge: Real, + hostIndex: Int, + nhosts: Int, + embeddedQMatrix: EmbeddedMarkovChainMatrix, + rejectionDepth: Int +) => BranchSample { + let _MAX_REJECTION_DEPTH_HOST = 100; + // Note that startAge > finalAge since the tree is moving forwards in time + if hostIndex <= nhosts { + let hostHistory = sampleHostHistory( + startRep[hostIndex], finalRep[hostIndex], startAge, finalAge, hostIndex, embeddedQMatrix, 1 + ); + if hostHistory.success { + let otherHostHistories = sampleUnorderedBranch( + startRep, finalRep, startAge, finalAge, hostIndex + 1, nhosts, embeddedQMatrix, 0 + ); + return BranchSample { + history = cons(hostHistory.history, otherHostHistories.history), + success = otherHostHistories.success + }; } else { - let current_hosts = whichTrue(sapply(repertoire, is2)); - let dist = mtxMean(mtxRowCols(mp.dMatrix, host_index, current_hosts)); - return base_rate * (exp(-(mp.beta*(dist/mp.dMean)))); + if rejectionDepth <= _MAX_REJECTION_DEPTH_HOST { + return sampleUnorderedBranch( + startRep, finalRep, startAge, finalAge, hostIndex, nhosts, embeddedQMatrix, rejectionDepth + 1 + ); + } else { + return BranchSample { + history = [], + success = 
false + }; + } } + } else { + return BranchSample { + history = [], + success = true + }; } } -// Compute total rate of change from a repertoire -function total_rate(repertoire: Int[], mp: ModelParams) => Real { - - let n1s = fold(count1s, 0, repertoire); - let totalLoss1 = Real(n1s) * mtxGet(2, 1, mp.qMatrix); - let n2s = fold(count2s, 0, repertoire); - let totalLoss2 = getLoss2(Real(n2s), mp); - let gainsSeq = sapplyi2(repertoire, gainsIf0or1, repertoire, mp); - let totalGains = seqSumReal(gainsSeq); - - return totalLoss1 + totalLoss2 + totalGains; -} +function sampleHostHistory( + startState: Int, + finalState: Int, + startAge: Real, + finalAge: Real, + host: Int, + embeddedQMatrix: EmbeddedMarkovChainMatrix, + stepIndex: Int +) => HostBranchSample { + let totalRate = embeddedQMatrix.totalRates[startState + 1]; + if startState != finalState && stepIndex == 1 { + // If there is a mutation on the interval, sample the first event from a truncated exponential + + let t = sampleTruncatedExponential(totalRate, startAge - finalAge); + let eventTime = startAge - t; + + // Sample the next event + let nextState = sampleNextEvent(startState, embeddedQMatrix); + let param = embeddedQMatrix.transitionProbs[startState + 1]; + + let restOfHistory = sampleHostHistory(nextState, finalState, eventTime, finalAge, host, embeddedQMatrix, stepIndex + 1); + return HostBranchSample { + history = cons( + Event { eventTime = eventTime, fromState = startState, toState = nextState, host = host }, + restOfHistory.history + ), + success = restOfHistory.success + }; -function getLoss2(n2s: Real, mp: ModelParams) => Real { - if n2s <= 1.0 { - let totalLoss2 = 0.0; - return totalLoss2; - } else { - let totalLoss2 = n2s * mtxGet(3, 2, mp.qMatrix); - return totalLoss2; - } -} - -function gainsIf0or1(i: Int, from_state: Int, repertoire: Int[], mp: ModelParams) => Real { - - if (from_state == 0 || from_state == 1) { - return getRate(repertoire, i, from_state + 1, mp); - } else { - return 0.0; - } -} - -// Accumulate the total log debt of proposing the -// ancestral repertoires in the tree. 
-function get_rep_debt(tree: HistoryTree) => Real { - - if tree is HistoryLeaf { - return tree.log_rep_debt; - } else{ - return get_rep_debt(tree.left) + get_rep_debt(tree.right); - } -} - - -/*---------------- Help functions --------------------------*/ - - -function createProbRecord(p: Matrix[Real]) => CategoricalParam { - // This version works on tests, but not here - // let ncol = dim(p)[2]; - // let ps = rep(ncol, p); - // let p_seq = zipWith(getRowVecElem, ps, 1 to ncol); - - // Hard-coded version - let p1 = getRowVecElem(p, 1); - let p2 = getRowVecElem(p, 2); - let p3 = getRowVecElem(p, 3); - let p_seq = [p1,p2,p3]; - return CategoricalParam {probs = p_seq}; -} - -function getRowVecElem(t: Matrix[Real], i: Int) => Real { - return mtxGet(1, i, t); -} - -// Calculate stationary probabilities from lambda values -function stationaryProbs(lambda: Real[]) => Matrix[Real] { - let pi_1 = 1.0 / (1.0 + (lambda[2]/lambda[1]) + (lambda[3]/lambda[4])); - let pi_0 = pi_1 * (lambda[2]/lambda[1]); - let pi_2 = 1.0 - pi_0 - pi_1; - return rvecCreate(3, [pi_0, pi_1, pi_2]); -} - -function getMsgTreeAge(tree: MsgTree) => Real { - return tree.age; -} - -function getMsgTreeMsg(tree: MsgTree) => Matrix[Real][] { - return tree.out_msg; -} - -function getProbsTreeAge(tree: ProbsTree) => Real { - return tree.age; -} - -function getProbs(tree: ProbsTree) => Matrix[Real][]{ - return tree.probs; -} - -function subAge(left: Event, right: Event) => Int { - let diff = right.age - left.age; - if (diff >= 0.0){ - return 1; } else { - return 0 - 1; // type error if I just write -1 - } -} - -function count2s(count: Int, host: Int) => Int { - if (host == 2) { - return count + 1; - } else { - return count; - } -} - -function count1s(count: Int, host: Int) => Int { - if (host == 1) { - return count + 1; - } else { - return count; - } -} + // If the final state does not equal the start state, then we don't want to + // force there to be events in the remaining interval + assume t ~ Exponential(totalRate); + if startAge - t < finalAge { + // We overshoot so we are finished, and just save the debt + if startState == finalState { + return HostBranchSample { + history = [], + success = true + }; + } else { + return HostBranchSample { + history = [], + success = false + }; + } + } else { -function is2(x: Int) => Bool { - return x == 2; + // Sample the next transition + let nextState = sampleNextEvent(startState, embeddedQMatrix); + let param = embeddedQMatrix.transitionProbs[startState + 1]; + // assume nextState ~ Categorical(param); + + // Record the event time + let eventTime = startAge - t; + let restOfHistory = sampleHostHistory(nextState, finalState, eventTime, finalAge, host, embeddedQMatrix, stepIndex + 1); + return HostBranchSample { + history = cons( + Event { eventTime = eventTime, fromState = startState, toState = nextState, host = host }, + restOfHistory.history + ), + success = restOfHistory.success + }; + } + } } -function is1or2(x: Int) => Bool { - return x == 1 || x == 2; +function sampleTruncatedExponential(rate: Real, maxT: Real) => Real { + let uMin = exp(-rate * maxT); + // Assuming directly from the truncated distribution gave numerical errors + assume u_ ~ Uniform(0.0, 1.0); + let u = uMin + u_ * (1. 
- uMin); + let expSample = -log(u) / rate; + return expSample; } diff --git a/inst/extdata/hostrep3states_extant_interactions.csv b/inst/extdata/hostrep3states_extant_interactions.csv deleted file mode 100644 index 9e6d14c..0000000 --- a/inst/extdata/hostrep3states_extant_interactions.csv +++ /dev/null @@ -1,5 +0,0 @@ -"","H1","H2","H3" -"S1",2,0,2 -"S2",2,0,0 -"S3",2,2,0 -"S4",0,2,0 diff --git a/inst/extdata/hostrep3states_host_tree.tre b/inst/extdata/hostrep3states_host_tree.tre deleted file mode 100644 index ec37942..0000000 --- a/inst/extdata/hostrep3states_host_tree.tre +++ /dev/null @@ -1,2 +0,0 @@ -((H1:0.1281834009,H2:0.1281834009):1.4496518,H3:1.577835201); - diff --git a/inst/extdata/hostrep3states_treeRev.tre b/inst/extdata/hostrep3states_treeRev.tre deleted file mode 100644 index 549563b..0000000 --- a/inst/extdata/hostrep3states_treeRev.tre +++ /dev/null @@ -1 +0,0 @@ -((S1[&index=4]:0.428200,S2[&index=3]:0.428200)[&index=5]:1.571800,(S3[&index=2]:0.735218,S4[&index=1]:0.735218)[&index=6]:1.264782)[&index=7]:2.000000; diff --git a/inst/extdata/tree_inference.json b/inst/extdata/tree_inference.json new file mode 100644 index 0000000..9448966 --- /dev/null +++ b/inst/extdata/tree_inference.json @@ -0,0 +1,8 @@ +{"data": +[ + [1, 1, 0, 2, 3, 0, 0, 0, 0, 3, 1, 1, 3, 3, 2], + [1, 1, 0, 2, 0, 1, 0, 0, 0, 3, 1, 0, 1, 1, 0], + [0, 0, 1, 1, 0, 2, 1, 0, 0, 0, 2, 0, 3, 3, 0], + [0, 0, 1, 1, 0, 3, 0, 1, 0, 0, 2, 2, 3, 1, 0] +] +} diff --git a/inst/extdata/tree_inference.tppl b/inst/extdata/tree_inference.tppl new file mode 100644 index 0000000..38aa9b2 --- /dev/null +++ b/inst/extdata/tree_inference.tppl @@ -0,0 +1,155 @@ +// TreePPL script for tree inference under the +// Jukes Cantor model with a strict clock prior. Pruning hard-coded. + +// The nucleotide is specified as +// "A" for adenine - 0 +// "C" for cytosine - 1 +// "G" for guanine - 2 +// "T" for thymine - 3 +// "-" for gaps - 4 + +// TYPES + +type MsgTree = + | Leaf {age: Real, index: Int, msg: Matrix[Real][]} + | Node {age: Real, msg: Matrix[Real][], left: MsgTree, right: MsgTree} + +// FUNCTIONS + +// Randomly sample two indices in the trees vector, to be combined. Avoiding mirror cases. 
+function pickpair(n: Int) => Int[] { + assume i ~ Categorical(rep(n, 1./Real(n))); + let i = i + 1; + assume j ~ Categorical(rep((n-1), 1./Real(n-1))); + let j = j+1; + if (j < i) { return [i, j];} + else { return [i,j+1];} +} + +// Build forest of trees from leaves, recursively +function build_forest(data: Int[][], forest: MsgTree[], index: Int, data_len: Int, seq_len: Int) => MsgTree[] { + let new_message = sapply(data[index], get_leaf_message); + let new_leaf = Leaf{age = 0.0, index = index, msg = new_message}; + let new_forest = paste0([forest, [new_leaf]]); + if (data_len == index) { + return new_forest; + } + else { + return build_forest(data, new_forest, index + 1, data_len, seq_len); + } +} + +// Get message from leaves for each site +function get_leaf_message(seq: Int) => Matrix[Real] { + if (seq == 0) { // "A" + let message = rvecCreate(4, [1.0, 0.0, 0.0, 0.0]); + logWeight (log(0.25)); + return message; + } + if (seq == 1) { // "C" + let message = rvecCreate(4, [0.0, 1.0, 0.0, 0.0]); + logWeight (log(0.25)); + return message; + } + if (seq == 2) { // "G" + let message = rvecCreate(4, [0.0, 0.0, 1.0, 0.0]); + logWeight (log(0.25)); + return message; + } + if (seq == 3) { // "T" + let message = rvecCreate(4, [0.0, 0.0, 0.0, 1.0]); + logWeight (log(0.25)); + return message; + } + if (seq == 4) { // "-" + let message = rvecCreate(4, [1.0, 1.0, 1.0, 1.0]); + return message; + } + else { + return error("Invalid state at leaf"); + } +} + +//Compute log likelihood for each site +function get_log_likes(msg: Matrix[Real]) => Real { + let stationary_probs = cvecCreate(4, [0.25,0.25,0.25,0.25]); + let like = mtxMul(msg, stationary_probs); + let log_like = log(mtxGet(1, 1, like)); + return log_like; +} + +// KEY FUNCTION: CLUSTER +function cluster(q: Matrix[Real], trees: MsgTree[], maxAge: Real, seq_len: Int) => MsgTree[] { + + let n = length(trees); + + // Check if we have reached the root of the tree + if (n == 1) { + return trees; + } + + // Randomly sample two indices in the trees vector with function pickpair + // We will combine these, named left and right child + let pairs = pickpair(n); + let left_child = trees[pairs[1]]; + let right_child = trees[pairs[2]]; + + // Get the age of the new internal node + assume t ~ Exponential(10.0); + let age = maxAge + t; + + // Get incoming messages from children + let tmatrix_left = mtxTrans(mtxExp(mtxSclrMul(age-left_child.age, q))); + let tmatrix_right = mtxTrans(mtxExp(mtxSclrMul(age-right_child.age, q))); + + let left_in_msg = sapply(left_child.msg, mtxMul(_, tmatrix_left)); + let right_in_msg = sapply(right_child.msg, mtxMul(_, tmatrix_right)); + + // Get the message of the new node. 
+ let node_msg = messageElemMul(left_in_msg, right_in_msg); + + // Weights + let log_likes = sapply(node_msg, get_log_likes); + logWeight (seqSumReal(log_likes)); + + // Remove weights of previous internal children + let log_likes_left = sapply(left_child.msg, get_log_likes); + logWeight -(seqSumReal(log_likes_left)); + let log_likes_right = sapply(right_child.msg, get_log_likes); + logWeight -(seqSumReal(log_likes_right)); + + // Manual resample + resample; + + // Combine picked pair of trees into a new node + let parent = Node{age=age, msg=node_msg, left=left_child, right=right_child}; + + // Compute new_trees list + let min = minInt(pairs[2],pairs[1]); + let max = maxInt(pairs[2],pairs[1]); + let new_trees = paste0([slice(trees, 1, min), slice(trees, (min+1), max), slice(trees, (max+1), (n+1)), [parent]]); + + // Recursive call to cluster for new_trees + return cluster(q, new_trees, age, seq_len); +} + +// MODEL FUNCTION + +model function myModel(data: Int[][]) => MsgTree[] { + // Define the scaled rate matrix for Jukes-Cantor + let q = mtxCreate(4,4, + [ -1.0, (1.0/3.0), (1.0/3.0), (1.0/3.0), + (1.0/3.0), -1.0, (1.0/3.0), (1.0/3.0), + (1.0/3.0), (1.0/3.0), -1.0, (1.0/3.0), + (1.0/3.0), (1.0/3.0), (1.0/3.0), -1.0] + ); + + let data_len = length(data); + let seq_len = length(data[1]); + + // Define the initial trees sequence (containing the leaves) + let trees = build_forest(data, [], 1, data_len, seq_len); + + // Build the tree by random clustering and return + return cluster(q, trees, 0.0, seq_len); +} diff --git a/man/tp_TreePPL_json.Rd b/man/tp_TreePPL_json.Rd deleted file mode 100644 index 2ed9bf5..0000000 --- a/man/tp_TreePPL_json.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tree_convertion.R -\name{tp_TreePPL_json} -\alias{tp_TreePPL_json} -\title{Convert a tppl_tree to TreePPL json str} -\usage{ -tp_TreePPL_json(tree) -} -\arguments{ -\item{tree}{an object of class json.} -} -\value{ -A TreePPL json str -} -\description{ -\code{tp_TreePPL_json} takes an tppl_tree create with \link{tp_phylo_2_tppl_tree} -and returns a list ready to be exported with JSON -} diff --git a/man/tp_json_to_phylo.Rd b/man/tp_json_to_phylo.Rd new file mode 100644 index 0000000..63f4c61 --- /dev/null +++ b/man/tp_json_to_phylo.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tree_convertion.R +\name{tp_json_to_phylo} +\alias{tp_json_to_phylo} +\title{Convert TreePPL multi-line JSON to R phylo/multiPhylo object with associated +weights} +\usage{ +tp_json_to_phylo(json_out) +} +\arguments{ +\item{json_out}{One of two options: +\itemize{ +\item A list of TreePPL output in parsed JSON format (output from +\code{\link[=tp_treeppl]{tp_treeppl()}}), OR +\item The full path of the json file containing the TreePPL output. +}} +} +\value{ +A list with two elements: +$trees: A 'phylo' object (if one tree) or 'multiPhylo' object (if multiple). +$weights: A numeric vector of sample weights. +} +\description{ +\code{tp_json_to_phylo} takes the path to a TreePPL json output and returns an +object of class \code{phylo}. 
+} diff --git a/man/tp_map_tree.Rd b/man/tp_map_tree.Rd new file mode 100644 index 0000000..589e2c3 --- /dev/null +++ b/man/tp_map_tree.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/post_treatment.R +\name{tp_map_tree} +\alias{tp_map_tree} +\title{Find the Maximum A Posteriori (MAP) Tree from weighted samples} +\usage{ +tp_map_tree(trees_out) +} +\arguments{ +\item{trees_out}{The list returned by \link{tp_json_to_phylo} +(containing $trees and $weights)} +} +\value{ +The MAP tree as a phylo object +} +\description{ +Find the Maximum A Posteriori (MAP) Tree from weighted samples +} diff --git a/man/tp_parse.Rd b/man/tp_parse.Rd index 7eccfff..da161f8 100644 --- a/man/tp_parse.Rd +++ b/man/tp_parse.Rd @@ -4,17 +4,14 @@ \alias{tp_parse} \title{Parse simple TreePPL json output} \usage{ -tp_parse(treeppl_out, n_runs = 1) +tp_parse(treeppl_out) } \arguments{ \item{treeppl_out}{a character vector giving the TreePPL json output produced by \link{tp_treeppl}.} - -\item{n_runs}{a \link[base:integer]{base::integer} giving the number of runs (MCMC) or sweeps (SMC).} } \value{ -A list (n = n_runs) of data frames with the output from inference -in TreePPL. +A data frame with the output from inference in TreePPL. } \description{ \code{tp_parse} takes TreePPL json output and returns a data.frame diff --git a/man/tp_parse_host_rep.Rd b/man/tp_parse_host_rep.Rd index 9a2f7d4..fb84097 100644 --- a/man/tp_parse_host_rep.Rd +++ b/man/tp_parse_host_rep.Rd @@ -4,13 +4,11 @@ \alias{tp_parse_host_rep} \title{Parse TreePPL json output for host repertoire model} \usage{ -tp_parse_host_rep(treeppl_out, n_runs = 1) +tp_parse_host_rep(treeppl_out) } \arguments{ \item{treeppl_out}{a character vector giving the TreePPL json output produced by \link{tp_treeppl}.} - -\item{n_runs}{a \link[base:integer]{base::integer} giving the number of runs (MCMC) or sweeps (SMC).} } \value{ A list (n = n_runs) of data frames with the output from inference diff --git a/man/tp_phylo_2_TreePPL.Rd b/man/tp_phylo_to_tpjson.Rd similarity index 56% rename from man/tp_phylo_2_TreePPL.Rd rename to man/tp_phylo_to_tpjson.Rd index 610cba8..1677cef 100644 --- a/man/tp_phylo_2_TreePPL.Rd +++ b/man/tp_phylo_to_tpjson.Rd @@ -1,26 +1,26 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tree_convertion.R -\name{tp_phylo_2_TreePPL} -\alias{tp_phylo_2_TreePPL} +\name{tp_phylo_to_tpjson} +\alias{tp_phylo_to_tpjson} \title{Convert phylo obj to TreePPL tree} \usage{ -tp_phylo_2_TreePPL(phylo_tree, age = "") +tp_phylo_to_tpjson(phylo_tree, age = "") } \arguments{ \item{phylo_tree}{an object of class \link[ape:read.tree]{ape::phylo}.} -\item{age}{a string that determine the way the age of the node are +\item{age}{a string that determines the way the age of the nodes are calculated (default "branch-length"). \itemize{ \item "branch-length" : Root's age = NA, branch-length from the parent branch -\item "top-down" : Root's age = 0.0, cumulative branch-length from root -\item "down-top" : Leaf's age = 0.0, cumulative branch-length from leaf +\item "root-to-tip" : Root's age = 0.0, cumulative branch-length from root +\item "tip-to-root" : Leaf's age = 0.0, cumulative branch-length from leaf }} } \value{ A TreePPL json str } \description{ -\code{tp_phylo_2_TreePPL} takes an object of class \code{phylo} and returns +\code{tp_phylo_to_tpjson} takes an object of class \code{phylo} and returns a TreePPL json str ready to print in a data file for TreePPL use. 
} diff --git a/man/tp_phylo_2_tppl_tree.Rd b/man/tp_phylo_to_tppl_tree.Rd similarity index 63% rename from man/tp_phylo_2_tppl_tree.Rd rename to man/tp_phylo_to_tppl_tree.Rd index 77b7d5c..57e25ef 100644 --- a/man/tp_phylo_2_tppl_tree.Rd +++ b/man/tp_phylo_to_tppl_tree.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tree_convertion.R -\name{tp_phylo_2_tppl_tree} -\alias{tp_phylo_2_tppl_tree} +\name{tp_phylo_to_tppl_tree} +\alias{tp_phylo_to_tppl_tree} \title{Convert phylo to a tppl_tree} \usage{ -tp_phylo_2_tppl_tree(phylo_tree) +tp_phylo_to_tppl_tree(phylo_tree) } \arguments{ \item{phylo_tree}{an object of class \link[ape:read.tree]{ape::phylo}.} @@ -13,6 +13,6 @@ tp_phylo_2_tppl_tree(phylo_tree) A pair (root index, tppl_tree) } \description{ -\code{tp_phylo_2_tppl_tree} takes an object of class \code{phylo} and returns +\code{tp_phylo_to_tppl_tree} takes an object of class \code{phylo} and returns a tppl_tree. } diff --git a/man/tp_run.Rd b/man/tp_run.Rd index 511c328..cfc0852 100644 --- a/man/tp_run.Rd +++ b/man/tp_run.Rd @@ -7,7 +7,7 @@ tp_run( model_file_name = "tmp_model_file", data_file_name = "tmp_data_file", - n_runs = "1" + n_runs = 1 ) } \arguments{ @@ -15,10 +15,10 @@ tp_run( \item{data_file_name}{a character vector giving a data name.} -\item{n_runs}{a \link[base:integer]{base::integer} giving the number of run (mcmc)/sweap (smc).} +\item{n_runs}{a \link[base:integer]{base::integer} giving the number of runs (mcmc)/sweaps (smc).} } \value{ -TreePPL output in JSON format. +A list of TreePPL output in parsed JSON format. } \description{ \code{tp_treeppl} execute TreePPL and return TreePPL output (string JSON format). @@ -34,7 +34,5 @@ write your model with \link{tp_treeppl}. a data name. Use a \link{tp_stored_data} name if you have already write your data with \link{tp_treeppl}. -\code{samples} : The number of samples (mcmc) / particules (smc) during inference. - \code{n_runs} : The number of run (mcmc) / sweap (smc) used for the inference. } diff --git a/man/tp_smc_convergence.Rd b/man/tp_smc_convergence.Rd new file mode 100644 index 0000000..732c60c --- /dev/null +++ b/man/tp_smc_convergence.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/post_treatment.R +\name{tp_smc_convergence} +\alias{tp_smc_convergence} +\title{Check for convergence across multiple SMC sweeps/runs} +\usage{ +tp_smc_convergence(treeppl_out) +} +\arguments{ +\item{treeppl_out}{a character vector giving the TreePPL json output +produced by \link{tp_treeppl}.} +} +\value{ +Variance in the normalizing constants across SMC sweeps. +} +\description{ +Check for convergence across multiple SMC sweeps/runs +} diff --git a/man/tp_treeppl.Rd b/man/tp_treeppl.Rd index 4360e8d..f43a9c7 100644 --- a/man/tp_treeppl.Rd +++ b/man/tp_treeppl.Rd @@ -70,7 +70,7 @@ posterior distribution.} method} } \value{ -TreePPL output in JSON format. +A list of TreePPL output in parsed JSON format. } \description{ \code{tp_treeppl} execute TreePPL and return TreePPL output (string JSON format). 
diff --git a/man/tree_age_cumul.Rd b/man/tree_age_cumul.Rd index c8b1f9e..7835d47 100644 --- a/man/tree_age_cumul.Rd +++ b/man/tree_age_cumul.Rd @@ -7,15 +7,15 @@ tree_age_cumul(tree, root_index, age = "branch-length") } \arguments{ -\item{tree}{an tppl_tree create with \link{tp_phylo_2_tppl_tree}.} +\item{tree}{an tppl_tree create with \link{tp_phylo_to_tppl_tree}.} \item{root_index}{the index of the root in the tppl_tree.} \item{age}{a string that determine the way the age of the node are calculated (default "branch-length"). \itemize{ -\item "top-down" : Root's age = 0.0, cumulative branch-length from root -\item "down-top" : Leaf's age = 0.0, cumulative branch-length from leaf +\item "root-to-tip" : Root's age = 0.0, cumulative branch-length from root +\item "tip-to-root" : Leaf's age = 0.0, cumulative branch-length from leaf }} } \value{ diff --git a/tests/testthat/test-tree_convertion.R b/tests/testthat/test-tree_convertion.R index 0b14564..795498b 100644 --- a/tests/testthat/test-tree_convertion.R +++ b/tests/testthat/test-tree_convertion.R @@ -15,7 +15,7 @@ ex_tree <- data.frame( ) testthat::test_that("Test-json_1a : tree_age_cumul", { - cat("\tTest-json_1a : tree_age_cumul with top-down\n") + cat("\tTest-json_1a : tree_age_cumul with root-to-tip\n") top_down_tree <- data.frame( Type = c("Leaf", "Leaf", "Leaf", "Leaf", "Leaf", "Node","Node","Node","Node"), @@ -25,13 +25,13 @@ testthat::test_that("Test-json_1a : tree_age_cumul", { Right = c(NA,NA,NA,NA,NA,8,2,9,5) ) - res_tree <- treepplr:::tree_age_cumul(ex_tree, 6, age = "top-down") + res_tree <- treepplr:::tree_age_cumul(ex_tree, 6, age = "root-to-tip") testthat::expect_equal(top_down_tree, res_tree) }) testthat::test_that("Test-json_1b : tree_age_cumul", { - cat("\tTest-json_1b : tree_age_cumul with down-top\n") + cat("\tTest-json_1b : tree_age_cumul with tip-to-root\n") down_top_tree <- data.frame( Type = c("Leaf", "Leaf", "Leaf", "Leaf", "Leaf", "Node","Node","Node","Node"), @@ -41,17 +41,17 @@ testthat::test_that("Test-json_1b : tree_age_cumul", { Right = c(NA,NA,NA,NA,NA,8,2,9,5) ) - res_tree <- treepplr:::tree_age_cumul(ex_tree, 6, age = "down-top") + res_tree <- treepplr:::tree_age_cumul(ex_tree, 6, age = "tip-to-root") testthat::expect_equal(down_top_tree, res_tree) }) -testthat::test_that("Test-json_2 : tp_phylo_2_tppl_tree", { - cat("\tTest-json_2 : tp_phylo_2_tppl_tree with phylo\n") +testthat::test_that("Test-json_2 : tp_phylo_to_tppl_tree", { + cat("\tTest-json_2 : tp_phylo_to_tppl_tree with phylo\n") ex_tree_phylo <- ape::read.tree(text="((1:1,2:1)7:5,(3:4,(4:1.5,5:1.5)9:2.5)8:2)6;") - res_tree <- treepplr:::tp_phylo_2_tppl_tree(ex_tree_phylo) + res_tree <- treepplr:::tp_phylo_to_tppl_tree(ex_tree_phylo) testthat::expect_equal(6, res_tree[[1]]) testthat::expect_equal(ex_tree, res_tree[[2]]) diff --git a/vignettes/coin-example.Rmd b/vignettes/coin-example.Rmd index 693d9aa..c356a04 100644 --- a/vignettes/coin-example.Rmd +++ b/vignettes/coin-example.Rmd @@ -16,6 +16,8 @@ knitr::opts_chunk$set( warning = FALSE, message = FALSE ) + +options(rmarkdown.html_vignette.check_title = FALSE) ``` ```{r setup} @@ -41,10 +43,10 @@ str(data) ## Run TreePPL -Now we can compile and run the TreePPL program. The function *tp_treeppl()* has many optional arguments to change the inference method used. Here, we will use the default settings and only pass the model and the data. +Now we can compile and run the TreePPL program. The function *tp_treeppl()* has many optional arguments to change the inference method used. 
Here, we will keep the default settings for the inference method, running 100 sweeps with 100 particles in each sweep.
 
 ```{r, eval=FALSE}
-output_list <- tp_treeppl(model = model, data = data)
+output_list <- tp_treeppl(model = model, data = data, samples = 100, n_runs = 100)
 ```
 
 ```{r, echo=FALSE}
@@ -53,18 +55,22 @@ output_list <- readRDS("rdata/coin/output_coin.rds")
 ```
 
 ## Plot the posterior distribution
 
-TreePPL outputs the log weight of each sample, so first we need to get the normalized weights and then we can plot the posterior distribution produced.
+Just as we compare different MCMC runs/chains to test for convergence, in SMC we
+run several sweeps and then compare their normalizing constants to test whether
+they converged to the same posterior distribution. The criterion for convergence
+is that the variance of the normalizing constants across all sweeps is lower than 1.
 
 ```{r, fig.height=5, fig.width=5}
-# turn list into a data frame where each row represents one sample
-# and calculate normalized weights from log weights
-output <- tp_parse(output_list) %>%
-  dplyr::mutate(weight = exp(log_weight - max(.$log_weight)))
+tp_smc_convergence(output_list)
 
-ggplot(output, aes(samples, weight=weight)) +
-  geom_histogram(aes(y = after_stat(density)),
-                 col = "white", fill = "lightblue", binwidth=0.04) +
-  geom_density() +
-  theme_bw()
+output <- tp_parse(output_list) %>%
+  dplyr::mutate(total_lweight = log_weight + norm_const) %>%
+  dplyr::mutate(norm_weight = exp(total_lweight - max(.$total_lweight)))
+
+ggplot2::ggplot(output, ggplot2::aes(samples, weight = norm_weight)) +
+  ggplot2::geom_histogram(ggplot2::aes(y = ggplot2::after_stat(density)),
+                 col = "white", fill = "lightblue", binwidth=0.01) +
+  ggplot2::geom_density() +
+  ggplot2::theme_bw()
 ```
diff --git a/vignettes/constant-rate-birth.Rmd b/vignettes/constant-rate-birth.Rmd
index c0a7867..b4d8a0a 100644
--- a/vignettes/constant-rate-birth.Rmd
+++ b/vignettes/constant-rate-birth.Rmd
@@ -16,6 +16,8 @@ knitr::opts_chunk$set(
   warning = FALSE,
   message = FALSE
 )
+
+options(rmarkdown.html_vignette.check_title = FALSE)
 ```
 
 ```{r setup}
@@ -31,15 +33,16 @@ library(magrittr)
 
 We will use an example (random) tree that comes with the package.
 
 ```{r, fig.width=6, fig.height=4}
-tree <- read.tree(system.file(
+tree <- ape::read.tree(system.file(
   "extdata/crb_tree_15_tips.tre", package = "treepplr"))
-plot.phylo(tree, cex = 0.5)
+ape::plot.phylo(tree, cex = 0.5)
+axisPhylo()
 ```
 
 We need to convert the tree to a **TreePPL** readable format and read the CRB model.
 
 ```{r}
-data <- tp_phylo_2_TreePPL(tree, age="down-top")
+data <- tp_phylo_to_tpjson(tree, age="tip-to-root")
 model <- tp_model(system.file("extdata/crb.tppl", package = "treepplr"))
 ```
@@ -59,19 +62,21 @@ output_list <- readRDS("rdata/crb/output_crb.rds")
 ```
 
 ## Plot posterior
 
-TreePPL outputs the log weight of each sample, so first we need to get the normalized weights and then we can plot the posterior distribution produced.
+TreePPL outputs the log weight of each sample, so first we need to get the
+normalized weights and then we can plot the posterior distribution produced.
 
 ```{r, fig.height=4, fig.width=6}
 # turn list into a data frame where each row represents one sample
-# and calculate normalized weights from log weights
+# and calculate normalized weights from log weights and normalizing constants.
output <- tp_parse(output_list) %>% - dplyr::mutate(weight = exp(log_weight - max(.$log_weight))) + dplyr::mutate(total_lweight = log_weight + norm_const) %>% + dplyr::mutate(norm_weight = exp(total_lweight - max(.$total_lweight))) -ggplot(output, aes(samples, weight=weight)) + - geom_histogram(aes(y = after_stat(density)), +ggplot2::ggplot(output, ggplot2::aes(samples, weight = norm_weight)) + + ggplot2::geom_histogram(ggplot2::aes(y = ggplot2::after_stat(density)), col = "white", fill = "lightblue", binwidth=0.04) + - geom_density() + - theme_bw() + ggplot2::geom_density() + + ggplot2::theme_bw() ``` diff --git a/vignettes/hostrep-example.Rmd b/vignettes/hostrep-example.Rmd index 6368921..91f7efe 100644 --- a/vignettes/hostrep-example.Rmd +++ b/vignettes/hostrep-example.Rmd @@ -55,97 +55,98 @@ data <- tp_data("hostrep3states") ## Run treeppl -Now we can compile and run the TreePPL program using the default inference options. - -```{r, eval=FALSE} -output_list <- tp_treeppl(model = model, model_file_name = "hostrep", - data = data, data_file_name = "hostrep") -``` - -```{r, echo=FALSE} -output_list <- readRDS("rdata/hostrep/output_hostrep3states.rds") -``` - -```{r} -output <- tp_parse_host_rep(output_list) -``` - -The output from *tp_parse_host_rep()* contains all information outputted by treeppl, so we need to separate the sampled character histories from the sampled parameter values. - -## Parameter estimates - -```{r} -logs <- output[[1]] %>% - dplyr::select(iteration, log_weight, mu, beta) %>% - unique() %>% - dplyr::mutate(weight = exp(log_weight - max(.$log_weight))) - -ggplot(logs, aes(mu, weight=weight)) + - geom_histogram(aes(y = after_stat(density)), col = "white", fill = "lightblue", binwidth=0.04) + - geom_density() + - theme_bw() - -ggplot(logs, aes(mu, weight=weight)) + - geom_histogram(aes(y = after_stat(density) ), col = "white", fill = "lightblue", binwidth=0.04) + - geom_density() + - theme_bw() -``` - -## Character history - -Extract table with character history samples - -```{r} -# post-treatment function -get_history <- function(parsed_output){ - - table <- parsed_output[[1]] %>% - dplyr::select(-c(log_weight, mu, beta, lambda1, lambda2, lambda3, lambda4)) %>% - dplyr::mutate(transition_type = "anagenetic") %>% - dplyr::mutate(node_index = dplyr::case_when( # fix to mismatch and 0- to 1-base - node_index == 3 ~ 5, - node_index == 4 ~ 4, - TRUE ~ node_index + 1)) %>% - dplyr::filter(!is.na(log_norm_const)) - - return(table) -} - -tp_hist <- get_history(output) -``` - - -This table can be used by **evolnets** for plotting results, together with the phylogenetic trees and the known extant interactions. 
- -```{r} -# get data from treepplr -symbiont_tree <- read_tree_from_revbayes( - system.file("extdata/hostrep3states_treeRev.tre", package = "treepplr")) -host_tree <- read.tree( - system.file("extdata/hostrep3states_host_tree.tre", package = "treepplr")) -matrix <- read.csv( - system.file("extdata/hostrep3states_extant_interactions.csv", package = "treepplr"), - row.names = 1) %>% - as.matrix() -``` - -### Plot data and inferred ancestral fundamental host repertoires - -```{r, fig.height=4, fig.width=6} -# calculate posterior at nodes -tp_at_nodes <- posterior_at_nodes(tp_hist, symbiont_tree, host_tree, state = c(1,2)) - -#plot -(tp_asr_fund <- plot_matrix_phylo(matrix, tp_at_nodes, symbiont_tree, - host_tree, type = "repertoires", - repertoire = "fundamental")) -``` - -### Plot data and inferred ancestral realized host repertoires - -```{r, fig.height=4, fig.width=6} -(tp_asr_real <- plot_matrix_phylo(matrix, tp_at_nodes, symbiont_tree, - host_tree, type = "repertoires", - repertoire = "realized")) -``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/vignettes/rdata/coin/output_coin.rds b/vignettes/rdata/coin/output_coin.rds index a9da720..c59473f 100644 Binary files a/vignettes/rdata/coin/output_coin.rds and b/vignettes/rdata/coin/output_coin.rds differ diff --git a/vignettes/rdata/crb/output_crb.rds b/vignettes/rdata/crb/output_crb.rds index bbfa448..8667349 100644 Binary files a/vignettes/rdata/crb/output_crb.rds and b/vignettes/rdata/crb/output_crb.rds differ diff --git a/vignettes/rdata/tree_inference/output_tree_inference.rds b/vignettes/rdata/tree_inference/output_tree_inference.rds new file mode 100644 index 0000000..e6222b1 Binary files /dev/null and b/vignettes/rdata/tree_inference/output_tree_inference.rds differ diff --git a/vignettes/tree_inference.Rmd b/vignettes/tree_inference.Rmd new file mode 100644 index 0000000..aa95c00 --- /dev/null +++ b/vignettes/tree_inference.Rmd @@ -0,0 +1,90 @@ +--- +title: "Tree inference" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{tree_inference} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + warning = FALSE, + message = FALSE +) + +options(rmarkdown.html_vignette.check_title = FALSE) +``` + +```{r setup} +library(treepplr) +library(dplyr) +library(ggplot2) +library(ape) +``` + +## Load model and data files + +Load the tree inference model and example data available within `treepplr`. + +```{r} +model <- tp_model("tree_inference") +data <- tp_data("tree_inference") +``` + +The data in this example is a toy dataset ... + +```{r} +str(data) +``` + +## Run TreePPL + +Now we can compile and run the TreePPL program. The function `tp_treeppl()` has +many optional arguments to change the inference method used. Here, we will use + +```{r, eval=FALSE} +output_list <- tp_treeppl(model = model, data = data, method = "smc-apf", + samples = 1000, subsample = 5, resample = "manual", + n_runs = 10) +``` + +```{r, echo=FALSE} +output_list <- readRDS("rdata/tree_inference/output_tree_inference.rds") +``` + +The result is a list of sweeps, each one containing 3 elements: the sampled +parameter values and the weights (in log scale) for each of the 5 particles in +each sweep, and the normalizing constant for the whole sweep. 
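+The elements of a single sweep can be accessed directly. The sketch below assumes
+the parsed output keeps TreePPL's JSON element names (`samples`, `weights` and
+`normConst`); the `str()` call that follows shows the actual structure.
+
+```{r, eval=FALSE}
+first_sweep <- output_list[[1]]
+length(first_sweep$samples)  # one sampled tree per subsampled particle
+first_sweep$weights          # log weights of those samples
+first_sweep$normConst        # normalizing constant for the whole sweep
+```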
+
+```{r}
+str(output_list, max.level = 2)
+```
+
+## Plot the posterior distribution
+
+In this example, we will check that the different runs converged to the same
+posterior by looking at the variance of the normalizing constant among runs.
+
+```{r, fig.height=5, fig.width=5}
+tp_smc_convergence(output_list)
+```
+
+There are different ways to summarize the posterior distribution of trees. In
+this example, we calculate the MAP (Maximum A Posteriori) tree. We do this
+by finding the single tree topology that has the highest posterior probability,
+and then using `ape::consensus` to compute average branch lengths for all sampled
+trees with the MAP topology.
+
+```{r, fig.height=4, fig.width=5}
+out_trees <- tp_json_to_phylo(output_list)
+
+map_tree <- tp_map_tree(out_trees)
+plot(map_tree)
+axisPhylo()
+```
+
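+An alternative, unweighted summary is a majority-rule consensus of all sampled
+trees, computed directly with `ape`. This is only a sketch: unlike
+`tp_map_tree()`, it ignores the sample weights and drops branch lengths.
+
+```{r, fig.height=4, fig.width=5}
+# majority-rule consensus topology across all sampled trees
+mrc_tree <- ape::consensus(out_trees$trees, p = 0.5)
+plot(mrc_tree)
+```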