From 5a8d630597cd1a6332be7879deac457314c8b932 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Mon, 16 Jun 2025 15:18:54 +0100 Subject: [PATCH 1/5] add a scaled confidence threshold for dehosting to test --- include/classify_stats.hpp | 6 ++++++ include/dehost_arguments.hpp | 2 ++ include/read_entry.hpp | 6 ++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/classify_stats.hpp b/include/classify_stats.hpp index e93f2ca..ea567b3 100644 --- a/include/classify_stats.hpp +++ b/include/classify_stats.hpp @@ -401,6 +401,7 @@ class StatsModel { uint32_t min_length_; float min_compression_; int8_t confidence_threshold_; + float confidence_probability_threshold_; uint8_t min_hits_; float host_unique_prop_lo_threshold_; float min_proportion_difference_; @@ -444,6 +445,7 @@ class StatsModel { min_length_(opt.min_length), min_compression_(opt.min_compression), confidence_threshold_(opt.confidence_threshold), + confidence_probability_threshold_(opt.confidence_probability_threshold), host_unique_prop_lo_threshold_(opt.host_unique_prop_lo_threshold), min_proportion_difference_(opt.min_proportion_difference), min_prob_difference_(opt.min_prob_difference){ @@ -496,6 +498,10 @@ class StatsModel { return confidence_threshold_; } + float confidence_probability_threshold() const { + return confidence_probability_threshold_; + } + uint8_t min_num_hits() const { return min_hits_; } diff --git a/include/dehost_arguments.hpp b/include/dehost_arguments.hpp index 26412b1..fbd0b6c 100644 --- a/include/dehost_arguments.hpp +++ b/include/dehost_arguments.hpp @@ -31,6 +31,7 @@ struct DehostArguments { uint32_t min_length { 140 }; float min_compression {0}; uint8_t confidence_threshold{2}; + float confidence_probability_threshold{5}; float host_unique_prop_lo_threshold{ 0.05 }; float min_proportion_difference { 0.04 }; float min_prob_difference{ 0 }; @@ -63,6 +64,7 @@ struct DehostArguments { ss += "\tmin_quality:\t\t\t" + std::to_string(min_quality) + "\n"; ss += "\tmin_compression:\t\t" + std::to_string(min_compression) + "\n"; ss += "\tconfidence_threshold:\t\t" + std::to_string(confidence_threshold) + "\n"; + ss += "\tconfidence_probability_threshold:\t\t" + std::to_string(confidence_probability_threshold) + "\n"; ss += "\thost_unique_prop_lo_threshold:\t" + std::to_string(host_unique_prop_lo_threshold) + "\n"; ss += "\tmin_proportion_difference:\t" + std::to_string(min_proportion_difference) + "\n"; ss += "\tmin_prob_difference:\t\t" + std::to_string(min_prob_difference) + "\n\n"; diff --git a/include/read_entry.hpp b/include/read_entry.hpp index 6950029..c645823 100644 --- a/include/read_entry.hpp +++ b/include/read_entry.hpp @@ -270,13 +270,15 @@ class ReadEntry { if (host_unique_prop > other_unique_prop and host_unique_prop - other_unique_prop > stats_model.min_proportion_difference() and host_prob > other_prob - and host_prob - other_prob > stats_model.min_prob_difference()) + and host_prob - other_prob > stats_model.min_prob_difference() + and std::max(host_prob * confidence_score_,static_cast(confidence_score_)) >= stats_model.confidence_probability_threshold()) call_ = host_index; else if (host_unique_prop < stats_model.host_unique_prop_lo_threshold() and host_unique_prop < other_unique_prop and other_unique_prop - host_unique_prop > stats_model.min_proportion_difference() and host_prob < other_prob - and other_prob - host_prob > stats_model.min_prob_difference()) + and other_prob - host_prob > stats_model.min_prob_difference() + and std::max(other_prob * 
confidence_score_,static_cast(confidence_score_)) >= stats_model.confidence_probability_threshold()) call_ = other_index; } From 6e12126d751dafb06f18379ea0d356e5e318c25a Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Fri, 20 Jun 2025 11:06:02 +0100 Subject: [PATCH 2/5] avoid segfault by reducing from max value --- include/index_arguments.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/index_arguments.hpp b/include/index_arguments.hpp index 0867ab6..b02d1c8 100644 --- a/include/index_arguments.hpp +++ b/include/index_arguments.hpp @@ -17,7 +17,7 @@ struct IndexArguments { uint8_t kmer_size { 19 }; // IBF options - mutable size_t bits {std::numeric_limits::max()}; // Allow to change bits for each partition + mutable size_t bits {std::numeric_limits::max()-2}; // Allow to change bits for each partition uint8_t num_hash {3}; double max_fpr {0.01}; From b148792d013cfb0c652246496ae68d5d35b4ef01 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Mon, 23 Jun 2025 12:32:40 +0100 Subject: [PATCH 3/5] enable classify to work for more than 2 categories --- include/classify_arguments.hpp | 6 +- include/classify_stats.hpp | 1 - include/read_entry.hpp | 136 ++++++++++++-------------- src/classify_main.cpp | 169 +++++++++++++++++++-------------- 4 files changed, 160 insertions(+), 152 deletions(-) diff --git a/include/classify_arguments.hpp b/include/classify_arguments.hpp index 8b9e6d3..994e8e0 100644 --- a/include/classify_arguments.hpp +++ b/include/classify_arguments.hpp @@ -25,8 +25,7 @@ struct ClassifyArguments { float min_quality { 10.0 }; uint32_t min_length { 140 }; float min_compression {0.15}; - uint8_t confidence_threshold{0}; - uint8_t min_hits{2}; + uint8_t confidence_threshold{2}; float min_proportion_difference { 0.00 }; // Output options @@ -58,8 +57,7 @@ struct ClassifyArguments { ss += "\tmin_quality:\t\t" + std::to_string(min_quality) + "\n"; ss += "\tmin_compression:\t\t" + std::to_string(min_compression) + "\n"; ss += "\tconfidence_threshold:\t" + std::to_string(confidence_threshold) + "\n"; - ss += "\tmin_hits:\t\t" + std::to_string(min_hits) + "\n"; - ss += "\tmin_diff:\t\t" + std::to_string(min_proportion_difference) + "\n\n"; + ss += "\tmin_proportion_diff:\t\t" + std::to_string(min_proportion_difference) + "\n\n"; ss += "\tcategory_to_extract:\t" + category_to_extract + "\n"; ss += "\tprefix:\t" + prefix + "\n\n"; diff --git a/include/classify_stats.hpp b/include/classify_stats.hpp index ea567b3..12be7cd 100644 --- a/include/classify_stats.hpp +++ b/include/classify_stats.hpp @@ -430,7 +430,6 @@ class StatsModel { min_length_(opt.min_length), min_compression_(opt.min_compression), confidence_threshold_(opt.confidence_threshold), - min_hits_(opt.min_hits), min_proportion_difference_(opt.min_proportion_difference) { for (auto i = 0; i < summary.num_categories(); ++i) { models_.emplace_back(Model(i, opt.dist)); diff --git a/include/read_entry.hpp b/include/read_entry.hpp index c645823..5889891 100644 --- a/include/read_entry.hpp +++ b/include/read_entry.hpp @@ -21,7 +21,7 @@ class ReadEntry { float compression_; uint32_t num_hashes_{0}; - std::vector::membership_agent_type::binning_bitvector> bits_;// this collects over all bins + std::vector::membership_agent_type::binning_bitvector> bits_;// this collects over all bins std::unordered_map> max_bits_; // this summarizes over categories (which may have multiple bins) std::vector counts_; std::vector unique_counts_; @@ -43,17 +43,17 @@ class ReadEntry { ~ReadEntry() = default; - ReadEntry(const 
std::string& read_id, const uint32_t& length, const float& mean_quality, const float& compression, const InputSummary &summary) : + ReadEntry(const std::string &read_id, const uint32_t &length, const float &mean_quality, const float &compression, + const InputSummary &summary) : read_id_(read_id), length_(length), mean_quality_(mean_quality), compression_(compression), counts_(summary.num_categories(), 0), - proportions_(summary.num_categories(),0), - unique_proportions_(summary.num_categories(),0), + proportions_(summary.num_categories(), 0), + unique_proportions_(summary.num_categories(), 0), unique_counts_(summary.num_categories(), 0), - probabilities_(summary.num_categories(),1) - { + probabilities_(summary.num_categories(), 1) { PLOG_DEBUG << "Initialize entry with read_id " << read_id << " and length " << length; bits_.reserve(length); @@ -63,15 +63,15 @@ class ReadEntry { PLOG_VERBOSE << "Initializing complete for read_id " << read_id; } - const std::string& read_id() const { + const std::string &read_id() const { return read_id_; } - const std::vector& proportions() const { + const std::vector &proportions() const { return proportions_; } - const std::vector& unique_proportions() const { + const std::vector &unique_proportions() const { return unique_proportions_; } @@ -94,20 +94,19 @@ class ReadEntry { // get totals in each bin seqan3::counting_vector total_bits_per_bin(summary.num_bins, 0); - for (const auto & entry : bits_) - { + for (const auto &entry: bits_) { total_bits_per_bin += entry; } PLOG_DEBUG << "Have the following total hits per bin: " << total_bits_per_bin; // identify max bin per category - std::vector index_per_category(summary.num_categories(),std::numeric_limits::max()); - for ( auto bin=0; bin < total_bits_per_bin.size(); ++bin) - { - const auto & bits_in_bin = total_bits_per_bin[bin]; - const auto & category = summary.bin_to_category.at(bin); - const auto & index = summary.category_index(category); - if (index_per_category.at(index) == std::numeric_limits::max() or total_bits_per_bin[bin] > total_bits_per_bin[index_per_category[index]]) { + std::vector index_per_category(summary.num_categories(), std::numeric_limits::max()); + for (auto bin = 0; bin < total_bits_per_bin.size(); ++bin) { + const auto &bits_in_bin = total_bits_per_bin[bin]; + const auto &category = summary.bin_to_category.at(bin); + const auto &index = summary.category_index(category); + if (index_per_category.at(index) == std::numeric_limits::max() or + total_bits_per_bin[bin] > total_bits_per_bin[index_per_category[index]]) { index_per_category.at(index) = bin; counts_.at(index) = total_bits_per_bin[bin]; PLOG_DEBUG << "Category " << category << " has max index " << +bin; @@ -121,14 +120,12 @@ class ReadEntry { // collect together the bitvector for max bits and the unique_counts std::vector found; - for (const auto & entry : bits_){ + for (const auto &entry: bits_) { found.clear(); - for (auto category=0; category < index_per_category.size(); ++category) - { - const auto & category_index = index_per_category[category]; + for (auto category = 0; category < index_per_category.size(); ++category) { + const auto &category_index = index_per_category[category]; max_bits_.at(category).emplace_back(entry[category_index]); - if (entry[category_index] == 1) - { + if (entry[category_index] == 1) { found.push_back(category); } } @@ -158,31 +155,22 @@ class ReadEntry { } void call_category(const StatsModel &stats_model) { - //TODO extend this to work with more than 2 categories - 
assert(probabilities_.size() == 2); - - double first = probabilities_.at(0); + // We want to compare based on unique_counts_ because confidence is defined based on the difference of the + // top 2 unique_counts_. However, we will reject the result if the probabilities do not support this ordering. uint8_t first_pos = 0; - double second = probabilities_.at(1); uint8_t second_pos = 1; - if (unique_counts_.at(second_pos) > unique_counts_.at(first_pos)) - { - std::swap(first, second); + if (unique_counts_.at(second_pos) > unique_counts_.at(first_pos)) { std::swap(first_pos, second_pos); } - /*for (auto i = 0; i < probabilities_.size(); ++i) { - const double &val = probabilities_.at(i); - if (val >= second) { - second = val; + for (auto i = 2; i < unique_counts_.size(); ++i) { + if (unique_counts_.at(i) > unique_counts_.at(second_pos)) { second_pos = i; - if (second >= first) - { - std::swap(first, second); + if (unique_counts_.at(second_pos) > unique_counts_.at(first_pos)) { std::swap(first_pos, second_pos); } } - }*/ + } auto raw_confidence = unique_counts_.at(first_pos) - unique_counts_.at(second_pos); if (raw_confidence > std::numeric_limits::max()) @@ -190,49 +178,46 @@ class ReadEntry { else confidence_score_ = static_cast(raw_confidence); - if (mean_quality_ < stats_model.min_quality()){ + if (mean_quality_ < stats_model.min_quality()) { return; } - if (length_ < stats_model.min_length()){ + if (length_ < stats_model.min_length()) { return; } - if (compression_ < stats_model.min_compression()){ + if (compression_ < stats_model.min_compression()) { return; } - if (second == 0 and first > 0) { + if (probabilities_.at(second_pos) == 0 and probabilities_.at(first_pos) > 0) { call_ = first_pos; } else { - /*auto ratio = first/second; - PLOG_VERBOSE << "confidence score " << ratio << " from " << first << "/" << second; - if (ratio < std::numeric_limits::max()) - confidence_score_ = static_cast(ratio); - else - confidence_score_ = std::numeric_limits::max();*/ - if (confidence_score_ > stats_model.confidence_threshold() and first > second) + if (confidence_score_ > stats_model.confidence_threshold() and + probabilities_.at(first_pos) > probabilities_.at(second_pos)) call_ = first_pos; } - const auto & first_count = counts_.at(first_pos); - const auto & second_count = counts_.at(second_pos); - if (second_count > first_count or first_count - second_count < stats_model.min_num_hits()){ - PLOG_DEBUG << read_id_ << " has first count " << +first_count << " and second count " << +second_count << " which have differences less than " << +stats_model.min_num_hits(); + const auto &first_count = counts_.at(first_pos); + const auto &second_count = counts_.at(second_pos); + if (second_count > first_count or first_count - second_count < stats_model.min_num_hits()) { + PLOG_DEBUG << read_id_ << " has first count " << +first_count << " and second count " << +second_count + << " which have differences less than " << +stats_model.min_num_hits(); call_ = std::numeric_limits::max(); // if we don't see at least this number of hits difference, then no call } - const auto & first_prop = proportions_.at(first_pos); - const auto & second_prop = proportions_.at(second_pos); - if (second_prop > first_prop or first_prop - second_prop < stats_model.min_proportion_difference()){ - PLOG_DEBUG << read_id_ << " has first proportion " << first_prop << " and second proportion " << second_prop << " which have differences less than " << stats_model.min_proportion_difference(); + const auto &first_prop = proportions_.at(first_pos); 
+ const auto &second_prop = proportions_.at(second_pos); + if (second_prop > first_prop or first_prop - second_prop < stats_model.min_proportion_difference()) { + PLOG_DEBUG << read_id_ << " has first proportion " << first_prop << " and second proportion " << second_prop + << " which have differences less than " << stats_model.min_proportion_difference(); call_ = std::numeric_limits::max(); // if we don't see at least this level of discrepancy between the proportion hitting against each database don't call } } void call_host(const StatsModel &stats_model, const uint8_t host_index) { assert(probabilities_.size() == 2); - const uint8_t other_index = 1-host_index; + const uint8_t other_index = 1 - host_index; double host_unique_prop = unique_proportions_.at(host_index); double other_unique_prop = unique_proportions_.at(other_index); @@ -241,8 +226,7 @@ class ReadEntry { auto first_pos = host_index; auto second_pos = other_index; - if (host_unique_prop < other_unique_prop) - { + if (host_unique_prop < other_unique_prop) { first_pos = other_index; second_pos = host_index; } @@ -251,19 +235,19 @@ class ReadEntry { confidence_score_ = std::numeric_limits::max(); else confidence_score_ = static_cast(raw_confidence); - if (confidence_score_ < stats_model.confidence_threshold()){ + if (confidence_score_ < stats_model.confidence_threshold()) { return; } - if (mean_quality_ < stats_model.min_quality()){ + if (mean_quality_ < stats_model.min_quality()) { return; } - if (length_ < stats_model.min_length()){ + if (length_ < stats_model.min_length()) { return; } - if (compression_ < stats_model.min_compression()){ + if (compression_ < stats_model.min_compression()) { return; } @@ -271,14 +255,16 @@ class ReadEntry { and host_unique_prop - other_unique_prop > stats_model.min_proportion_difference() and host_prob > other_prob and host_prob - other_prob > stats_model.min_prob_difference() - and std::max(host_prob * confidence_score_,static_cast(confidence_score_)) >= stats_model.confidence_probability_threshold()) + and std::max(host_prob * confidence_score_, static_cast(confidence_score_)) >= + stats_model.confidence_probability_threshold()) call_ = host_index; else if (host_unique_prop < stats_model.host_unique_prop_lo_threshold() - and host_unique_prop < other_unique_prop - and other_unique_prop - host_unique_prop > stats_model.min_proportion_difference() - and host_prob < other_prob - and other_prob - host_prob > stats_model.min_prob_difference() - and std::max(other_prob * confidence_score_,static_cast(confidence_score_)) >= stats_model.confidence_probability_threshold()) + and host_unique_prop < other_unique_prop + and other_unique_prop - host_unique_prop > stats_model.min_proportion_difference() + and host_prob < other_prob + and other_prob - host_prob > stats_model.min_prob_difference() + and std::max(other_prob * confidence_score_, static_cast(confidence_score_)) >= + stats_model.confidence_probability_threshold()) call_ = other_index; } @@ -305,7 +291,8 @@ class ReadEntry { } void print_result(const InputSummary &summary) { - std::cout << read_id_ << "\t" << num_hashes_ << "\t" << summary.category_name(call_) << "\t" << +confidence_score_ << "\t" ; + std::cout << read_id_ << "\t" << num_hashes_ << "\t" << summary.category_name(call_) << "\t" + << +confidence_score_ << "\t"; for (auto i = 0; i < summary.num_categories(); i++) { std::cout << summary.categories.at(i) << ":" << counts_.at(i) << ":" << probabilities_.at(i) << "\t"; } @@ -339,7 +326,8 @@ class ReadEntry { else std::cout << "C" << "\t"; 
std::cout.precision(6); - std::cout << read_id_ << "\t" << summary.category_name(call_) << "\t" << length_ << "\t" << num_hashes_ << "\t" << mean_quality_ << "\t" << +confidence_score_ << "\t" << compression_ << "\t"; + std::cout << read_id_ << "\t" << summary.category_name(call_) << "\t" << length_ << "\t" << num_hashes_ << "\t" + << mean_quality_ << "\t" << +confidence_score_ << "\t" << compression_ << "\t"; for (auto i = 0; i < summary.num_categories(); i++) { std::cout << summary.categories.at(i) << ":" << counts_.at(i) << ":" << proportions_.at(i) << ":" << unique_proportions_.at(i) << ":" << probabilities_.at(i) << " "; diff --git a/src/classify_main.cpp b/src/classify_main.cpp index 17b14ee..274a287 100644 --- a/src/classify_main.cpp +++ b/src/classify_main.cpp @@ -22,39 +22,29 @@ #include -void setup_classify_subcommand(CLI::App& app) -{ +void setup_classify_subcommand(CLI::App &app) { auto opt = std::make_shared(); - auto* classify_subcommand = app.add_subcommand( - "classify", "Classify read file using index."); + auto *classify_subcommand = app.add_subcommand( + "classify", "Classify read file using index."); classify_subcommand->add_option("", opt->read_file, "Fasta/q file") - ->required() - ->transform(make_absolute) - ->check(CLI::ExistingFile.description("")) - ->type_name("FILE"); + ->required() + ->transform(make_absolute) + ->check(CLI::ExistingFile.description("")) + ->type_name("FILE"); classify_subcommand->add_option("", opt->read_file2, "Paired Fasta/q file") ->transform(make_absolute) ->check(CLI::ExistingFile.description("")) ->type_name("FILE"); - classify_subcommand - ->add_option("--chunk_size", opt->chunk_size, "Read file is read in chunks of this size, to be processed in parallel within a chunk.") - ->type_name("INT") - ->capture_default_str(); - - classify_subcommand - ->add_option("-t,--threads", opt->threads, "Maximum number of threads to use.") - ->type_name("INT") - ->capture_default_str(); - classify_subcommand->add_option("--db", opt->db, "Prefix for the index.") - ->type_name("FILE") - ->required() - ->check(CLI::ExistingPath.description("")); + ->type_name("FILE") + ->required() + ->check(CLI::ExistingPath.description("")); - classify_subcommand->add_option("-e,--extract", opt->category_to_extract, "Reads from this category in the index will be extracted to file.") + classify_subcommand->add_option("-e,--extract", opt->category_to_extract, + "Reads from this category in the index will be extracted to file.") ->type_name("STRING"); classify_subcommand->add_option("-p,--prefix", opt->prefix, "Prefix for the output files.") @@ -62,26 +52,52 @@ void setup_classify_subcommand(CLI::App& app) ->check(CLI::NonexistentPath.description("")) ->default_str(""); + classify_subcommand + ->add_option("--chunk_size", opt->chunk_size, + "Read file is read in chunks of this size, to be processed in parallel within a chunk.") + ->type_name("INT") + ->capture_default_str(); + + classify_subcommand + ->add_option("--lo_hi_threshold", opt->lo_hi_threshold, + "Threshold used during model fitting stage to decide if read should be used to train lo or hi distribution.") + ->type_name("FLOAT") + ->capture_default_str(); + + classify_subcommand + ->add_option("--num_reads_to_fit", opt->num_reads_to_fit, + "Number of reads to use to train each distribution in the model.") + ->type_name("INT") + ->capture_default_str(); + classify_subcommand->add_option("-d,--dist", opt->dist, "Probability distribution to use for modelling.") ->type_name("STRING"); classify_subcommand - 
->add_option("--confidence", opt->confidence_threshold, "Minimum difference between the top 2 unique hit counts.") + ->add_option("--min_quality", opt->min_quality, "Minimum read quality to classify.") ->type_name("INT") ->capture_default_str(); classify_subcommand - ->add_option("--min_hits", opt->min_hits, "Minimum difference between the top 2 (non-unique) hit counts.") + ->add_option("--min_length", opt->min_length, "Minimum read length to classify.") ->type_name("INT") ->capture_default_str(); classify_subcommand - ->add_option("--min_length", opt->min_length, "Minimum length of read for classification.") + ->add_option("--min_compression", opt->min_compression, + "Minimum read gzip compression ratio to classify (a measure of how much information is in the read.") + ->type_name("FLOAT") + ->capture_default_str(); + + classify_subcommand + ->add_option("--confidence", opt->confidence_threshold, + "Minimum difference between the top 2 unique hit counts.") ->type_name("INT") ->capture_default_str(); classify_subcommand - ->add_option("--min_diff", opt->min_proportion_difference, "Minimum difference between the proportion of (non-unique) kmers found in each category.") + ->add_option("--min_proportion_diff", opt->min_proportion_difference, + "Minimum difference between the proportion of (non-unique) kmers found in each category.") ->type_name("FLOAT") ->capture_default_str(); @@ -89,17 +105,24 @@ void setup_classify_subcommand(CLI::App& app) ->transform(make_absolute) ->type_name("FILE"); + classify_subcommand + ->add_option("-t,--threads", opt->threads, "Maximum number of threads to use.") + ->type_name("INT") + ->capture_default_str(); + classify_subcommand->add_flag( - "-v", opt->verbosity, "Verbosity of logging. Repeat for increased verbosity"); + "-v", opt->verbosity, "Verbosity of logging. Repeat for increased verbosity"); + // Set the function that will be called when this subcommand is issued. 
classify_subcommand->callback([opt]() { classify_main(*opt); }); } -void classify_reads(const ClassifyArguments& opt, const Index& index){ +void classify_reads(const ClassifyArguments &opt, const Index &index) { PLOG_INFO << "Classifying file " << opt.read_file; - auto hash_adaptor = seqan3::views::minimiser_hash(seqan3::shape{seqan3::ungapped{index.kmer_size()}}, seqan3::window_size{index.window_size()}); + auto hash_adaptor = seqan3::views::minimiser_hash(seqan3::shape{seqan3::ungapped{index.kmer_size()}}, + seqan3::window_size{index.window_size()}); PLOG_VERBOSE << "Defined hash_adaptor"; auto agent = index.agent(); @@ -116,41 +139,40 @@ void classify_reads(const ClassifyArguments& opt, const Index& index){ PLOG_DEBUG << "Defined Result with " << +index.num_bins() << " bins"; - for (auto && chunk : fin | seqan3::views::chunk(opt.chunk_size)) - { + for (auto &&chunk: fin | seqan3::views::chunk(opt.chunk_size)) { // You can use a for loop: - for (auto & record : chunk) - { + for (auto &record: chunk) { records.push_back(std::move(record)); } #pragma omp parallel for firstprivate(agent, hash_adaptor) num_threads(opt.threads) shared(result) - for (auto i=0; i std::numeric_limits::max()){ + if (read_length > std::numeric_limits::max()) { PLOG_WARNING << "Ignoring read " << record.id() << " as too long!"; continue; } - if (read_length == 0){ + if (read_length == 0) { PLOG_WARNING << "Ignoring read " << record.id() << " as has zero length!"; continue; } - auto qualities = record.base_qualities() | std::views::transform( [](auto quality) { return seqan3::to_phred(quality); }); + auto qualities = record.base_qualities() | + std::views::transform([](auto quality) { return seqan3::to_phred(quality); }); auto sum = std::accumulate(qualities.begin(), qualities.end(), 0); float mean_quality = 0; if (std::ranges::size(qualities) > 0) - mean_quality = static_cast< float >( sum )/ static_cast< float >(std::ranges::size(qualities)); + mean_quality = static_cast< float >( sum ) / static_cast< float >(std::ranges::size(qualities)); PLOG_VERBOSE << "Mean quality of read " << record.id() << " is " << mean_quality; float compression_ratio = get_compression_ratio(sequence_to_string(record.sequence())); PLOG_VERBOSE << "Found compression ratio of read " << record.id() << " is " << compression_ratio; auto read = ReadEntry(read_id, read_length, mean_quality, compression_ratio, result.input_summary()); - for (auto && value : record.sequence() | hash_adaptor) { - const auto & entry = agent.bulk_contains(value); + for (auto &&value: record.sequence() | hash_adaptor) { + const auto &entry = agent.bulk_contains(value); read.update_entry(entry); } PLOG_VERBOSE << "Finished adding raw hash counts for read " << read_id; @@ -166,10 +188,11 @@ void classify_reads(const ClassifyArguments& opt, const Index& index){ } -void classify_paired_reads(const ClassifyArguments& opt, const Index& index){ +void classify_paired_reads(const ClassifyArguments &opt, const Index &index) { PLOG_INFO << "Classifying files " << opt.read_file << " and " << opt.read_file2; - auto hash_adaptor = seqan3::views::minimiser_hash(seqan3::shape{seqan3::ungapped{index.kmer_size()}}, seqan3::window_size{index.window_size()}); + auto hash_adaptor = seqan3::views::minimiser_hash(seqan3::shape{seqan3::ungapped{index.kmer_size()}}, + seqan3::window_size{index.window_size()}); PLOG_VERBOSE << "Defined hash_adaptor"; auto agent = index.agent(); @@ -188,50 +211,50 @@ void classify_paired_reads(const ClassifyArguments& opt, const Index& index){ PLOG_DEBUG << 
"Defined Result with " << +index.num_bins() << " bins"; - for (auto && chunk : fin1 | seqan3::views::chunk(opt.chunk_size)) - { - for (auto & record : chunk) - { + for (auto &&chunk: fin1 | seqan3::views::chunk(opt.chunk_size)) { + for (auto &record: chunk) { records1.push_back(std::move(record)); } // loop in the second file and get same amount of reads - for ( auto& record2 : fin2 | std::views::take( opt.chunk_size ) ) - { - records2.push_back( std::move( record2 )); + for (auto &record2: fin2 | std::views::take(opt.chunk_size)) { + records2.push_back(std::move(record2)); } #pragma omp parallel for firstprivate(agent, hash_adaptor) num_threads(opt.threads) shared(result) - for (auto i=0; i std::numeric_limits::max()){ + if (read_length > std::numeric_limits::max()) { PLOG_WARNING << "Ignoring read " << record1.id() << " as too long!"; continue; } - if (read_length == 0){ + if (read_length == 0) { PLOG_WARNING << "Ignoring read " << record1.id() << " as has zero length!"; continue; } - auto qualities1 = record1.base_qualities() | std::views::transform( [](auto quality) { return seqan3::to_phred(quality); }); - auto qualities2 = record2.base_qualities() | std::views::transform( [](auto quality) { return seqan3::to_phred(quality); }); + auto qualities1 = record1.base_qualities() | + std::views::transform([](auto quality) { return seqan3::to_phred(quality); }); + auto qualities2 = record2.base_qualities() | + std::views::transform([](auto quality) { return seqan3::to_phred(quality); }); auto sum = std::accumulate(qualities1.begin(), qualities1.end(), 0); sum = std::accumulate(qualities2.begin(), qualities2.end(), sum); float mean_quality = 0; if (std::ranges::size(qualities1) + std::ranges::size(qualities2) > 0) - mean_quality = static_cast< float >( sum )/ static_cast< float >(std::ranges::size(qualities1) + std::ranges::size(qualities2)); + mean_quality = static_cast< float >( sum ) / + static_cast< float >(std::ranges::size(qualities1) + std::ranges::size(qualities2)); PLOG_VERBOSE << "Mean quality of read " << record1.id() << " is " << mean_quality; auto combined_record = sequence_to_string(record1.sequence()) + sequence_to_string(record2.sequence()); @@ -239,12 +262,12 @@ void classify_paired_reads(const ClassifyArguments& opt, const Index& index){ PLOG_VERBOSE << "Found compression ratio of read " << record1.id() << " is " << compression_ratio; auto read = ReadEntry(read_id, read_length, mean_quality, compression_ratio, result.input_summary()); - for (auto && value : record1.sequence() | hash_adaptor) { - const auto & entry = agent.bulk_contains(value); + for (auto &&value: record1.sequence() | hash_adaptor) { + const auto &entry = agent.bulk_contains(value); read.update_entry(entry); } - for (auto && value : record2.sequence() | hash_adaptor) { - const auto & entry = agent.bulk_contains(value); + for (auto &&value: record2.sequence() | hash_adaptor) { + const auto &entry = agent.bulk_contains(value); read.update_entry(entry); } PLOG_VERBOSE << "Finished adding raw hash counts for read " << read_id; @@ -261,8 +284,7 @@ void classify_paired_reads(const ClassifyArguments& opt, const Index& index){ } -int classify_main(ClassifyArguments & opt) -{ +int classify_main(ClassifyArguments &opt) { auto log_level = plog::info; if (opt.verbosity == 1) { log_level = plog::debug; @@ -288,14 +310,14 @@ int classify_main(ClassifyArguments & opt) opt.run_extract = (opt.category_to_extract != ""); const auto categories = index.categories(); - if (opt.run_extract and opt.category_to_extract != "all" and 
std::find(categories.begin(), categories.end(), opt.category_to_extract) == categories.end()) - { + if (opt.run_extract and opt.category_to_extract != "all" and + std::find(categories.begin(), categories.end(), opt.category_to_extract) == categories.end()) { std::string options = ""; for (auto i: categories) options += i + " "; PLOG_ERROR << "Cannot extract " << opt.category_to_extract << ", please chose one of [ all " << options << "]"; return 1; - } else if (opt.run_extract){ + } else if (opt.run_extract) { if (opt.prefix == "") opt.prefix = "charon"; std::vector to_extract; @@ -304,19 +326,20 @@ int classify_main(ClassifyArguments & opt) else to_extract.push_back(opt.category_to_extract); const auto extension = get_extension(opt.read_file); - for (const auto & category : to_extract){ + for (const auto &category: to_extract) { const auto category_index = index.get_category_index(category); if (opt.is_paired) { - opt.extract_category_to_file[category_index].push_back(opt.prefix + "_" + category + "_1" + extension + ".gz"); - opt.extract_category_to_file[category_index].push_back(opt.prefix + "_" + category + "_2" + extension + ".gz"); + opt.extract_category_to_file[category_index].push_back( + opt.prefix + "_" + category + "_1" + extension + ".gz"); + opt.extract_category_to_file[category_index].push_back( + opt.prefix + "_" + category + "_2" + extension + ".gz"); } else { opt.extract_category_to_file[category_index].push_back(opt.prefix + "_" + category + extension + ".gz"); } } } - if (opt.dist != "gamma" and opt.dist != "beta") - { + if (opt.dist != "gamma" and opt.dist != "beta") { PLOG_ERROR << "Supported distributions are [gamma , beta]"; return 1; } From b642c96f1a965e2776a1c72e6e7b9dc75b6cc668 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Mon, 23 Jun 2025 16:23:45 +0100 Subject: [PATCH 4/5] change confidence threshold to 7 for dehost --- include/dehost_arguments.hpp | 37 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/include/dehost_arguments.hpp b/include/dehost_arguments.hpp index fbd0b6c..4371c65 100644 --- a/include/dehost_arguments.hpp +++ b/include/dehost_arguments.hpp @@ -10,41 +10,39 @@ struct DehostArguments { // IO options std::filesystem::path read_file; std::filesystem::path read_file2; - bool is_paired { false }; + bool is_paired{false}; std::string db; // Output options - bool run_extract {false}; + bool run_extract{false}; std::string category_to_extract; std::string prefix; std::unordered_map> extract_category_to_file; - uint8_t chunk_size { 100 }; + uint8_t chunk_size{100}; // Stats options - float lo_hi_threshold {0.15}; - uint16_t num_reads_to_fit {5000}; + float lo_hi_threshold{0.15}; + uint16_t num_reads_to_fit{5000}; std::string dist{"kde"}; // thresholds for filtering - float min_quality { 15.0 }; - uint32_t min_length { 140 }; - float min_compression {0}; - uint8_t confidence_threshold{2}; - float confidence_probability_threshold{5}; - float host_unique_prop_lo_threshold{ 0.05 }; - float min_proportion_difference { 0.04 }; - float min_prob_difference{ 0 }; - + float min_quality{15.0}; + uint32_t min_length{140}; + float min_compression{0}; + uint8_t confidence_threshold{7}; + float confidence_probability_threshold{0}; + float host_unique_prop_lo_threshold{0.05}; + float min_proportion_difference{0.04}; + float min_prob_difference{0}; // General options - std::string log_file {"charon.log"}; - uint8_t threads { 1 }; - uint8_t verbosity { 0 }; + std::string log_file{"charon.log"}; + uint8_t 
threads{1}; + uint8_t verbosity{0}; - std::string to_string() - { + std::string to_string() { std::string ss; ss += "\n\nDehost Arguments:\n\n"; @@ -70,7 +68,6 @@ struct DehostArguments { ss += "\tmin_prob_difference:\t\t" + std::to_string(min_prob_difference) + "\n\n"; - ss += "\tlog_file:\t\t\t" + log_file + "\n"; ss += "\tthreads:\t\t\t" + std::to_string(threads) + "\n"; ss += "\tverbosity:\t\t\t" + std::to_string(verbosity) + "\n\n"; From 96563e46fdb71b7b67557131ad85eeb453c3fd26 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Mon, 23 Jun 2025 16:25:30 +0100 Subject: [PATCH 5/5] reformat code --- CMakeLists.txt | 42 ++-- include/classify_arguments.hpp | 27 +- include/classify_main.hpp | 7 +- include/classify_stats.hpp | 23 +- include/counts.hpp | 35 ++- include/dehost_main.hpp | 7 +- include/index.hpp | 297 +++++++++++----------- include/index_arguments.hpp | 21 +- include/index_main.hpp | 16 +- include/input_stats.hpp | 84 +++---- include/input_summary.hpp | 99 ++++---- include/load_index.hpp | 2 +- include/result.hpp | 310 +++++++++++------------ include/store_index.hpp | 3 +- include/utils.hpp | 30 ++- src/dehost_main.cpp | 435 +++++++++++++++++---------------- src/index_main.cpp | 152 ++++++------ src/load_index.cpp | 3 +- src/main.cpp | 17 +- src/utils.cpp | 112 ++++----- 20 files changed, 846 insertions(+), 876 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a3f0bb7..ee7dce7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # charon # ============================================================================= set(PROJECT_NAME_STR charon) -cmake_minimum_required( VERSION 3.9 FATAL_ERROR ) +cmake_minimum_required(VERSION 3.9 FATAL_ERROR) add_custom_target(version ${CMAKE_COMMAND} -D SRC=${CMAKE_SOURCE_DIR}/version.h.in @@ -10,7 +10,7 @@ add_custom_target(version -P ${CMAKE_SOURCE_DIR}/cmake/GenerateVersion.cmake ) -project( ${PROJECT_NAME_STR} LANGUAGES CXX ) +project(${PROJECT_NAME_STR} LANGUAGES CXX) # Use C++20 set(CMAKE_CXX_STANDARD 20) @@ -22,34 +22,34 @@ set(CMAKE_CXX_EXTENSIONS OFF) # ----------------------------------------------------------------------------- # dependencies and 3rd party libraries # ----------------------------------------------------------------------------- -set (PROGRAM_SUBMODULES_DIR - "${CMAKE_CURRENT_LIST_DIR}/lib" - CACHE STRING "Directory containing submodules." +set(PROGRAM_SUBMODULES_DIR + "${CMAKE_CURRENT_LIST_DIR}/lib" + CACHE STRING "Directory containing submodules." 
) # Specify the directories where to store the built archives, libraries and executables -set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") -set (CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") -set (CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") -set (CMAKE_INSTALL_BINDIR "bin") +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") +set(CMAKE_INSTALL_BINDIR "bin") # Require OPENMP find_package(OpenMP REQUIRED) -find_package( ZLIB REQUIRED ) +find_package(ZLIB REQUIRED) -set (CPM_INDENT "CMake Package Manager CPM: ") -include (${PROJECT_SOURCE_DIR}/cmake/CPM.cmake) -CPMUsePackageLock (${PROJECT_SOURCE_DIR}/cmake/package-lock.cmake) +set(CPM_INDENT "CMake Package Manager CPM: ") +include(${PROJECT_SOURCE_DIR}/cmake/CPM.cmake) +CPMUsePackageLock(${PROJECT_SOURCE_DIR}/cmake/package-lock.cmake) -CPMGetPackage (seqan3) -CPMGetPackage (plog) -CPMGetPackage (gcem) -CPMGetPackage (statslib) -CPMGetPackage (bzip2) -CPMGetPackage (zlib) -CPMGetPackage (gzip) +CPMGetPackage(seqan3) +CPMGetPackage(plog) +CPMGetPackage(gcem) +CPMGetPackage(statslib) +CPMGetPackage(bzip2) +CPMGetPackage(zlib) +CPMGetPackage(gzip) -add_definitions(-D ZLIB_CONST ) +add_definitions(-D ZLIB_CONST) # ----------------------------------------------------------------------------- # install diff --git a/include/classify_arguments.hpp b/include/classify_arguments.hpp index 994e8e0..ae91dee 100644 --- a/include/classify_arguments.hpp +++ b/include/classify_arguments.hpp @@ -11,36 +11,35 @@ struct ClassifyArguments { // IO options std::filesystem::path read_file; std::filesystem::path read_file2; - bool is_paired { false }; + bool is_paired{false}; std::string db; - uint8_t chunk_size { 100 }; + uint8_t chunk_size{100}; // Stats options - float lo_hi_threshold {0.15}; - uint16_t num_reads_to_fit {5000}; + float lo_hi_threshold{0.15}; + uint16_t num_reads_to_fit{5000}; std::string dist{"beta"}; // thresholds for filtering - float min_quality { 10.0 }; - uint32_t min_length { 140 }; - float min_compression {0.15}; + float min_quality{10.0}; + uint32_t min_length{140}; + float min_compression{0.15}; uint8_t confidence_threshold{2}; - float min_proportion_difference { 0.00 }; + float min_proportion_difference{0.00}; // Output options - bool run_extract {false}; + bool run_extract{false}; std::string category_to_extract; std::string prefix; std::unordered_map> extract_category_to_file; // General options - std::string log_file {"charon.log"}; - uint8_t threads { 1 }; - uint8_t verbosity { 0 }; + std::string log_file{"charon.log"}; + uint8_t threads{1}; + uint8_t verbosity{0}; - std::string to_string() - { + std::string to_string() { std::string ss; ss += "\n\nClassify Arguments:\n\n"; diff --git a/include/classify_main.hpp b/include/classify_main.hpp index a221d94..5bac524 100644 --- a/include/classify_main.hpp +++ b/include/classify_main.hpp @@ -11,13 +11,14 @@ #include "result.hpp" class Index; + struct ClassifyArguments; -void setup_classify_subcommand(CLI::App& app); +void setup_classify_subcommand(CLI::App &app); -void classify_reads(const ClassifyArguments& opt, const Index& index); +void classify_reads(const ClassifyArguments &opt, const Index &index); -int classify_main(ClassifyArguments & opt); +int classify_main(ClassifyArguments &opt); #endif // CHARON_CLASSIFY_MAIN_H diff --git a/include/classify_stats.hpp b/include/classify_stats.hpp index 12be7cd..6b1d36b 
100644 --- a/include/classify_stats.hpp +++ b/include/classify_stats.hpp @@ -217,7 +217,7 @@ struct KDEParams { std::sort(dataset.begin(), dataset.end()); } - float quantile(const float& alpha) const { + float quantile(const float &alpha) const { int idx = std::ceil((1. - alpha) * dataset.size()); return dataset[idx]; } @@ -227,7 +227,7 @@ struct KDEParams { const auto mu = mean(dataset); const auto var = variance(dataset, mu); const auto std = sqrt(var); - const auto A = std::min(iqr/1.34, std); + const auto A = std::min(iqr / 1.34, std); h = 0.9 * A * pow(dataset.size(), -0.2); } @@ -239,16 +239,16 @@ struct KDEParams { PLOG_INFO << "Fitting KDE to data with factor " << h; } - float K(const float & x) const { - return std::exp(-std::pow(x,2)/2)/sqrt(2*3.141592653589793238463); + float K(const float &x) const { + return std::exp(-std::pow(x, 2) / 2) / sqrt(2 * 3.141592653589793238463); } - float prob(const float & x) const { + float prob(const float &x) const { float total_sum = 0; - for (const auto &xi : dataset){ + for (const auto &xi: dataset) { total_sum += K((x - xi) / h); } - return total_sum/(h*dataset.size()); + return total_sum / (h * dataset.size()); } }; @@ -267,8 +267,8 @@ class Model { GammaParams g_neg{10, 0, 0.005}; BetaParams b_pos{6, 4}; BetaParams b_neg{6, 40}; - KDEParams k_pos{default_pos_data,0.1}; - KDEParams k_neg{default_neg_data,0.001}; + KDEParams k_pos{default_pos_data, 0.1}; + KDEParams k_neg{default_neg_data, 0.001}; public: Model() = default; @@ -308,7 +308,8 @@ class Model { g_neg.fit_loc(training_data.neg); //auto ad = g_neg.calculate_anderson_darling(training_data.neg); PLOG_INFO << "Model " << +id << " using default for g_neg data with Gamma (shape:" << g_neg.shape - << ", loc: " << g_neg.loc << ", scale: " << g_neg.scale << ")";//. Anderson-darling statistic is " << ad; + << ", loc: " << g_neg.loc << ", scale: " << g_neg.scale + << ")";//. 
Anderson-darling statistic is " << ad; } } @@ -447,7 +448,7 @@ class StatsModel { confidence_probability_threshold_(opt.confidence_probability_threshold), host_unique_prop_lo_threshold_(opt.host_unique_prop_lo_threshold), min_proportion_difference_(opt.min_proportion_difference), - min_prob_difference_(opt.min_prob_difference){ + min_prob_difference_(opt.min_prob_difference) { for (auto i = 0; i < summary.num_categories(); ++i) { models_.emplace_back(Model(i, opt.dist)); training_data_.emplace_back(TrainingData(opt, i)); diff --git a/include/counts.hpp b/include/counts.hpp index 3853e60..fe43e70 100644 --- a/include/counts.hpp +++ b/include/counts.hpp @@ -10,44 +10,43 @@ // Extremly simple Counts class - represents a lower diagonal square matrix including the diagonal, // expecting [row,col]==[col,row] so can store it once not twice template -class Counts -{ +class Counts { public: - Counts(){}; - Counts(size_t size) - { + Counts() {}; + + Counts(size_t size) { assert(size > 0); mRows = size; - mData.resize(mRows * (mRows+1) / 2); + mData.resize(mRows * (mRows + 1) / 2); mData.shrink_to_fit(); } - void set_size(size_t size) - { + + void set_size(size_t size) { assert(size > 0); mRows = size; - mData.resize(mRows * (mRows+1) / 2); + mData.resize(mRows * (mRows + 1) / 2); mData.shrink_to_fit(); } - T& operator()(size_t row, size_t col) - { + + T &operator()(size_t row, size_t col) { assert(row < mRows); assert(col < mRows); - if (row < col) - { - col,row = row,col; + if (row < col) { + col, row = row, col; } return mData[row * (row + 1) / 2 + col]; } - const T& operator()(size_t row, size_t col) const - { + + const T &operator()(size_t row, size_t col) const { assert(row >= 0 && row < mRows); assert(col >= 0 && row < mRows); return mData[row * (row + 1) / 2 + col]; } - size_t rows() const noexcept - { + + size_t rows() const noexcept { return mRows; } + protected: size_t mRows; std::vector mData; diff --git a/include/dehost_main.hpp b/include/dehost_main.hpp index cab29b4..1f451b7 100644 --- a/include/dehost_main.hpp +++ b/include/dehost_main.hpp @@ -13,13 +13,14 @@ #include "result.hpp" class Index; + struct DehostArguments; -void setup_dehost_subcommand(CLI::App& app); +void setup_dehost_subcommand(CLI::App &app); -void dehost_reads(const DehostArguments& opt, const Index& index); +void dehost_reads(const DehostArguments &opt, const Index &index); -int dehost_main(DehostArguments & opt); +int dehost_main(DehostArguments &opt); #endif // CHARON_DEHOST_MAIN_H diff --git a/include/index.hpp b/include/index.hpp index 97b81d0..f1b0140 100644 --- a/include/index.hpp +++ b/include/index.hpp @@ -16,166 +16,151 @@ #include -class Index -{ - private: - uint8_t window_size_{}; - uint8_t kmer_size_{}; - double max_fpr_{}; - InputSummary summary_{}; - InputStats stats_{}; - seqan3::interleaved_bloom_filter ibf_{}; - - public: - static constexpr uint32_t version{3u}; - - Index() = default; - Index(Index const &) = default; - Index(Index &&) = default; - Index & operator=(Index const &) = default; - Index & operator=(Index &&) = default; - ~Index() = default; - - Index(const IndexArguments & arguments, const InputSummary & summary, const InputStats & stats, const seqan3::interleaved_bloom_filter& ibf): - window_size_{arguments.window_size}, - kmer_size_{arguments.kmer_size}, - max_fpr_{arguments.max_fpr}, - summary_{summary}, - stats_{stats}, - ibf_(ibf) - {} - - uint8_t window_size() const - { - return window_size_; +class Index { +private: + uint8_t window_size_{}; + uint8_t kmer_size_{}; + double 
max_fpr_{}; + InputSummary summary_{}; + InputStats stats_{}; + seqan3::interleaved_bloom_filter ibf_{}; + +public: + static constexpr uint32_t version{3u}; + + Index() = default; + + Index(Index const &) = default; + + Index(Index &&) = default; + + Index &operator=(Index const &) = default; + + Index &operator=(Index &&) = default; + + ~Index() = default; + + Index(const IndexArguments &arguments, const InputSummary &summary, const InputStats &stats, + const seqan3::interleaved_bloom_filter &ibf) : + window_size_{arguments.window_size}, + kmer_size_{arguments.kmer_size}, + max_fpr_{arguments.max_fpr}, + summary_{summary}, + stats_{stats}, + ibf_(ibf) {} + + uint8_t window_size() const { + return window_size_; + } + + uint8_t kmer_size() const { + return kmer_size_; + } + + uint8_t num_bins() const { + return summary_.num_bins; + } + + uint8_t num_categories() const { + return summary_.num_categories(); + } + + std::vector categories() const { + return summary_.categories; + } + + uint8_t get_host_index() const { + const auto index1 = summary_.category_index("host"); + const auto index2 = summary_.category_index("human"); + auto index = std::min(index1, index2); + if (index == std::numeric_limits::max()) + PLOG_ERROR << "Index does not contain 'host' or 'human' as a category "; + assert(index < std::numeric_limits::max()); + return index; + } + + uint8_t get_category_index(const std::string category) const { + const auto index = summary_.category_index(category); + if (index == std::numeric_limits::max()) + PLOG_ERROR << "Index does not contain category "; + assert(index < std::numeric_limits::max()); + return index; + } + + double max_fpr() const { + return max_fpr_; + } + + InputStats stats() const { + return stats_; + } + + InputSummary summary() const { + return summary_; + } + + std::unordered_map bin_to_category() const { + return summary_.bin_to_category; + } + + seqan3::interleaved_bloom_filter const &ibf() const { + return ibf_; + } + + seqan3::interleaved_bloom_filter::membership_agent_type agent() const { + return ibf_.membership_agent(); + } + + /*!\cond DEV + * \brief Serialisation support function. + * \tparam archive_t Type of `archive`; must satisfy seqan3::cereal_archive. + * \param[in] archive The archive being serialised from/to. + * + * \attention These functions are never called directly. 
+ * \sa https://docs.seqan.de/seqan/3.2.0/group__io.html#serialisation + */ + template + void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t &archive) { + try { + archive(window_size_); + archive(kmer_size_); + archive(max_fpr_); + archive(summary_); + archive(stats_); + archive(ibf_); } - - uint8_t kmer_size() const - { - return kmer_size_; - } - - uint8_t num_bins() const - { - return summary_.num_bins; - } - - uint8_t num_categories() const - { - return summary_.num_categories(); - } - - std::vector categories() const - { - return summary_.categories; - } - - uint8_t get_host_index() const - { - const auto index1 = summary_.category_index("host"); - const auto index2 = summary_.category_index("human"); - auto index = std::min(index1, index2); - if (index == std::numeric_limits::max()) - PLOG_ERROR << "Index does not contain 'host' or 'human' as a category "; - assert(index < std::numeric_limits::max()); - return index; - } - - uint8_t get_category_index(const std::string category) const - { - const auto index = summary_.category_index(category); - if (index == std::numeric_limits::max()) - PLOG_ERROR << "Index does not contain category "; - assert(index < std::numeric_limits::max()); - return index; - } - - double max_fpr() const - { - return max_fpr_; + // GCOVR_EXCL_START + catch (std::exception const &e) { + PLOG_ERROR << "Cannot read index: " + std::string{e.what()}; + exit(1); } - InputStats stats() const - { - return stats_; + } + + /* \brief Serialisation support function. Do not load the actual data. + * \tparam archive_t Type of `archive`; must satisfy seqan3::cereal_input_archive. + * \param[in] archive The archive being serialised from/to. + * + * \attention These functions are never called directly. + * \sa https://docs.seqan.de/seqan/3.2.0/group__io.html#serialisation + */ + template + void load_parameters(archive_t &archive) { + try { + archive(window_size_); + archive(kmer_size_); + archive(max_fpr_); + archive(summary_); + archive(stats_); + archive(ibf_); } - - InputSummary summary() const - { - return summary_; - } - - std::unordered_map bin_to_category() const - { - return summary_.bin_to_category; - } - - seqan3::interleaved_bloom_filter const & ibf() const - { - return ibf_; - } - - seqan3::interleaved_bloom_filter::membership_agent_type agent() const - { - return ibf_.membership_agent(); - } - - /*!\cond DEV - * \brief Serialisation support function. - * \tparam archive_t Type of `archive`; must satisfy seqan3::cereal_archive. - * \param[in] archive The archive being serialised from/to. - * - * \attention These functions are never called directly. - * \sa https://docs.seqan.de/seqan/3.2.0/group__io.html#serialisation - */ - template - void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive) - { - try - { - archive(window_size_); - archive(kmer_size_); - archive(max_fpr_); - archive(summary_); - archive(stats_); - archive(ibf_); - } - // GCOVR_EXCL_START - catch (std::exception const & e) - { - PLOG_ERROR << "Cannot read index: " + std::string{e.what()}; - exit(1); - } - - } - - /* \brief Serialisation support function. Do not load the actual data. - * \tparam archive_t Type of `archive`; must satisfy seqan3::cereal_input_archive. - * \param[in] archive The archive being serialised from/to. - * - * \attention These functions are never called directly. 
- * \sa https://docs.seqan.de/seqan/3.2.0/group__io.html#serialisation - */ - template - void load_parameters(archive_t & archive) - { - try - { - archive(window_size_); - archive(kmer_size_); - archive(max_fpr_); - archive(summary_); - archive(stats_); - archive(ibf_); - } - // GCOVR_EXCL_START - catch (std::exception const & e) - { - PLOG_ERROR << "Cannot read index: " + std::string{e.what()}; - exit(1); - } + // GCOVR_EXCL_START + catch (std::exception const &e) { + PLOG_ERROR << "Cannot read index: " + std::string{e.what()}; + exit(1); } - //!\endcond - }; + } + //!\endcond +}; #endif // CHARON_INDEX_H diff --git a/include/index_arguments.hpp b/include/index_arguments.hpp index b02d1c8..6b26105 100644 --- a/include/index_arguments.hpp +++ b/include/index_arguments.hpp @@ -13,22 +13,21 @@ struct IndexArguments { std::string tmp_dir; // kmer/sketching - uint8_t window_size { 41 }; - uint8_t kmer_size { 19 }; + uint8_t window_size{41}; + uint8_t kmer_size{19}; // IBF options - mutable size_t bits {std::numeric_limits::max()-2}; // Allow to change bits for each partition - uint8_t num_hash {3}; - double max_fpr {0.01}; + mutable size_t bits{std::numeric_limits::max() - 2}; // Allow to change bits for each partition + uint8_t num_hash{3}; + double max_fpr{0.01}; // General options - std::string log_file {"charon.log"}; - uint8_t threads { 1 }; - uint8_t verbosity { 0 }; - bool optimize { false }; + std::string log_file{"charon.log"}; + uint8_t threads{1}; + uint8_t verbosity{0}; + bool optimize{false}; - std::string to_string() - { + std::string to_string() { std::string ss; ss += "\n\nIndex Arguments:\n\n"; diff --git a/include/index_main.hpp b/include/index_main.hpp index 6e99ef8..a323ab6 100644 --- a/include/index_main.hpp +++ b/include/index_main.hpp @@ -12,21 +12,25 @@ #include "index_arguments.hpp" class Index; + class InputStats; + class InputSummary; -void setup_index_subcommand(CLI::App& app); +void setup_index_subcommand(CLI::App &app); -InputSummary parse_input_file(const std::filesystem::path& input_file); +InputSummary parse_input_file(const std::filesystem::path &input_file); -InputStats count_and_store_hashes(const IndexArguments& opt, const InputSummary& summary); +InputStats count_and_store_hashes(const IndexArguments &opt, const InputSummary &summary); -std::unordered_map> optimize_layout(const IndexArguments& opt, InputSummary& summary, InputStats& stats); +std::unordered_map> +optimize_layout(const IndexArguments &opt, InputSummary &summary, InputStats &stats); -Index build_index(const IndexArguments& opt, const InputSummary& summary, InputStats& stats, const std::unordered_map>& bucket_to_bins_map); +Index build_index(const IndexArguments &opt, const InputSummary &summary, InputStats &stats, + const std::unordered_map> &bucket_to_bins_map); -int index_main(IndexArguments & opt); +int index_main(IndexArguments &opt); #endif // CHARON_INDEX_MAIN_H diff --git a/include/input_stats.hpp b/include/input_stats.hpp index a2ee2cb..77147b1 100644 --- a/include/input_stats.hpp +++ b/include/input_stats.hpp @@ -12,71 +12,67 @@ #include -struct InputStats -{ - //uint8_t num_bins{0}; - uint32_t num_files{0}; - std::unordered_map records_per_bin{}; - std::unordered_map hashes_per_bin{}; - - public: - InputStats() = default; - InputStats(InputStats const &) = default; - InputStats(InputStats &&) = default; - InputStats & operator=(InputStats const &) = default; - InputStats & operator=(InputStats &&) = default; - ~InputStats() = default; - - std::vector > bins_by_size() const - { - 
std::vector > sorted_pairs; - for (auto& it : hashes_per_bin) { - sorted_pairs.push_back(it); - } - std::sort(sorted_pairs.begin(), sorted_pairs.end(), [](auto &left, auto &right) { - return left.second < right.second; - }); - return sorted_pairs; - } +struct InputStats { + //uint8_t num_bins{0}; + uint32_t num_files{0}; + std::unordered_map records_per_bin{}; + std::unordered_map hashes_per_bin{}; + +public: + InputStats() = default; + + InputStats(InputStats const &) = default; + + InputStats(InputStats &&) = default; + + InputStats &operator=(InputStats const &) = default; - uint64_t max_num_hashes() - { - auto sorted_pairs = bins_by_size(); - return sorted_pairs.back().second; + InputStats &operator=(InputStats &&) = default; + + ~InputStats() = default; + + std::vector > bins_by_size() const { + std::vector > sorted_pairs; + for (auto &it: hashes_per_bin) { + sorted_pairs.push_back(it); } + std::sort(sorted_pairs.begin(), sorted_pairs.end(), [](auto &left, auto &right) { + return left.second < right.second; + }); + return sorted_pairs; + } + + uint64_t max_num_hashes() { + auto sorted_pairs = bins_by_size(); + return sorted_pairs.back().second; + } - template - void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive) - { - try - { + template + void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t &archive) { + try { //archive(num_bins); archive(num_files); archive(records_per_bin); archive(hashes_per_bin); } // GCOVR_EXCL_START - catch (std::exception const & e) - { + catch (std::exception const &e) { PLOG_ERROR << "Cannot read input_stats: " + std::string{e.what()}; exit(1); } } - template - void load_parameters(archive_t & archive) - { - try - { + template + void load_parameters(archive_t &archive) { + try { //archive(num_bins); archive(num_files); archive(records_per_bin); archive(hashes_per_bin); } // GCOVR_EXCL_START - catch (std::exception const & e) - { + catch (std::exception const &e) { PLOG_ERROR << "Cannot read input_stats: " + std::string{e.what()}; exit(1); } diff --git a/include/input_summary.hpp b/include/input_summary.hpp index 32901cd..9b1abfd 100644 --- a/include/input_summary.hpp +++ b/include/input_summary.hpp @@ -13,87 +13,80 @@ #include #include -struct InputSummary -{ +struct InputSummary { uint8_t num_bins{0}; std::vector categories; std::vector> filepath_to_bin; std::unordered_map bin_to_category; - public: - InputSummary() = default; - InputSummary(InputSummary const &) = default; - InputSummary(InputSummary &&) = default; - InputSummary & operator=(InputSummary const &) = default; - InputSummary & operator=(InputSummary &&) = default; - ~InputSummary() = default; - - uint8_t num_categories() const - { - return categories.size(); - } +public: + InputSummary() = default; - uint8_t category_index(const std::string category) const - { - for (auto i=0; i::max(); - } + InputSummary(InputSummary const &) = default; - uint8_t host_category_index() const - { - auto index1 = category_index("human"); - auto index2 = category_index("host"); - auto index = std::min(index1, index2); - if (index == std::numeric_limits::max()) - PLOG_ERROR << "Neither 'human' nor 'host' appear as categories in the index"; - assert(index != std::numeric_limits::max()); - return index; - } + InputSummary(InputSummary &&) = default; + + InputSummary &operator=(InputSummary const &) = default; - std::string category_name(const uint8_t index) const - { - if (index > categories.size()) - return ""; - else - return categories.at(index); + InputSummary &operator=(InputSummary &&) = default; + + 
~InputSummary() = default; + + uint8_t num_categories() const { + return categories.size(); + } + + uint8_t category_index(const std::string category) const { + for (auto i = 0; i < categories.size(); ++i) { + if (category == categories.at(i)) + return i; } + return std::numeric_limits::max(); + } + + uint8_t host_category_index() const { + auto index1 = category_index("human"); + auto index2 = category_index("host"); + auto index = std::min(index1, index2); + if (index == std::numeric_limits::max()) + PLOG_ERROR << "Neither 'human' nor 'host' appear as categories in the index"; + assert(index != std::numeric_limits::max()); + return index; + } + + std::string category_name(const uint8_t index) const { + if (index > categories.size()) + return ""; + else + return categories.at(index); + } - template - void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive) - { - try - { + template + void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t &archive) { + try { archive(num_bins); archive(categories); archive(filepath_to_bin); archive(bin_to_category); } // GCOVR_EXCL_START - catch (std::exception const & e) - { + catch (std::exception const &e) { PLOG_ERROR << "Cannot read input_summary: " + std::string{e.what()}; exit(1); } } - template - void load_parameters(archive_t & archive) - { - try - { + template + void load_parameters(archive_t &archive) { + try { archive(num_bins); archive(categories); archive(filepath_to_bin); archive(bin_to_category); } // GCOVR_EXCL_START - catch (std::exception const & e) - { + catch (std::exception const &e) { PLOG_ERROR << "Cannot read input_summary: " + std::string{e.what()}; exit(1); } diff --git a/include/load_index.hpp b/include/load_index.hpp index 0b9ad0d..91f75a4 100644 --- a/include/load_index.hpp +++ b/include/load_index.hpp @@ -6,6 +6,6 @@ #include #include -void load_index(Index & index, std::filesystem::path const & path); +void load_index(Index &index, std::filesystem::path const &path); #endif // CHARON_LOAD_INDEX_MAIN_H \ No newline at end of file diff --git a/include/result.hpp b/include/result.hpp index e6661af..39f4a6a 100644 --- a/include/result.hpp +++ b/include/result.hpp @@ -15,15 +15,13 @@ #include "input_summary.hpp" #include "classify_stats.hpp" -struct ResultSummary -{ +struct ResultSummary { std::vector classified_counts; uint64_t unclassified_count{0}; uint64_t extracted_count{0}; - ResultSummary(const uint8_t size): - classified_counts(size,0) - {}; + ResultSummary(const uint8_t size) : + classified_counts(size, 0) {}; }; template @@ -35,195 +33,185 @@ struct ReadRecord { }; template -class Result -{ - private: - InputSummary input_summary_; - ResultSummary result_summary_; - StatsModel stats_model_; - std::vector> cached_reads_; - - bool run_extract_; - std::unordered_map>> extract_handles_; - - public: - Result() = default; - Result(Result const &) = default; - Result(Result &&) = default; - Result & operator=(Result const &) = default; - Result & operator=(Result &&) = default; - ~Result() = default; - - Result(const ClassifyArguments& opt, const InputSummary & summary): - input_summary_{summary}, - result_summary_(summary.num_categories()), - run_extract_(opt.run_extract) - { - stats_model_ = StatsModel(opt, summary); - if (opt.run_extract){ - for (const auto [category_index,extract_files] : opt.extract_category_to_file) { - for (const auto extract_file : extract_files) - extract_handles_[category_index].push_back(seqan3::sequence_file_output{extract_file}); - cached_reads_.reserve(opt.num_reads_to_fit*summary.num_categories()*4); - } - 
} +class Result { +private: + InputSummary input_summary_; + ResultSummary result_summary_; + StatsModel stats_model_; + std::vector> cached_reads_; + bool run_extract_; + std::unordered_map>> extract_handles_; - }; +public: + Result() = default; - Result(const DehostArguments& opt, const InputSummary & summary): - input_summary_{summary}, - result_summary_(summary.num_categories()), - run_extract_(opt.run_extract) - { - stats_model_ = StatsModel(opt, summary); - if (opt.run_extract){ - for (const auto [category_index,extract_files] : opt.extract_category_to_file) { - for (const auto extract_file : extract_files) - extract_handles_[category_index].push_back(seqan3::sequence_file_output{extract_file}); - cached_reads_.reserve(opt.num_reads_to_fit*summary.num_categories()*4); - } - } - }; + Result(Result const &) = default; - const InputSummary& input_summary() const - { - return input_summary_; + Result(Result &&) = default; + + Result &operator=(Result const &) = default; + + Result &operator=(Result &&) = default; + + ~Result() = default; + + Result(const ClassifyArguments &opt, const InputSummary &summary) : + input_summary_{summary}, + result_summary_(summary.num_categories()), + run_extract_(opt.run_extract) { + stats_model_ = StatsModel(opt, summary); + if (opt.run_extract) { + for (const auto [category_index, extract_files]: opt.extract_category_to_file) { + for (const auto extract_file: extract_files) + extract_handles_[category_index].push_back(seqan3::sequence_file_output{extract_file}); + cached_reads_.reserve(opt.num_reads_to_fit * summary.num_categories() * 4); + } } - const uint8_t category_index(const std::string& category) const - { - return input_summary_.category_index(category); + + }; + + Result(const DehostArguments &opt, const InputSummary &summary) : + input_summary_{summary}, + result_summary_(summary.num_categories()), + run_extract_(opt.run_extract) { + stats_model_ = StatsModel(opt, summary); + if (opt.run_extract) { + for (const auto [category_index, extract_files]: opt.extract_category_to_file) { + for (const auto extract_file: extract_files) + extract_handles_[category_index].push_back(seqan3::sequence_file_output{extract_file}); + cached_reads_.reserve(opt.num_reads_to_fit * summary.num_categories() * 4); + } } + }; - uint8_t classify_read(ReadEntry& read_entry, const bool dehost=false) - { - PLOG_VERBOSE << "Classify read " << read_entry.read_id(); - if (dehost) - read_entry.dehost(stats_model_, input_summary_.host_category_index()); - else - read_entry.classify(stats_model_); + const InputSummary &input_summary() const { + return input_summary_; + } + + const uint8_t category_index(const std::string &category) const { + return input_summary_.category_index(category); + } + + uint8_t classify_read(ReadEntry &read_entry, const bool dehost = false) { + PLOG_VERBOSE << "Classify read " << read_entry.read_id(); + if (dehost) + read_entry.dehost(stats_model_, input_summary_.host_category_index()); + else + read_entry.classify(stats_model_); #pragma omp critical(print_result) - read_entry.print_assignment_result(input_summary_); + read_entry.print_assignment_result(input_summary_); #pragma omp critical(update_result_count) - { - const auto &call = read_entry.call(); - if (call < std::numeric_limits::max()) - { - result_summary_.classified_counts[call] += 1; - } else { - result_summary_.unclassified_count += 1; - } + { + const auto &call = read_entry.call(); + if (call < std::numeric_limits::max()) { + result_summary_.classified_counts[call] += 1; + } else { + 
result_summary_.unclassified_count += 1; } - return read_entry.call(); - } + return read_entry.call(); - void extract_read(const uint8_t category_index, const record_type& record) - { + } + + void extract_read(const uint8_t category_index, const record_type &record) { #pragma omp critical(extract_read) - extract_handles_[category_index][0].push_back(record); - } + extract_handles_[category_index][0].push_back(record); + } - void extract_paired_read(const uint8_t category_index, const record_type& record, const record_type& record2) - { + void extract_paired_read(const uint8_t category_index, const record_type &record, const record_type &record2) { #pragma omp critical(extract_read) - extract_handles_[category_index][0].push_back(record); + extract_handles_[category_index][0].push_back(record); #pragma omp critical(extract_read2) - extract_handles_[category_index][1].push_back(record2); - } - - void add_read(ReadEntry& read_entry, const record_type& record, bool dehost=false){ - if (stats_model_.ready()) { - auto category_index = classify_read(read_entry, dehost); - if (run_extract_ and extract_handles_.find(category_index) != extract_handles_.end()){ - extract_read(category_index, record); - } - } else { - PLOG_VERBOSE << "Add read " << read_entry.read_id() << " to training "; + extract_handles_[category_index][1].push_back(record2); + } + + void add_read(ReadEntry &read_entry, const record_type &record, bool dehost = false) { + if (stats_model_.ready()) { + auto category_index = classify_read(read_entry, dehost); + if (run_extract_ and extract_handles_.find(category_index) != extract_handles_.end()) { + extract_read(category_index, record); + } + } else { + PLOG_VERBOSE << "Add read " << read_entry.read_id() << " to training "; #pragma omp critical(add_to_cache) - { - bool training_complete = false; - if (cached_reads_.size() < cached_reads_.capacity()) - { - cached_reads_.emplace_back(ReadRecord(false, read_entry, record, record)); - training_complete = stats_model_.add_read_to_training_data(read_entry.unique_proportions()); - } else { - stats_model_.force_ready(); - training_complete = true; - } - - if (training_complete) - classify_cache(dehost); + { + bool training_complete = false; + if (cached_reads_.size() < cached_reads_.capacity()) { + cached_reads_.emplace_back(ReadRecord(false, read_entry, record, record)); + training_complete = stats_model_.add_read_to_training_data(read_entry.unique_proportions()); + } else { + stats_model_.force_ready(); + training_complete = true; } + + if (training_complete) + classify_cache(dehost); } } - - void add_paired_read(ReadEntry& read_entry, const record_type& record, const record_type& record2, const bool dehost=false){ - if (stats_model_.ready()) { - auto category_index = classify_read(read_entry, dehost); - if (run_extract_ and extract_handles_.find(category_index) != extract_handles_.end()){ - extract_paired_read(category_index, record, record2); - } - } else { - PLOG_VERBOSE << "Add read " << read_entry.read_id() << " to training "; + } + + void add_paired_read(ReadEntry &read_entry, const record_type &record, const record_type &record2, + const bool dehost = false) { + if (stats_model_.ready()) { + auto category_index = classify_read(read_entry, dehost); + if (run_extract_ and extract_handles_.find(category_index) != extract_handles_.end()) { + extract_paired_read(category_index, record, record2); + } + } else { + PLOG_VERBOSE << "Add read " << read_entry.read_id() << " to training "; #pragma omp critical(add_to_cache) - { - bool 
training_complete = false; - if (cached_reads_.size() < cached_reads_.capacity()) - { - cached_reads_.emplace_back(ReadRecord(true, read_entry, record, record2)); - training_complete = stats_model_.add_read_to_training_data(read_entry.unique_proportions()); - } else { - stats_model_.force_ready(); - training_complete = true; - } - - if (training_complete) - classify_cache(dehost); + { + bool training_complete = false; + if (cached_reads_.size() < cached_reads_.capacity()) { + cached_reads_.emplace_back(ReadRecord(true, read_entry, record, record2)); + training_complete = stats_model_.add_read_to_training_data(read_entry.unique_proportions()); + } else { + stats_model_.force_ready(); + training_complete = true; } + + if (training_complete) + classify_cache(dehost); } } - - void classify_cache(const bool dehost=false) - { - PLOG_VERBOSE << "Classify cached reads"; - - for (auto read_record : cached_reads_) - { - auto & read_entry = read_record.read; - const auto & record = read_record.record; - auto category_index = classify_read(read_entry, dehost); - if (run_extract_ and extract_handles_.find(category_index) != extract_handles_.end()){ - if (read_record.is_paired) - { - const auto & record2 = read_record.record2; - extract_paired_read(category_index, record, record2); - } else { - extract_read(category_index, record); - } + } + + void classify_cache(const bool dehost = false) { + PLOG_VERBOSE << "Classify cached reads"; + + for (auto read_record: cached_reads_) { + auto &read_entry = read_record.read; + const auto &record = read_record.record; + auto category_index = classify_read(read_entry, dehost); + if (run_extract_ and extract_handles_.find(category_index) != extract_handles_.end()) { + if (read_record.is_paired) { + const auto &record2 = read_record.record2; + extract_paired_read(category_index, record, record2); + } else { + extract_read(category_index, record); } } - cached_reads_.resize(0); - } - - void complete(const bool dehost=false) - { - classify_cache(dehost); } + cached_reads_.resize(0); + } + void complete(const bool dehost = false) { + classify_cache(dehost); + } - void print_summary() const { + void print_summary() const { - PLOG_INFO << "Results summary: "; - for (auto i = 0; i < result_summary_.classified_counts.size(); i++) { - const auto & category = input_summary_.categories.at(i); - PLOG_INFO << category << " :\t\t" << result_summary_.classified_counts.at(i); - } - PLOG_INFO << "unclassified :\t" << result_summary_.unclassified_count; + PLOG_INFO << "Results summary: "; + for (auto i = 0; i < result_summary_.classified_counts.size(); i++) { + const auto &category = input_summary_.categories.at(i); + PLOG_INFO << category << " :\t\t" << result_summary_.classified_counts.at(i); } - }; + PLOG_INFO << "unclassified :\t" << result_summary_.unclassified_count; + } +}; #endif // CHARON_RESULT_H diff --git a/include/store_index.hpp b/include/store_index.hpp index 9ffa2ce..e6f2f4c 100644 --- a/include/store_index.hpp +++ b/include/store_index.hpp @@ -9,8 +9,7 @@ #include -static inline void store_index(std::filesystem::path const & path, Index && index) -{ +static inline void store_index(std::filesystem::path const &path, Index &&index) { PLOG_INFO << "Saving index to file " << path; std::ofstream os{path, std::ios::binary}; cereal::BinaryOutputArchive oarchive{os}; diff --git a/include/utils.hpp b/include/utils.hpp index 4fdf67a..a48ab4b 100644 --- a/include/utils.hpp +++ b/include/utils.hpp @@ -14,33 +14,37 @@ class IndexArguments; -struct my_traits : 
seqan3::sequence_file_input_default_traits_dna -{ +struct my_traits : seqan3::sequence_file_input_default_traits_dna { using quality_alphabet = seqan3::phred94; }; // used to transform paths to absolute paths - designed to be used with CLI11 transform std::filesystem::path make_absolute(std::filesystem::path); -std::vector split(const std::string&, const std::string&); +std::vector split(const std::string &, const std::string &); bool ends_with(std::string str, std::string suffix); + bool starts_with(std::string str, std::string prefix); -void store_hashes( const std::string target, - const std::unordered_set< uint64_t >& hashes, - const std::string tmp_output_folder ); -std::vector< uint64_t > load_hashes( const std::string target, - const std::string tmp_output_folder ); -void delete_hashes( const std::vector& targets, const std::string tmp_output_folder ); +void store_hashes(const std::string target, + const std::unordered_set &hashes, + const std::string tmp_output_folder); + +std::vector load_hashes(const std::string target, + const std::string tmp_output_folder); -size_t bin_size_in_bits(const IndexArguments & opt, const uint64_t & num_elements); +void delete_hashes(const std::vector &targets, const std::string tmp_output_folder); -size_t max_num_hashes_for_fpr(const IndexArguments & opt); +size_t bin_size_in_bits(const IndexArguments &opt, const uint64_t &num_elements); -std::string sequence_to_string(const __type_pack_element<0, std::vector, std::string, std::vector>& input); +size_t max_num_hashes_for_fpr(const IndexArguments &opt); -float get_compression_ratio(const std::string& sequence); +std::string sequence_to_string( + const __type_pack_element<0, std::vector, std::string, std::vector> &input); + +float get_compression_ratio(const std::string &sequence); std::string get_extension(const std::filesystem::path); + #endif diff --git a/src/dehost_main.cpp b/src/dehost_main.cpp index 2cd6b75..0931157 100644 --- a/src/dehost_main.cpp +++ b/src/dehost_main.cpp @@ -20,202 +20,201 @@ #include #include -const std::vector default_pos_data = {0.603448, 0.480363, 0.411268, 0.444444, 0.392523, 0.31383 , - 0.281447, 0.53539 , 0.553792, 0.537906, 0.273994, 0.693023, +const std::vector default_pos_data = {0.603448, 0.480363, 0.411268, 0.444444, 0.392523, 0.31383, + 0.281447, 0.53539, 0.553792, 0.537906, 0.273994, 0.693023, 0.680751, 0.428571, 0.254005, 0.696133, 0.251462, 0.594203, 0.361582, 0.515789, 0.724771, 0.312821, 0.457627, 0.728614, 0.573222, 0.440476, 0.381633, 0.234127, 0.673988, 0.368421, 0.548387, 0.304688, 0.365348, 0.280899, 0.482315, 0.445283, - 0.449541, 0.59589 , 0.671362, 0.59854 , 0.445415, 0.394265, - 0.525597, 0.201365, 0.702703, 0.47505 , 0.509174, 0.692308, + 0.449541, 0.59589, 0.671362, 0.59854, 0.445415, 0.394265, + 0.525597, 0.201365, 0.702703, 0.47505, 0.509174, 0.692308, 0.230573, 0.553616, 0.653846, 0.690476, 0.432203, 0.674377, 0.298387, 0.626712, 0.589347, 0.545045, 0.624679, 0.240658, 0.254003, 0.746667, 0.357143, 0.606909, 0.344558, 0.409836, - 0.266998, 0.588235, 0.537815, 0.50641 , 0.371951, 0.415929, - 0.689266, 0.309735, 0.705645, 0.549296, 0.80754 , 0.398979, - 0.459941, 0.491647, 0.292453, 0.43871 , 0.649254, 0.309154, + 0.266998, 0.588235, 0.537815, 0.50641, 0.371951, 0.415929, + 0.689266, 0.309735, 0.705645, 0.549296, 0.80754, 0.398979, + 0.459941, 0.491647, 0.292453, 0.43871, 0.649254, 0.309154, 0.617021, 0.601671, 0.383721, 0.628571, 0.487936, 0.440367, 0.382465, 0.660297, 0.742489, 0.737143, 0.282759, 0.490196, 0.247191, 0.690994, 0.731602, 
0.666667, 0.535714, 0.322368, 0.694389, 0.273333, 0.783972, 0.307692, 0.284091, 0.661333, 0.366492, 0.733728, 0.352113, 0.803922, 0.269911, 0.366599, 0.642197, 0.662921, 0.294118, 0.234146, 0.345309, 0.364407, - 0.352941, 0.288462, 0.547619, 0.712121, 0.53211 , 0.510823, + 0.352941, 0.288462, 0.547619, 0.712121, 0.53211, 0.510823, 0.631579, 0.490132, 0.651007, 0.653846, 0.773931, 0.550265, - 0.56044 , 0.45977 , 0.445743, 0.643761, 0.232422, 0.564171, - 0.443966, 0.638356, 0.314516, 0.276398, 0.51746 , 0.745342, + 0.56044, 0.45977, 0.445743, 0.643761, 0.232422, 0.564171, + 0.443966, 0.638356, 0.314516, 0.276398, 0.51746, 0.745342, 0.366093, 0.471049, 0.205198, 0.643423, 0.581197, 0.453686, 0.587719, 0.305322, 0.473118, 0.582593, 0.595486, 0.573822, - 0.370192, 0.681661, 0.513514, 0.289655, 0.349364, 0.35873 , - 0.571429, 0.28125 , 0.785425, 0.689055, 0.675 , 0.255814, + 0.370192, 0.681661, 0.513514, 0.289655, 0.349364, 0.35873, + 0.571429, 0.28125, 0.785425, 0.689055, 0.675, 0.255814, 0.542553, 0.333333, 0.544643, 0.668831, 0.534483, 0.237838, - 0.372308, 0.548387, 0.227818, 0.33101 , 0.640931, 0.55814 , - 0.355993, 0.643443, 0.333333, 0.323288, 0.634146, 0.352 , + 0.372308, 0.548387, 0.227818, 0.33101, 0.640931, 0.55814, + 0.355993, 0.643443, 0.333333, 0.323288, 0.634146, 0.352, 0.243902, 0.333333, 0.663968, 0.246212, 0.540323, 0.565217, - 0.715847, 0.655431, 0.707736, 0.407792, 0.52924 , 0.511292, - 0.281195, 0.2875 , 0.323296, 0.230392, 0.262222, 0.371245, - 0.206025, 0.438525, 0.660377, 0.30426 , 0.261044, 0.360902, - 0.34 , 0.695 , 0.455462, 0.621891, 0.41003 , 0.349481, + 0.715847, 0.655431, 0.707736, 0.407792, 0.52924, 0.511292, + 0.281195, 0.2875, 0.323296, 0.230392, 0.262222, 0.371245, + 0.206025, 0.438525, 0.660377, 0.30426, 0.261044, 0.360902, + 0.34, 0.695, 0.455462, 0.621891, 0.41003, 0.349481, 0.656863, 0.482143, 0.379032, 0.389474, 0.658451, 0.295181, - 0.745614, 0.23622 , 0.520408, 0.570213, 0.476923, 0.336864, + 0.745614, 0.23622, 0.520408, 0.570213, 0.476923, 0.336864, 0.431227, 0.389381, 0.368601, 0.489879, 0.228682, 0.413636, - 0.511905, 0.34375 , 0.688235, 0.717131, 0.340996, 0.521739, + 0.511905, 0.34375, 0.688235, 0.717131, 0.340996, 0.521739, 0.302752, 0.724551, 0.247059, 0.634551, 0.375204, 0.585271, - 0.488372, 0.707071, 0.698305, 0.384615, 0.620743, 0.43738 , - 0.8 , 0.275 , 0.434783, 0.335079, 0.28789 , 0.242915, + 0.488372, 0.707071, 0.698305, 0.384615, 0.620743, 0.43738, + 0.8, 0.275, 0.434783, 0.335079, 0.28789, 0.242915, 0.657588, 0.664134, 0.255172, 0.636364, 0.349593, 0.849057, 0.371681, 0.535032, 0.383721, 0.597403, 0.694268, 0.588235, - 0.481013, 0.555556, 0.5 , 0.37931 , 0.259843, 0.681259, - 0.545455, 0.550725, 0.338583, 0.3085 , 0.437318, 0.540123, - 0.464481, 0.597598, 0.481928, 0.488889, 0.565543, 0.54549 , - 0.428571, 0.34891 , 0.62069 , 0.656428, 0.480392, 0.738295, - 0.475 , 0.409574, 0.5625 , 0.582677, 0.640572, 0.307263, - 0.414634, 0.641566, 0.486188, 0.679245, 0.566845, 0.53528 , - 0.48318 , 0.422096, 0.362694, 0.261866, 0.207268, 0.756184, + 0.481013, 0.555556, 0.5, 0.37931, 0.259843, 0.681259, + 0.545455, 0.550725, 0.338583, 0.3085, 0.437318, 0.540123, + 0.464481, 0.597598, 0.481928, 0.488889, 0.565543, 0.54549, + 0.428571, 0.34891, 0.62069, 0.656428, 0.480392, 0.738295, + 0.475, 0.409574, 0.5625, 0.582677, 0.640572, 0.307263, + 0.414634, 0.641566, 0.486188, 0.679245, 0.566845, 0.53528, + 0.48318, 0.422096, 0.362694, 0.261866, 0.207268, 0.756184, 0.431373, 0.781818, 0.479079, 0.505464, 0.318792, 0.209524, - 0.55268 , 0.666667, 0.365482, 
0.745455, 0.662791, 0.670455, - 0.611511, 0.562197, 0.737557, 0.315315, 0.520661, 0.35514 , - 0.476489, 0.310502, 0.511416, 0.642857, 0.378277, 0.44898 , + 0.55268, 0.666667, 0.365482, 0.745455, 0.662791, 0.670455, + 0.611511, 0.562197, 0.737557, 0.315315, 0.520661, 0.35514, + 0.476489, 0.310502, 0.511416, 0.642857, 0.378277, 0.44898, 0.665635, 0.494518, 0.487179, 0.614568, 0.528736, 0.405263, 0.275556, 0.504808, 0.264591, 0.518817, 0.587156, 0.652278, - 0.551429, 0.497797, 0.54646 , 0.546392, 0.409938, 0.447531, - 0.438247, 0.363636, 0.637631, 0.57037 , 0.276042, 0.554479, - 0.56 , 0.367521, 0.348039, 0.259259, 0.210818, 0.485477, - 0.285714, 0.39801 , 0.546921, 0.603306, 0.521739, 0.409812, - 0.509729, 0.349593, 0.54023 , 0.487805, 0.516468, 0.606383, + 0.551429, 0.497797, 0.54646, 0.546392, 0.409938, 0.447531, + 0.438247, 0.363636, 0.637631, 0.57037, 0.276042, 0.554479, + 0.56, 0.367521, 0.348039, 0.259259, 0.210818, 0.485477, + 0.285714, 0.39801, 0.546921, 0.603306, 0.521739, 0.409812, + 0.509729, 0.349593, 0.54023, 0.487805, 0.516468, 0.606383, 0.746606, 0.549133, 0.394265, 0.776722, 0.380762, 0.661348, 0.493056, 0.293147, 0.566265, 0.241573, 0.257956, 0.577519, - 0.563506, 0.694981, 0.550725, 0.48169 , 0.654391, 0.505208, + 0.563506, 0.694981, 0.550725, 0.48169, 0.654391, 0.505208, 0.241158, 0.530488, 0.223785, 0.689655, 0.507246, 0.421731, 0.449541, 0.651007, 0.272727, 0.642412, 0.398098, 0.281588, 0.429379, 0.647059, 0.716667, 0.668966, 0.727811, 0.608355, - 0.63785 , 0.428571, 0.647376, 0.549223, 0.409091, 0.479365, - 0.317333, 0.49373 , 0.706186, 0.357287, 0.534884, 0.603306, + 0.63785, 0.428571, 0.647376, 0.549223, 0.409091, 0.479365, + 0.317333, 0.49373, 0.706186, 0.357287, 0.534884, 0.603306, 0.630435, 0.506944, 0.653846, 0.385714, 0.362126, 0.439252, - 0.61 , 0.367742, 0.474638, 0.37247 , 0.819549, 0.65762 , - 0.66416 , 0.479532, 0.532338, 0.379562, 0.205524, 0.706564, - 0.466667, 0.555556, 0.70801 , 0.719397, 0.333333, 0.791667, + 0.61, 0.367742, 0.474638, 0.37247, 0.819549, 0.65762, + 0.66416, 0.479532, 0.532338, 0.379562, 0.205524, 0.706564, + 0.466667, 0.555556, 0.70801, 0.719397, 0.333333, 0.791667, 0.831707, 0.537081, 0.434343, 0.349398, 0.367537, 0.371939, - 0.78392 , 0.343612, 0.541667, 0.631016, 0.390625, 0.7 , - 0.566524, 0.305 , 0.472868, 0.693182, 0.413534, 0.653696, + 0.78392, 0.343612, 0.541667, 0.631016, 0.390625, 0.7, + 0.566524, 0.305, 0.472868, 0.693182, 0.413534, 0.653696, 0.220264, 0.397959, 0.371069, 0.214674, 0.495114, 0.719298, - 0.351955, 0.213483, 0.289796, 0.515464, 0.30599 , 0.574257, + 0.351955, 0.213483, 0.289796, 0.515464, 0.30599, 0.574257, 0.617883, 0.376226, 0.408582, 0.339426, 0.671815, 0.333333, 0.364179, 0.541126, 0.633188, 0.506045, 0.702512, 0.355014, - 0.494845, 0.66 , 0.55814 , 0.375 , 0.238889, 0.594444, - 0.597701, 0.37234 }; -const std::vector default_neg_data = {0.029661 , 0.00869565, 0.0830769 , 0.0914513 , 0.00950119, - 0.144928 , 0.0286494 , 0.0164141 , 0.0425532 , 0.032345 , - 0.0186514 , 0.0281899 , 0. , 0.0379651 , 0.0427928 , - 0.0571696 , 0.0196319 , 0.00830484, 0.0254545 , 0.0104167 , - 0.0192781 , 0.0434917 , 0.0165426 , 0.0256917 , 0.0288875 , - 0.0231214 , 0.0139555 , 0.0181997 , 0.0180437 , 0.0222222 , - 0.0177083 , 0.0252525 , 0.0381125 , 0.0156128 , 0.048603 , - 0.0373519 , 0.0379747 , 0.0123748 , 0.0380165 , 0.042953 , - 0.0284091 , 0.0450205 , 0.014876 , 0.0477674 , 0.0108873 , - 0.0485075 , 0.0283688 , 0.0397799 , 0.037058 , 0.0195178 , - 0.0285344 , 0.032835 , 0.0380864 , 0.0271024 , 0. 
, - 0.0509554 , 0.0440744 , 0.0226349 , 0.0388787 , 0.0307692 , - 0.0197628 , 0.0183486 , 0.0220588 , 0.0194175 , 0.00157233, - 0.0150637 , 0.0372737 , 0.0365034 , 0.0361446 , 0.0120919 , - 0.0185449 , 0.0505495 , 0.0140728 , 0.039557 , 0.0632184 , - 0.0312891 , 0.0134626 , 0.0209111 , 0.0341297 , 0.0286807 , - 0.0166852 , 0.0230701 , 0.0101138 , 0.0255102 , 0.0425532 , - 0.0276442 , 0.0402415 , 0.0368196 , 0.0164688 , 0.0488798 , - 0.0283768 , 0.0268714 , 0.0496157 , 0.0496806 , 0.0158311 , - 0.0191657 , 0.0465511 , 0.0460564 , 0.0228311 , 0.0272793 , - 0.0475241 , 0.0167959 , 0.0221843 , 0.0322767 , 0.0318091 , - 0.0171265 , 0.0280222 , 0.0237781 , 0.0108481 , 0.0288136 , - 0.0408627 , 0.0212766 , 0.0322148 , 0.0272989 , 0.0347826 , - 0.00402901, 0.0259009 , 0.0351459 , 0.00761905, 0.0261438 , - 0.0463215 , 0.0265372 , 0.0582691 , 0.0325203 , 0.0304878 , - 0.0283688 , 0.0220264 , 0.00813638, 0.0259434 , 0.0219928 , - 0.0142292 , 0.0324519 , 0.0206349 , 0.0231729 , 0.0179541 , - 0.0447154 , 0.0357436 , 0.0221402 , 0.0304124 , 0.0559384 , - 0.00854701, 0.0185567 , 0.0107759 , 0.016129 , 0.0297824 , - 0.0211216 , 0.0352645 , 0.046285 , 0.0254181 , 0.0440367 , - 0.0274964 , 0.0229277 , 0.0542232 , 0.00228833, 0.0342029 , - 0.0397839 , 0.0209068 , 0.0293501 , 0.0262069 , 0.0157853 , - 0.0365854 , 0.026362 , 0.0125953 , 0.0194384 , 0.0250284 , - 0.0417973 , 0.0236829 , 0.00610583, 0.0164331 , 0.026455 , - 0.047619 , 0.0228571 , 0.0458221 , 0.0965909 , 0.0348162 , - 0.0821918 , 0.0198265 , 0.026455 , 0.0153846 , 0.0615977 , - 0.0296296 , 0.034555 , 0.0374805 , 0.0253921 , 0.0186992 , - 0.0233236 , 0.0119363 , 0.0366881 , 0.00961538, 0.0453846 , - 0.0459094 , 0.0391798 , 0.0248075 , 0.0330113 , 0.00591716, - 0.0397306 , 0.0323607 , 0.0278035 , 0.0193237 , 0.0242152 , - 0.00440529, 0.0294826 , 0.0188679 , 0.0543933 , 0.0327363 , - 0.0365233 , 0.0515856 , 0.042086 , 0.012907 , 0.0109664 , - 0.0351605 , 0.0251852 , 0.0494148 , 0.0401606 , 0. 
, - 0.0369686 , 0.0411622 , 0.0257143 , 0.0416667 , 0.0400943 , - 0.0409836 , 0.00296296, 0.0714286 , 0.0270856 , 0.0117521 , - 0.0174472 , 0.0354167 , 0.0237056 , 0.0292683 , 0.0253723 , - 0.025 , 0.0163043 , 0.0229983 , 0.0418502 , 0.0259179 , - 0.0386047 , 0.0355295 , 0.0451613 , 0.0353982 , 0.0310711 , - 0.0250344 , 0.0125604 , 0.0237691 , 0.0406835 , 0.0524476 , - 0.0307692 , 0.0248619 , 0.0264085 , 0.03003 , 0.0219273 , - 0.0214827 , 0.039629 , 0.0393462 , 0.0207164 , 0.0193896 , - 0.0312947 , 0.0485252 , 0.0265004 , 0.0298322 , 0.0178571 , - 0.0333333 , 0.00632911, 0.0328767 , 0.0355556 , 0.0331825 , - 0.0197568 , 0.0413793 , 0.0215983 , 0.0182584 , 0.0211161 , - 0.0290276 , 0.00200803, 0.035284 , 0.0368509 , 0.0279971 , - 0.02334 , 0.0245637 , 0.00180505, 0.0298013 , 0.0173069 , - 0.0167254 , 0.0321569 , 0.0357616 , 0.0438791 , 0.024147 , - 0.0399344 , 0.0285962 , 0.00518623, 0.0311218 , 0.0288498 , - 0.0236686 , 0.012583 , 0.0243531 , 0.0218015 , 0.0121639 , - 0.041868 , 0.0243223 , 0.0260317 , 0.0256233 , 0.0325203 , - 0.024186 , 0.0125116 , 0.0138889 , 0.0221271 , 0.0445293 , - 0.0166667 , 0.0292373 , 0.00485437, 0.0443599 , 0.0174766 , - 0.023511 , 0.0411594 , 0.0189873 , 0.0137931 , 0.0388889 , - 0.0490798 , 0.0394737 , 0.0539568 , 0.0342466 , 0.0679612 , - 0.0250696 , 0.03125 , 0.0365034 , 0.0265252 , 0.060241 , - 0.0166008 , 0.0295602 , 0.0162665 , 0.0100925 , 0.0506757 , - 0.0150754 , 0.0737705 , 0.0371367 , 0.026362 , 0.0289958 , - 0.0297619 , 0.0301003 , 0.0219895 , 0.0134804 , 0.0175439 , - 0.0443319 , 0.0137457 , 0.0286561 , 0.0546875 , 0.0655738 , - 0.0326741 , 0.0251232 , 0.0363743 , 0.0313901 , 0.0442404 , - 0.0262935 , 0.019601 , 0.0291751 , 0.0316978 , 0.0287356 , - 0.0226586 , 0.0341207 , 0.00504356, 0.0359589 , 0.0232751 , - 0.0324374 , 0.0125962 , 0.0216401 , 0.0282589 , 0.0180505 , - 0.0345912 , 0.0228188 , 0.0260374 , 0.0195787 , 0.00486618, - 0.0508732 , 0.0280469 , 0.0224719 , 0.0320856 , 0.0444444 , - 0.0365535 , 0.018797 , 0.0759494 , 0.072 , 0.0235199 , - 0.0193548 , 0.0453202 , 0.0414972 , 0.019802 , 0.019802 , - 0.0317003 , 0.0341047 , 0.025544 , 0.0518672 , 0.0271859 , - 0.0528355 , 0.02398 , 0.030303 , 0.0219146 , 0.0397456 , - 0.010582 , 0.0218182 , 0.010846 , 0.0348174 , 0.032967 , - 0.011117 , 0.0404494 , 0.0176829 , 0.052 , 0.0307692 , - 0.0404355 , 0.0392216 , 0.00652667, 0.0440465 , 0.0325444 , - 0.0322581 , 0.0220096 , 0.030146 , 0.0356802 , 0.0349938 , - 0.0285036 , 0.0176194 , 0.0571429 , 0.0108509 , 0.0279416 , - 0.0334225 , 0.0136482 , 0.0535294 , 0.0213592 , 0.027933 , - 0.0296296 , 0.0430925 , 0.0200236 , 0.0278884 , 0.0463054 , - 0.0368098 , 0.0205479 , 0.027833 , 0.0247748 , 0.0292398 , - 0.0149489 , 0.0326087 , 0.024183 , 0.0265487 , 0.0101597 , - 0.028125 , 0.0333333 , 0.024296 , 0.0395074 , 0.0255272 , - 0.0282575 , 0.0175867 , 0.0225443 , 0.0253317 , 0.0226131 , - 0.0478723 , 0.0238837 , 0.0229253 , 0.0208153 , 0.0377277 , - 0.0350765 , 0.0526316 , 0.0409556 , 0.0325581 , 0.0384906 , - 0.0438909 , 0.0250765 , 0.0383352 , 0.021057 , 0.0140392 , - 0.0248447 , 0.0277778 , 0.0128023 , 0.0438523 , 0.0338753 , - 0.0164701 , 0.0372493 , 0.0367422 , 0.0209832 , 0.0272031 , - 0.0305114 , 0.0208228 , 0.0246575 , 0.0269278 , 0.034039 , - 0.0348558 , 0.0205266 , 0.0447761 , 0.0470446 , 0.028169 , - 0.0107239 , 0.021978 , 0.00425532, 0.0169492 , 0.0470163 , - 0.0326693 , 0.0257069 , 0.0371163 , 0.0258519 , 0.0670444 , - 0.0401786 , 0.00708215, 0.0335475 , 0.030137 , 0.0210084}; - -void setup_dehost_subcommand(CLI::App& app) -{ + 0.494845, 0.66, 
0.55814, 0.375, 0.238889, 0.594444, + 0.597701, 0.37234}; +const std::vector default_neg_data = {0.029661, 0.00869565, 0.0830769, 0.0914513, 0.00950119, + 0.144928, 0.0286494, 0.0164141, 0.0425532, 0.032345, + 0.0186514, 0.0281899, 0., 0.0379651, 0.0427928, + 0.0571696, 0.0196319, 0.00830484, 0.0254545, 0.0104167, + 0.0192781, 0.0434917, 0.0165426, 0.0256917, 0.0288875, + 0.0231214, 0.0139555, 0.0181997, 0.0180437, 0.0222222, + 0.0177083, 0.0252525, 0.0381125, 0.0156128, 0.048603, + 0.0373519, 0.0379747, 0.0123748, 0.0380165, 0.042953, + 0.0284091, 0.0450205, 0.014876, 0.0477674, 0.0108873, + 0.0485075, 0.0283688, 0.0397799, 0.037058, 0.0195178, + 0.0285344, 0.032835, 0.0380864, 0.0271024, 0., + 0.0509554, 0.0440744, 0.0226349, 0.0388787, 0.0307692, + 0.0197628, 0.0183486, 0.0220588, 0.0194175, 0.00157233, + 0.0150637, 0.0372737, 0.0365034, 0.0361446, 0.0120919, + 0.0185449, 0.0505495, 0.0140728, 0.039557, 0.0632184, + 0.0312891, 0.0134626, 0.0209111, 0.0341297, 0.0286807, + 0.0166852, 0.0230701, 0.0101138, 0.0255102, 0.0425532, + 0.0276442, 0.0402415, 0.0368196, 0.0164688, 0.0488798, + 0.0283768, 0.0268714, 0.0496157, 0.0496806, 0.0158311, + 0.0191657, 0.0465511, 0.0460564, 0.0228311, 0.0272793, + 0.0475241, 0.0167959, 0.0221843, 0.0322767, 0.0318091, + 0.0171265, 0.0280222, 0.0237781, 0.0108481, 0.0288136, + 0.0408627, 0.0212766, 0.0322148, 0.0272989, 0.0347826, + 0.00402901, 0.0259009, 0.0351459, 0.00761905, 0.0261438, + 0.0463215, 0.0265372, 0.0582691, 0.0325203, 0.0304878, + 0.0283688, 0.0220264, 0.00813638, 0.0259434, 0.0219928, + 0.0142292, 0.0324519, 0.0206349, 0.0231729, 0.0179541, + 0.0447154, 0.0357436, 0.0221402, 0.0304124, 0.0559384, + 0.00854701, 0.0185567, 0.0107759, 0.016129, 0.0297824, + 0.0211216, 0.0352645, 0.046285, 0.0254181, 0.0440367, + 0.0274964, 0.0229277, 0.0542232, 0.00228833, 0.0342029, + 0.0397839, 0.0209068, 0.0293501, 0.0262069, 0.0157853, + 0.0365854, 0.026362, 0.0125953, 0.0194384, 0.0250284, + 0.0417973, 0.0236829, 0.00610583, 0.0164331, 0.026455, + 0.047619, 0.0228571, 0.0458221, 0.0965909, 0.0348162, + 0.0821918, 0.0198265, 0.026455, 0.0153846, 0.0615977, + 0.0296296, 0.034555, 0.0374805, 0.0253921, 0.0186992, + 0.0233236, 0.0119363, 0.0366881, 0.00961538, 0.0453846, + 0.0459094, 0.0391798, 0.0248075, 0.0330113, 0.00591716, + 0.0397306, 0.0323607, 0.0278035, 0.0193237, 0.0242152, + 0.00440529, 0.0294826, 0.0188679, 0.0543933, 0.0327363, + 0.0365233, 0.0515856, 0.042086, 0.012907, 0.0109664, + 0.0351605, 0.0251852, 0.0494148, 0.0401606, 0., + 0.0369686, 0.0411622, 0.0257143, 0.0416667, 0.0400943, + 0.0409836, 0.00296296, 0.0714286, 0.0270856, 0.0117521, + 0.0174472, 0.0354167, 0.0237056, 0.0292683, 0.0253723, + 0.025, 0.0163043, 0.0229983, 0.0418502, 0.0259179, + 0.0386047, 0.0355295, 0.0451613, 0.0353982, 0.0310711, + 0.0250344, 0.0125604, 0.0237691, 0.0406835, 0.0524476, + 0.0307692, 0.0248619, 0.0264085, 0.03003, 0.0219273, + 0.0214827, 0.039629, 0.0393462, 0.0207164, 0.0193896, + 0.0312947, 0.0485252, 0.0265004, 0.0298322, 0.0178571, + 0.0333333, 0.00632911, 0.0328767, 0.0355556, 0.0331825, + 0.0197568, 0.0413793, 0.0215983, 0.0182584, 0.0211161, + 0.0290276, 0.00200803, 0.035284, 0.0368509, 0.0279971, + 0.02334, 0.0245637, 0.00180505, 0.0298013, 0.0173069, + 0.0167254, 0.0321569, 0.0357616, 0.0438791, 0.024147, + 0.0399344, 0.0285962, 0.00518623, 0.0311218, 0.0288498, + 0.0236686, 0.012583, 0.0243531, 0.0218015, 0.0121639, + 0.041868, 0.0243223, 0.0260317, 0.0256233, 0.0325203, + 0.024186, 0.0125116, 0.0138889, 0.0221271, 0.0445293, + 0.0166667, 
0.0292373, 0.00485437, 0.0443599, 0.0174766, + 0.023511, 0.0411594, 0.0189873, 0.0137931, 0.0388889, + 0.0490798, 0.0394737, 0.0539568, 0.0342466, 0.0679612, + 0.0250696, 0.03125, 0.0365034, 0.0265252, 0.060241, + 0.0166008, 0.0295602, 0.0162665, 0.0100925, 0.0506757, + 0.0150754, 0.0737705, 0.0371367, 0.026362, 0.0289958, + 0.0297619, 0.0301003, 0.0219895, 0.0134804, 0.0175439, + 0.0443319, 0.0137457, 0.0286561, 0.0546875, 0.0655738, + 0.0326741, 0.0251232, 0.0363743, 0.0313901, 0.0442404, + 0.0262935, 0.019601, 0.0291751, 0.0316978, 0.0287356, + 0.0226586, 0.0341207, 0.00504356, 0.0359589, 0.0232751, + 0.0324374, 0.0125962, 0.0216401, 0.0282589, 0.0180505, + 0.0345912, 0.0228188, 0.0260374, 0.0195787, 0.00486618, + 0.0508732, 0.0280469, 0.0224719, 0.0320856, 0.0444444, + 0.0365535, 0.018797, 0.0759494, 0.072, 0.0235199, + 0.0193548, 0.0453202, 0.0414972, 0.019802, 0.019802, + 0.0317003, 0.0341047, 0.025544, 0.0518672, 0.0271859, + 0.0528355, 0.02398, 0.030303, 0.0219146, 0.0397456, + 0.010582, 0.0218182, 0.010846, 0.0348174, 0.032967, + 0.011117, 0.0404494, 0.0176829, 0.052, 0.0307692, + 0.0404355, 0.0392216, 0.00652667, 0.0440465, 0.0325444, + 0.0322581, 0.0220096, 0.030146, 0.0356802, 0.0349938, + 0.0285036, 0.0176194, 0.0571429, 0.0108509, 0.0279416, + 0.0334225, 0.0136482, 0.0535294, 0.0213592, 0.027933, + 0.0296296, 0.0430925, 0.0200236, 0.0278884, 0.0463054, + 0.0368098, 0.0205479, 0.027833, 0.0247748, 0.0292398, + 0.0149489, 0.0326087, 0.024183, 0.0265487, 0.0101597, + 0.028125, 0.0333333, 0.024296, 0.0395074, 0.0255272, + 0.0282575, 0.0175867, 0.0225443, 0.0253317, 0.0226131, + 0.0478723, 0.0238837, 0.0229253, 0.0208153, 0.0377277, + 0.0350765, 0.0526316, 0.0409556, 0.0325581, 0.0384906, + 0.0438909, 0.0250765, 0.0383352, 0.021057, 0.0140392, + 0.0248447, 0.0277778, 0.0128023, 0.0438523, 0.0338753, + 0.0164701, 0.0372493, 0.0367422, 0.0209832, 0.0272031, + 0.0305114, 0.0208228, 0.0246575, 0.0269278, 0.034039, + 0.0348558, 0.0205266, 0.0447761, 0.0470446, 0.028169, + 0.0107239, 0.021978, 0.00425532, 0.0169492, 0.0470163, + 0.0326693, 0.0257069, 0.0371163, 0.0258519, 0.0670444, + 0.0401786, 0.00708215, 0.0335475, 0.030137, 0.0210084}; + +void setup_dehost_subcommand(CLI::App &app) { auto opt = std::make_shared(); - auto* dehost_subcommand = app.add_subcommand( - "dehost", "Dehost read file into host and other using index."); + auto *dehost_subcommand = app.add_subcommand( + "dehost", "Dehost read file into host and other using index."); dehost_subcommand->add_option("", opt->read_file, "Fasta/q file") - ->required() - ->transform(make_absolute) - ->check(CLI::ExistingFile.description("")) - ->type_name("FILE"); + ->required() + ->transform(make_absolute) + ->check(CLI::ExistingFile.description("")) + ->type_name("FILE"); dehost_subcommand->add_option("", opt->read_file2, "Paired Fasta/q file") ->transform(make_absolute) @@ -227,7 +226,8 @@ void setup_dehost_subcommand(CLI::App& app) ->required() ->check(CLI::ExistingPath.description("")); - dehost_subcommand->add_option("-e,--extract", opt->category_to_extract, "Reads from this category in the index will be extracted to file.") + dehost_subcommand->add_option("-e,--extract", opt->category_to_extract, + "Reads from this category in the index will be extracted to file.") ->type_name("STRING"); dehost_subcommand->add_option("-p,--prefix", opt->prefix, "Prefix for the output files.") @@ -236,17 +236,20 @@ void setup_dehost_subcommand(CLI::App& app) ->default_str(""); dehost_subcommand - ->add_option("--chunk_size", opt->chunk_size, 
"Read file is read in chunks of this size, to be processed in parallel within a chunk.") + ->add_option("--chunk_size", opt->chunk_size, + "Read file is read in chunks of this size, to be processed in parallel within a chunk.") ->type_name("INT") ->capture_default_str(); dehost_subcommand - ->add_option("--lo_hi_threshold", opt->lo_hi_threshold, "Threshold used during model fitting stage to decide if read should be used to train lo or hi distribution.") + ->add_option("--lo_hi_threshold", opt->lo_hi_threshold, + "Threshold used during model fitting stage to decide if read should be used to train lo or hi distribution.") ->type_name("FLOAT") ->capture_default_str(); dehost_subcommand - ->add_option("--num_reads_to_fit", opt->num_reads_to_fit, "Number of reads to use to train each distribution in the model.") + ->add_option("--num_reads_to_fit", opt->num_reads_to_fit, + "Number of reads to use to train each distribution in the model.") ->type_name("INT") ->capture_default_str(); @@ -264,26 +267,31 @@ void setup_dehost_subcommand(CLI::App& app) ->capture_default_str(); dehost_subcommand - ->add_option("--min_compression", opt->min_compression, "Minimum read gzip compression ratio to classify (a measure of how much information is in the read.") + ->add_option("--min_compression", opt->min_compression, + "Minimum read gzip compression ratio to classify (a measure of how much information is in the read.") ->type_name("FLOAT") ->capture_default_str(); dehost_subcommand - ->add_option("--confidence", opt->confidence_threshold, "Minimum difference between the top 2 unique hit counts.") + ->add_option("--confidence", opt->confidence_threshold, + "Minimum difference between the top 2 unique hit counts.") ->type_name("INT") ->capture_default_str(); dehost_subcommand - ->add_option("--host_unique_prop_lo_threshold", opt->host_unique_prop_lo_threshold, "Require non-host reads to have unique host proportion below this threshold for classification.") + ->add_option("--host_unique_prop_lo_threshold", opt->host_unique_prop_lo_threshold, + "Require non-host reads to have unique host proportion below this threshold for classification.") ->type_name("INT") ->capture_default_str(); dehost_subcommand - ->add_option("--min_proportion_diff", opt->min_proportion_difference, "Minimum difference between the proportion of (non-unique) kmers found in each category.") + ->add_option("--min_proportion_diff", opt->min_proportion_difference, + "Minimum difference between the proportion of (non-unique) kmers found in each category.") ->type_name("FLOAT") ->capture_default_str(); dehost_subcommand - ->add_option("--min_probability_diff", opt->min_prob_difference, "Minimum difference between the probability found in each category.") + ->add_option("--min_probability_diff", opt->min_prob_difference, + "Minimum difference between the probability found in each category.") ->type_name("FLOAT") ->capture_default_str(); @@ -297,16 +305,17 @@ void setup_dehost_subcommand(CLI::App& app) ->capture_default_str(); dehost_subcommand->add_flag( - "-v", opt->verbosity, "Verbosity of logging. Repeat for increased verbosity"); + "-v", opt->verbosity, "Verbosity of logging. Repeat for increased verbosity"); // Set the function that will be called when this subcommand is issued. 
dehost_subcommand->callback([opt]() { dehost_main(*opt); }); } -void dehost_reads(const DehostArguments& opt, const Index& index){ +void dehost_reads(const DehostArguments &opt, const Index &index) { PLOG_INFO << "Dehosting file " << opt.read_file; - auto hash_adaptor = seqan3::views::minimiser_hash(seqan3::shape{seqan3::ungapped{index.kmer_size()}}, seqan3::window_size{index.window_size()}); + auto hash_adaptor = seqan3::views::minimiser_hash(seqan3::shape{seqan3::ungapped{index.kmer_size()}}, + seqan3::window_size{index.window_size()}); PLOG_VERBOSE << "Defined hash_adaptor"; auto agent = index.agent(); @@ -323,41 +332,40 @@ void dehost_reads(const DehostArguments& opt, const Index& index){ PLOG_DEBUG << "Defined Result with " << +index.num_bins() << " bins"; - for (auto && chunk : fin | seqan3::views::chunk(opt.chunk_size)) - { + for (auto &&chunk: fin | seqan3::views::chunk(opt.chunk_size)) { // You can use a for loop: - for (auto & record : chunk) - { + for (auto &record: chunk) { records.push_back(std::move(record)); } #pragma omp parallel for firstprivate(agent, hash_adaptor) num_threads(opt.threads) shared(result) - for (auto i=0; i std::numeric_limits::max()){ + if (read_length > std::numeric_limits::max()) { PLOG_WARNING << "Ignoring read " << record.id() << " as too long!"; continue; } - if (read_length == 0){ + if (read_length == 0) { PLOG_WARNING << "Ignoring read " << record.id() << " as has zero length!"; continue; } - auto qualities = record.base_qualities() | std::views::transform( [](auto quality) { return seqan3::to_phred(quality); }); + auto qualities = record.base_qualities() | + std::views::transform([](auto quality) { return seqan3::to_phred(quality); }); auto sum = std::accumulate(qualities.begin(), qualities.end(), 0); float mean_quality = 0; if (std::ranges::size(qualities) > 0) - mean_quality = static_cast< float >( sum )/ static_cast< float >(std::ranges::size(qualities)); + mean_quality = static_cast< float >( sum ) / static_cast< float >(std::ranges::size(qualities)); PLOG_VERBOSE << "Mean quality of read " << record.id() << " is " << mean_quality; float compression_ratio = get_compression_ratio(sequence_to_string(record.sequence())); PLOG_VERBOSE << "Found compression ratio of read " << record.id() << " is " << compression_ratio; auto read = ReadEntry(read_id, read_length, mean_quality, compression_ratio, result.input_summary()); - for (auto && value : record.sequence() | hash_adaptor) { - const auto & entry = agent.bulk_contains(value); + for (auto &&value: record.sequence() | hash_adaptor) { + const auto &entry = agent.bulk_contains(value); read.update_entry(entry); } PLOG_VERBOSE << "Finished adding raw hash counts for read " << read_id; @@ -373,10 +381,11 @@ void dehost_reads(const DehostArguments& opt, const Index& index){ } -void dehost_paired_reads(const DehostArguments& opt, const Index& index){ +void dehost_paired_reads(const DehostArguments &opt, const Index &index) { PLOG_INFO << "Dehosting files " << opt.read_file << " and " << opt.read_file2; - auto hash_adaptor = seqan3::views::minimiser_hash(seqan3::shape{seqan3::ungapped{index.kmer_size()}}, seqan3::window_size{index.window_size()}); + auto hash_adaptor = seqan3::views::minimiser_hash(seqan3::shape{seqan3::ungapped{index.kmer_size()}}, + seqan3::window_size{index.window_size()}); PLOG_VERBOSE << "Defined hash_adaptor"; auto agent = index.agent(); @@ -395,50 +404,50 @@ void dehost_paired_reads(const DehostArguments& opt, const Index& index){ PLOG_DEBUG << "Defined Result with " << 
+index.num_bins() << " bins"; - for (auto && chunk : fin1 | seqan3::views::chunk(opt.chunk_size)) - { - for (auto & record : chunk) - { + for (auto &&chunk: fin1 | seqan3::views::chunk(opt.chunk_size)) { + for (auto &record: chunk) { records1.push_back(std::move(record)); } // loop in the second file and get same amount of reads - for ( auto& record2 : fin2 | std::views::take( opt.chunk_size ) ) - { - records2.push_back( std::move( record2 )); + for (auto &record2: fin2 | std::views::take(opt.chunk_size)) { + records2.push_back(std::move(record2)); } #pragma omp parallel for firstprivate(agent, hash_adaptor) num_threads(opt.threads) shared(result) - for (auto i=0; i std::numeric_limits::max()){ + if (read_length > std::numeric_limits::max()) { PLOG_WARNING << "Ignoring read " << record1.id() << " as too long!"; continue; } - if (read_length == 0){ + if (read_length == 0) { PLOG_WARNING << "Ignoring read " << record1.id() << " as has zero length!"; continue; } - auto qualities1 = record1.base_qualities() | std::views::transform( [](auto quality) { return seqan3::to_phred(quality); }); - auto qualities2 = record2.base_qualities() | std::views::transform( [](auto quality) { return seqan3::to_phred(quality); }); + auto qualities1 = record1.base_qualities() | + std::views::transform([](auto quality) { return seqan3::to_phred(quality); }); + auto qualities2 = record2.base_qualities() | + std::views::transform([](auto quality) { return seqan3::to_phred(quality); }); auto sum = std::accumulate(qualities1.begin(), qualities1.end(), 0); sum = std::accumulate(qualities2.begin(), qualities2.end(), sum); float mean_quality = 0; if (std::ranges::size(qualities1) + std::ranges::size(qualities2) > 0) - mean_quality = static_cast< float >( sum )/ static_cast< float >(std::ranges::size(qualities1) + std::ranges::size(qualities2)); + mean_quality = static_cast< float >( sum ) / + static_cast< float >(std::ranges::size(qualities1) + std::ranges::size(qualities2)); PLOG_VERBOSE << "Mean quality of read " << record1.id() << " is " << mean_quality; auto combined_record = sequence_to_string(record1.sequence()) + sequence_to_string(record2.sequence()); @@ -446,12 +455,12 @@ void dehost_paired_reads(const DehostArguments& opt, const Index& index){ PLOG_VERBOSE << "Found compression ratio of read " << record1.id() << " is " << compression_ratio; auto read = ReadEntry(read_id, read_length, mean_quality, compression_ratio, result.input_summary()); - for (auto && value : record1.sequence() | hash_adaptor) { - const auto & entry = agent.bulk_contains(value); + for (auto &&value: record1.sequence() | hash_adaptor) { + const auto &entry = agent.bulk_contains(value); read.update_entry(entry); } - for (auto && value : record2.sequence() | hash_adaptor) { - const auto & entry = agent.bulk_contains(value); + for (auto &&value: record2.sequence() | hash_adaptor) { + const auto &entry = agent.bulk_contains(value); read.update_entry(entry); } PLOG_VERBOSE << "Finished adding raw hash counts for read " << read_id; @@ -468,8 +477,7 @@ void dehost_paired_reads(const DehostArguments& opt, const Index& index){ } -int dehost_main(DehostArguments & opt) -{ +int dehost_main(DehostArguments &opt) { auto log_level = plog::info; if (opt.verbosity == 1) { log_level = plog::debug; @@ -497,14 +505,14 @@ int dehost_main(DehostArguments & opt) opt.run_extract = (opt.category_to_extract != ""); const auto categories = index.categories(); - if (opt.run_extract and opt.category_to_extract != "all" and std::find(categories.begin(), 
categories.end(), opt.category_to_extract) == categories.end()) - { + if (opt.run_extract and opt.category_to_extract != "all" and + std::find(categories.begin(), categories.end(), opt.category_to_extract) == categories.end()) { std::string options = ""; for (auto i: categories) options += i + " "; PLOG_ERROR << "Cannot extract " << opt.category_to_extract << ", please chose one of [ all " << options << "]"; return 1; - } else if (opt.run_extract){ + } else if (opt.run_extract) { if (opt.prefix == "") opt.prefix = "charon"; std::vector to_extract; @@ -514,19 +522,20 @@ int dehost_main(DehostArguments & opt) to_extract.push_back(opt.category_to_extract); const auto extension = get_extension(opt.read_file); - for (const auto & category : to_extract){ + for (const auto &category: to_extract) { const auto category_index = index.get_category_index(category); if (opt.is_paired) { - opt.extract_category_to_file[category_index].push_back(opt.prefix + "_" + category + "_1" + extension + ".gz"); - opt.extract_category_to_file[category_index].push_back(opt.prefix + "_" + category + "_2" + extension + ".gz"); + opt.extract_category_to_file[category_index].push_back( + opt.prefix + "_" + category + "_1" + extension + ".gz"); + opt.extract_category_to_file[category_index].push_back( + opt.prefix + "_" + category + "_2" + extension + ".gz"); } else { opt.extract_category_to_file[category_index].push_back(opt.prefix + "_" + category + extension + ".gz"); } } } - if (opt.dist != "gamma" and opt.dist != "beta" and opt.dist != "kde") - { + if (opt.dist != "gamma" and opt.dist != "beta" and opt.dist != "kde") { PLOG_ERROR << "Supported distributions are [gamma , beta, kde]"; return 1; } diff --git a/src/index_main.cpp b/src/index_main.cpp index 0b89d6e..ed5dba4 100644 --- a/src/index_main.cpp +++ b/src/index_main.cpp @@ -20,37 +20,37 @@ #include -void setup_index_subcommand(CLI::App& app) -{ +void setup_index_subcommand(CLI::App &app) { auto opt = std::make_shared(); - auto* index_subcommand = app.add_subcommand( - "index", "Build an index (IBF) for a number of references split into a small number of bins."); + auto *index_subcommand = app.add_subcommand( + "index", "Build an index (IBF) for a number of references split into a small number of bins."); - index_subcommand->add_option("", opt->input_file, "Tab separated file with columns for filename and category") - ->required() - ->transform(make_absolute) - ->check(CLI::ExistingFile.description("")) - ->type_name("FILE"); + index_subcommand->add_option("", opt->input_file, + "Tab separated file with columns for filename and category") + ->required() + ->transform(make_absolute) + ->check(CLI::ExistingFile.description("")) + ->type_name("FILE"); index_subcommand - ->add_option( - "-w", opt->window_size, "Window size for (w,k,s)-minimers (must be <=k).") - ->type_name("INT") - ->capture_default_str(); + ->add_option( + "-w", opt->window_size, "Window size for (w,k,s)-minimers (must be <=k).") + ->type_name("INT") + ->capture_default_str(); index_subcommand->add_option("-k", opt->kmer_size, "K-mer size for (w,k,s)-minimers.") - ->type_name("INT") - ->capture_default_str(); + ->type_name("INT") + ->capture_default_str(); index_subcommand - ->add_option("-t,--threads", opt->threads, "Maximum number of threads to use.") - ->type_name("INT") - ->capture_default_str(); + ->add_option("-t,--threads", opt->threads, "Maximum number of threads to use.") + ->type_name("INT") + ->capture_default_str(); index_subcommand->add_option("-p,--prefix", opt->prefix, "Prefix for 
the output index.") - ->type_name("FILE") - ->check(CLI::NonexistentPath.description("")) - ->default_str(""); + ->type_name("FILE") + ->check(CLI::NonexistentPath.description("")) + ->default_str(""); index_subcommand->add_option("--temp", opt->tmp_dir, "Temporary directory for index construction files.") ->type_name("DIR") @@ -64,7 +64,7 @@ void setup_index_subcommand(CLI::App& app) "--optimize", opt->optimize, "Compress the number of bins for improved classification run time"); index_subcommand->add_flag( - "-v", opt->verbosity, "Verbosity of logging. Repeat for increased verbosity"); + "-v", opt->verbosity, "Verbosity of logging. Repeat for increased verbosity"); // Set the function that will be called when this subcommand is issued. @@ -72,7 +72,7 @@ void setup_index_subcommand(CLI::App& app) } -InputSummary parse_input_file(const std::filesystem::path& input_file){ +InputSummary parse_input_file(const std::filesystem::path &input_file) { PLOG_INFO << "Parsing input file " << input_file; InputSummary summary; uint8_t next_bin = 0; @@ -86,8 +86,7 @@ InputSummary parse_input_file(const std::filesystem::path& input_file){ } std::string line; - while (std::getline(input_ifstream, line)) - { + while (std::getline(input_ifstream, line)) { if (!line.empty()) { auto parts = split(line, "\t"); if (parts.size() >= 2) { @@ -96,8 +95,9 @@ InputSummary parse_input_file(const std::filesystem::path& input_file){ summary.bin_to_category[next_bin] = name; categories.insert(name); summary.filepath_to_bin.emplace_back(std::make_pair(path, next_bin)); - if (next_bin == std::numeric_limits::max()){ - PLOG_WARNING << "User has reached the maximum number of files which is " << +next_bin << " - ignoring any additional lines!"; + if (next_bin == std::numeric_limits::max()) { + PLOG_WARNING << "User has reached the maximum number of files which is " << +next_bin + << " - ignoring any additional lines!"; break; } next_bin++; @@ -109,15 +109,16 @@ InputSummary parse_input_file(const std::filesystem::path& input_file){ summary.num_bins = next_bin; summary.categories.insert(summary.categories.end(), categories.begin(), categories.end()); - PLOG_INFO << "Found " << summary.filepath_to_bin.size() << " files corresponding to " << +summary.num_categories() << " categories"; + PLOG_INFO << "Found " << summary.filepath_to_bin.size() << " files corresponding to " << +summary.num_categories() + << " categories"; return summary; } -InputStats count_and_store_hashes(const IndexArguments& opt, const InputSummary& summary) -{ +InputStats count_and_store_hashes(const IndexArguments &opt, const InputSummary &summary) { PLOG_INFO << "Extracting hashes from files"; - const auto hash_adaptor = seqan3::views::minimiser_hash(seqan3::shape{seqan3::ungapped{opt.kmer_size}}, seqan3::window_size{opt.window_size}); + const auto hash_adaptor = seqan3::views::minimiser_hash(seqan3::shape{seqan3::ungapped{opt.kmer_size}}, + seqan3::window_size{opt.window_size}); PLOG_DEBUG << "Defined hash adaptor"; InputStats stats; PLOG_DEBUG << "Defined stats"; @@ -126,9 +127,9 @@ InputStats count_and_store_hashes(const IndexArguments& opt, const InputSummary& PLOG_INFO << "Maximum hashes permitted per bin for fpr rate " << opt.max_fpr << " is " << max_num_hashes; #pragma omp parallel for num_threads(opt.threads) - for (const auto pair : summary.filepath_to_bin) { - const auto& fasta_file = pair.first; - const auto& bin = pair.second; + for (const auto pair: summary.filepath_to_bin) { + const auto &fasta_file = pair.first; + const auto &bin = 
pair.second; stats.records_per_bin[bin] += 0; PLOG_DEBUG << "Adding file " << fasta_file; @@ -138,33 +139,35 @@ InputStats count_and_store_hashes(const IndexArguments& opt, const InputSummary& auto record_count = 0; std::unordered_set hashes; - for (const auto & record : fin){ + for (const auto &record: fin) { stats.records_per_bin[bin] += 1; const auto mh = record.sequence() | hash_adaptor | std::views::common; - hashes.insert( mh.begin(), mh.end() ); + hashes.insert(mh.begin(), mh.end()); record_count++; } - store_hashes( std::to_string(bin), hashes, opt.tmp_dir ); + store_hashes(std::to_string(bin), hashes, opt.tmp_dir); stats.hashes_per_bin[bin] += hashes.size(); - PLOG_INFO << "Added file " << fasta_file << " with " << record_count << " records and " << hashes.size() << " hashes to bin " << +bin; + PLOG_INFO << "Added file " << fasta_file << " with " << record_count << " records and " << hashes.size() + << " hashes to bin " << +bin; - if (stats.hashes_per_bin[bin] > max_num_hashes){ - PLOG_WARNING << "File " << fasta_file << " with " << hashes.size() << " will exceed max_fpr " << opt.max_fpr; + if (stats.hashes_per_bin[bin] > max_num_hashes) { + PLOG_WARNING + << "File " << fasta_file << " with " << hashes.size() << " will exceed max_fpr " << opt.max_fpr; } } return stats; } -std::unordered_map> optimize_layout(const IndexArguments& opt, InputSummary& summary, InputStats& stats) -{ +std::unordered_map> +optimize_layout(const IndexArguments &opt, InputSummary &summary, InputStats &stats) { std::unordered_map bin_to_bucket_map; std::unordered_map> bucket_to_bins_map; if (stats.hashes_per_bin.size() == 0) { return bucket_to_bins_map; - } else if (not opt.optimize){ - for ( auto bin=0; bin> optimize_layout(const IndexArg // update stats PLOG_INFO << "Reassign bins"; - for (const auto & pair : sorted_pairs){ - const auto& bin = pair.first; + for (const auto &pair: sorted_pairs) { + const auto &bin = pair.first; PLOG_DEBUG << "Reassign bin " << +bin; - assert( summary.bin_to_category.find(bin) != summary.bin_to_category.end() ); - const auto& category = summary.bin_to_category.at(bin); - const auto& num_hashes = pair.second; + assert(summary.bin_to_category.find(bin) != summary.bin_to_category.end()); + const auto &category = summary.bin_to_category.at(bin); + const auto &num_hashes = pair.second; auto assigned_bucket = next_bin; - if (last_bin.find(category) != last_bin.end()){ - const auto& last_bin_in_category = last_bin[category]; - const auto& num_hashes_in_last_bin = new_stats.hashes_per_bin[last_bin_in_category]; - if (num_hashes_in_last_bin + num_hashes < max_num_hashes){ + if (last_bin.find(category) != last_bin.end()) { + const auto &last_bin_in_category = last_bin[category]; + const auto &num_hashes_in_last_bin = new_stats.hashes_per_bin[last_bin_in_category]; + if (num_hashes_in_last_bin + num_hashes < max_num_hashes) { assigned_bucket = last_bin_in_category; } else { next_bin++; @@ -207,8 +210,9 @@ std::unordered_map> optimize_layout(const IndexArg bin_to_bucket_map[bin] = assigned_bucket; bucket_to_bins_map[assigned_bucket].push_back(bin); new_stats.hashes_per_bin[assigned_bucket] += num_hashes; - PLOG_INFO << "Bin " << +bin << " assigned to bucket " << +assigned_bucket << " which now has " << new_stats.hashes_per_bin[assigned_bucket] << " hashes"; - assert( stats.records_per_bin.find(bin) != stats.records_per_bin.end() ); + PLOG_INFO << "Bin " << +bin << " assigned to bucket " << +assigned_bucket << " which now has " + << new_stats.hashes_per_bin[assigned_bucket] << " 
hashes"; + assert(stats.records_per_bin.find(bin) != stats.records_per_bin.end()); new_stats.records_per_bin[assigned_bucket] += stats.records_per_bin.at(bin); } stats = new_stats; @@ -217,13 +221,13 @@ std::unordered_map> optimize_layout(const IndexArg PLOG_INFO << "Update summary"; new_summary.num_bins = next_bin; new_summary.categories = summary.categories; - for (const auto & pair : summary.filepath_to_bin){ - const auto & filepath = pair.first; - const auto & bin = pair.second; - PLOG_DEBUG << "Update summary for bin " << +bin; - const auto & bucket = bin_to_bucket_map[bin]; + for (const auto &pair: summary.filepath_to_bin) { + const auto &filepath = pair.first; + const auto &bin = pair.second; + PLOG_DEBUG << "Update summary for bin " << +bin; + const auto &bucket = bin_to_bucket_map[bin]; new_summary.filepath_to_bin.emplace_back(std::make_pair(filepath, bucket)); - assert( summary.bin_to_category.find(bin) != summary.bin_to_category.end() ); + assert(summary.bin_to_category.find(bin) != summary.bin_to_category.end()); new_summary.bin_to_category[bucket] = summary.bin_to_category.at(bin); } summary = new_summary; @@ -231,36 +235,34 @@ std::unordered_map> optimize_layout(const IndexArg return bucket_to_bins_map; } -Index build_index(const IndexArguments& opt, const InputSummary& summary, InputStats& stats, const std::unordered_map>& bucket_to_bins_map) -{ +Index build_index(const IndexArguments &opt, const InputSummary &summary, InputStats &stats, + const std::unordered_map> &bucket_to_bins_map) { const auto max_num_hashes = stats.max_num_hashes(); const auto num_bits = bin_size_in_bits(opt, max_num_hashes); PLOG_INFO << "Create new IBF with " << +summary.num_bins << " bins and " << +num_bits << " bits"; seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{summary.num_bins}, - seqan3::bin_size{num_bits}, - seqan3::hash_function_count{opt.num_hash}}; + seqan3::bin_size{num_bits}, + seqan3::hash_function_count{opt.num_hash}}; #pragma omp parallel for - for ( uint8_t bucket=0; bucket -void load_index(Index & index, std::filesystem::path const & path) -{ +void load_index(Index &index, std::filesystem::path const &path) { PLOG_INFO << "Loading index from file " << path; std::ifstream is{path, std::ios::binary}; cereal::BinaryInputArchive iarchive{is}; diff --git a/src/main.cpp b/src/main.cpp index 3577b75..78ab2d7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -9,8 +9,7 @@ class MyFormatter : public CLI::Formatter { public: - std::string make_option_opts(const CLI::Option* opt) const override - { + std::string make_option_opts(const CLI::Option *opt) const override { std::stringstream out; if (opt->get_type_size() != 0) { @@ -26,19 +25,18 @@ class MyFormatter : public CLI::Formatter { out << " (" << get_label("Env") << ":" << opt->get_envname() << ")"; if (!opt->get_needs().empty()) { out << " " << get_label("Needs") << ":"; - for (const CLI::Option* op : opt->get_needs()) + for (const CLI::Option *op: opt->get_needs()) out << " " << op->get_name(); } if (!opt->get_excludes().empty()) { out << " " << get_label("Excludes") << ":"; - for (const CLI::Option* op : opt->get_excludes()) + for (const CLI::Option *op: opt->get_excludes()) out << " " << op->get_name(); } return out.str(); } - std::string make_option_desc(const CLI::Option* opt) const override - { + std::string make_option_desc(const CLI::Option *opt) const override { std::stringstream out; out << opt->get_description(); if (!opt->get_default_str().empty()) { @@ -48,13 +46,12 @@ class MyFormatter : public CLI::Formatter { } }; 
-int main(int argc, char* argv[])
-{
-    CLI::App app { "Charon: Categorize reads into a small number of classes." };
+int main(int argc, char *argv[]) {
+    CLI::App app{"Charon: Categorize reads into a small number of classes."};
     app.formatter(std::make_shared());
     app.add_flag_callback("-V,--version", []() {
         std::cout << "Charon version: " << SOFTWARE_VERSION << std::endl;
-        throw(CLI::Success {});
+        throw (CLI::Success{});
     });
     setup_index_subcommand(app);
     setup_classify_subcommand(app);
diff --git a/src/utils.cpp b/src/utils.cpp
index 68d0bd2..d74a9b7 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -6,10 +6,10 @@ std::filesystem::path make_absolute(std::filesystem::path path) {
     return std::filesystem::absolute(path);
 }
 
-std::vector split(const std::string& s, const std::string& delimiter){
+std::vector split(const std::string &s, const std::string &delimiter) {
     std::vector substrings;
 
-    int start, end = -1*delimiter.size();
+    int start, end = -1 * delimiter.size();
     do {
         start = end + delimiter.size();
         end = s.find(delimiter, start);
@@ -19,68 +19,60 @@ std::vector split(const std::string& s, const std::string& delimite
     return substrings;
 }
 
-bool ends_with(std::string str, std::string suffix)
-{
-    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+bool ends_with(std::string str, std::string suffix) {
+    return str.size() >= suffix.size() && str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
 }
 
-bool starts_with(std::string str, std::string prefix)
-{
+bool starts_with(std::string str, std::string prefix) {
     return str.size() >= prefix.size() && str.compare(0, prefix.size(), prefix) == 0;
 }
 
-void store_hashes( const std::string target,
-                   const std::unordered_set< uint64_t >& hashes,
-                   const std::string tmp_output_folder )
-{
-    /*
-     * store hashes from set to disk in the specified folder (or current folder ".")
-     */
-    std::filesystem::path outf{ tmp_output_folder };
-    outf += "/" + target + ".min";
-    std::ofstream outfile{ outf, std::ios::binary | std::ios::app };
-    for ( auto&& h : hashes )
-    {
-        outfile.write( reinterpret_cast< const char* >( &h ), sizeof( h ) );
-    }
-    outfile.close();
+void store_hashes(const std::string target,
+                  const std::unordered_set &hashes,
+                  const std::string tmp_output_folder) {
+    /*
+     * store hashes from set to disk in the specified folder (or current folder ".")
+     */
+    std::filesystem::path outf{tmp_output_folder};
+    outf += "/" + target + ".min";
+    std::ofstream outfile{outf, std::ios::binary | std::ios::app};
+    for (auto &&h: hashes) {
+        outfile.write(reinterpret_cast< const char * >( &h ), sizeof(h));
     }
+    outfile.close();
+}
 
-    std::vector< uint64_t > load_hashes( const std::string target,
-                                         const std::string tmp_output_folder )
-    {
-        /*
-         * load hashes file from disk and returns them in a vector
-         */
-        uint64_t hash;
-        std::vector< uint64_t > hashes;
-        std::filesystem::path file{ tmp_output_folder };
-        file += "/" + target + ".min";
-        std::ifstream infile{ file, std::ios::binary };
-        while ( infile.read( reinterpret_cast< char* >( &hash ), sizeof( hash ) ) )
-            hashes.push_back( hash );
-        return hashes;
-    }
+std::vector load_hashes(const std::string target,
+                        const std::string tmp_output_folder) {
+    /*
+     * load hashes file from disk and returns them in a vector
+     */
+    uint64_t hash;
+    std::vector hashes;
+    std::filesystem::path file{tmp_output_folder};
+    file += "/" + target + ".min";
+    std::ifstream infile{file, std::ios::binary};
+    while (infile.read(reinterpret_cast< char * >( &hash ), sizeof(hash)))
+        hashes.push_back(hash);
+    return hashes;
+}
 
-void delete_hashes( const std::vector& targets, const std::string tmp_output_folder )
-{
+void delete_hashes(const std::vector &targets, const std::string tmp_output_folder) {
     /*
     * delete hashes from disk
     */
-    for ( const auto & target: targets )
-    {
-        std::filesystem::path outf{ tmp_output_folder };
+    for (const auto &target: targets) {
+        std::filesystem::path outf{tmp_output_folder};
         outf += "/" + std::to_string(target) + ".min";
-        if ( std::filesystem::exists( outf ) )
-            std::filesystem::remove( outf );
+        if (std::filesystem::exists(outf))
+            std::filesystem::remove(outf);
     }
-    if ( std::filesystem::is_empty( tmp_output_folder ) )
+    if (std::filesystem::is_empty(tmp_output_folder))
         std::filesystem::remove(tmp_output_folder);
 }
 
-size_t bin_size_in_bits(const IndexArguments & opt, const uint64_t & num_elements)
-{
+size_t bin_size_in_bits(const IndexArguments &opt, const uint64_t &num_elements) {
     assert(opt.num_hash > 0);
     assert(opt.max_fpr > 0.0);
     assert(opt.max_fpr < 1.0);
@@ -89,15 +81,15 @@ size_t bin_size_in_bits(const IndexArguments & opt, const uint64_t & num_element
     double const denominator{std::log(1 - std::exp(std::log(opt.max_fpr) / opt.num_hash))};
     double const result{std::ceil(numerator / denominator)};
 
-    if (result > opt.bits){
-        PLOG_WARNING << "Require " << +result << " bits for max_fpr " << opt.max_fpr << " but only have " << +opt.bits << " bits available";
+    if (result > opt.bits) {
+        PLOG_WARNING << "Require " << +result << " bits for max_fpr " << opt.max_fpr << " but only have " << +opt.bits
+                     << " bits available";
         return opt.bits;
     }
     return result;
 }
 
-size_t max_num_hashes_for_fpr(const IndexArguments & opt)
-{
+size_t max_num_hashes_for_fpr(const IndexArguments &opt) {
     assert(opt.bits > 0);
     assert(opt.max_fpr > 0.0);
     assert(opt.max_fpr < 1.0);
@@ -108,30 +100,32 @@ size_t max_num_hashes_for_fpr(const IndexArguments & opt)
     return result;
 }
 
+//std::__tuple_element_t,std::vector>
-std::string sequence_to_string(const __type_pack_element<0, std::vector, std::string, std::vector>& input){
+std::string sequence_to_string(
+        const __type_pack_element<0, std::vector, std::string, std::vector> &input) {
     std::vector sequence{};
-    for (auto c : input)
+    for (auto c: input)
         sequence.push_back(seqan3::to_char(c));
     std::string str(sequence.begin(), sequence.end());
     return str;
 }
 
-float get_compression_ratio(const std::string& sequence){
-    const char * pointer = sequence.data();
+float get_compression_ratio(const std::string &sequence) {
+    const char *pointer = sequence.data();
     std::size_t initial_size = sequence.size();
     std::string compressed_data = gzip::compress(pointer, initial_size);
-    const char * compressed_pointer = compressed_data.data();
+    const char *compressed_pointer = compressed_data.data();
     std::size_t compressed_size = compressed_data.size();
     assert(compressed_size != 0);
-    return static_cast(compressed_size)/static_cast(initial_size);
+    return static_cast(compressed_size) / static_cast(initial_size);
}
 
-std::string get_extension(const std::filesystem::path read_file){
+std::string get_extension(const std::filesystem::path read_file) {
     auto ext = read_file.extension().string();
-    if (ext == ".gz"){
+    if (ext == ".gz") {
         std::filesystem::path short_read_file = read_file.stem();
         ext = short_read_file.extension().string();
     }