Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 21 additions & 21 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
# charon
# =============================================================================
set(PROJECT_NAME_STR charon)
cmake_minimum_required( VERSION 3.9 FATAL_ERROR )
cmake_minimum_required(VERSION 3.9 FATAL_ERROR)

# Custom target `version`: runs cmake/GenerateVersion.cmake in script mode (-P)
# with two -D definitions, SRC (the version.h.in template path) and DST
# (include/version.h). What the script does with them is defined in that script.
# Being a custom target, the command runs every time the target is built.
#
# CMAKE_CURRENT_SOURCE_DIR is used rather than CMAKE_SOURCE_DIR so the paths keep
# pointing at this directory when the project is pulled in via add_subdirectory();
# PROJECT_SOURCE_DIR is not available yet because project() is called further down.
# VERBATIM makes argument escaping identical on every platform/generator, and the
# quoting keeps paths containing spaces as single arguments.
add_custom_target(version
  COMMAND ${CMAKE_COMMAND}
          -D "SRC=${CMAKE_CURRENT_SOURCE_DIR}/version.h.in"
          -D "DST=${CMAKE_CURRENT_SOURCE_DIR}/include/version.h"
          -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/GenerateVersion.cmake"
  COMMENT "Running cmake/GenerateVersion.cmake"
  VERBATIM
)

project( ${PROJECT_NAME_STR} LANGUAGES CXX )
project(${PROJECT_NAME_STR} LANGUAGES CXX)

# Use C++20
set(CMAKE_CXX_STANDARD 20)
Expand All @@ -22,34 +22,34 @@ set(CMAKE_CXX_EXTENSIONS OFF)
# -----------------------------------------------------------------------------
# dependencies and 3rd party libraries
# -----------------------------------------------------------------------------
set (PROGRAM_SUBMODULES_DIR
"${CMAKE_CURRENT_LIST_DIR}/lib"
CACHE STRING "Directory containing submodules."
set(PROGRAM_SUBMODULES_DIR
"${CMAKE_CURRENT_LIST_DIR}/lib"
CACHE STRING "Directory containing submodules."
)

# Specify the directories where to store the built archives, libraries and executables
set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
set (CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
set (CMAKE_INSTALL_BINDIR "bin")
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
set(CMAKE_INSTALL_BINDIR "bin")

# Require OPENMP
find_package(OpenMP REQUIRED)
find_package( ZLIB REQUIRED )
find_package(ZLIB REQUIRED)

set (CPM_INDENT "CMake Package Manager CPM: ")
include (${PROJECT_SOURCE_DIR}/cmake/CPM.cmake)
CPMUsePackageLock (${PROJECT_SOURCE_DIR}/cmake/package-lock.cmake)
set(CPM_INDENT "CMake Package Manager CPM: ")
include(${PROJECT_SOURCE_DIR}/cmake/CPM.cmake)
CPMUsePackageLock(${PROJECT_SOURCE_DIR}/cmake/package-lock.cmake)

CPMGetPackage (seqan3)
CPMGetPackage (plog)
CPMGetPackage (gcem)
CPMGetPackage (statslib)
CPMGetPackage (bzip2)
CPMGetPackage (zlib)
CPMGetPackage (gzip)
CPMGetPackage(seqan3)
CPMGetPackage(plog)
CPMGetPackage(gcem)
CPMGetPackage(statslib)
CPMGetPackage(bzip2)
CPMGetPackage(zlib)
CPMGetPackage(gzip)

add_definitions(-D ZLIB_CONST )
add_definitions(-D ZLIB_CONST)

# -----------------------------------------------------------------------------
# install
Expand Down
33 changes: 15 additions & 18 deletions include/classify_arguments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,37 +11,35 @@ struct ClassifyArguments {
// IO options
std::filesystem::path read_file;
std::filesystem::path read_file2;
bool is_paired { false };
bool is_paired{false};
std::string db;
uint8_t chunk_size { 100 };
uint8_t chunk_size{100};


// Stats options
float lo_hi_threshold {0.15};
uint16_t num_reads_to_fit {5000};
float lo_hi_threshold{0.15};
uint16_t num_reads_to_fit{5000};
std::string dist{"beta"};

// thresholds for filtering
float min_quality { 10.0 };
uint32_t min_length { 140 };
float min_compression {0.15};
uint8_t confidence_threshold{0};
uint8_t min_hits{2};
float min_proportion_difference { 0.00 };
float min_quality{10.0};
uint32_t min_length{140};
float min_compression{0.15};
uint8_t confidence_threshold{2};
float min_proportion_difference{0.00};

// Output options
bool run_extract {false};
bool run_extract{false};
std::string category_to_extract;
std::string prefix;
std::unordered_map<uint8_t, std::vector<std::filesystem::path>> extract_category_to_file;

// General options
std::string log_file {"charon.log"};
uint8_t threads { 1 };
uint8_t verbosity { 0 };
std::string log_file{"charon.log"};
uint8_t threads{1};
uint8_t verbosity{0};

std::string to_string()
{
std::string to_string() {
std::string ss;

ss += "\n\nClassify Arguments:\n\n";
Expand All @@ -58,8 +56,7 @@ struct ClassifyArguments {
ss += "\tmin_quality:\t\t" + std::to_string(min_quality) + "\n";
ss += "\tmin_compression:\t\t" + std::to_string(min_compression) + "\n";
ss += "\tconfidence_threshold:\t" + std::to_string(confidence_threshold) + "\n";
ss += "\tmin_hits:\t\t" + std::to_string(min_hits) + "\n";
ss += "\tmin_diff:\t\t" + std::to_string(min_proportion_difference) + "\n\n";
ss += "\tmin_proportion_diff:\t\t" + std::to_string(min_proportion_difference) + "\n\n";

ss += "\tcategory_to_extract:\t" + category_to_extract + "\n";
ss += "\tprefix:\t" + prefix + "\n\n";
Expand Down
7 changes: 4 additions & 3 deletions include/classify_main.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@
#include "result.hpp"

class Index;

struct ClassifyArguments;

void setup_classify_subcommand(CLI::App& app);
void setup_classify_subcommand(CLI::App &app);

void classify_reads(const ClassifyArguments& opt, const Index& index);
void classify_reads(const ClassifyArguments &opt, const Index &index);

int classify_main(ClassifyArguments & opt);
int classify_main(ClassifyArguments &opt);


#endif // CHARON_CLASSIFY_MAIN_H
30 changes: 18 additions & 12 deletions include/classify_stats.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ struct KDEParams {
std::sort(dataset.begin(), dataset.end());
}

float quantile(const float& alpha) const {
float quantile(const float &alpha) const {
int idx = std::ceil((1. - alpha) * dataset.size());
return dataset[idx];
}
Expand All @@ -227,7 +227,7 @@ struct KDEParams {
const auto mu = mean(dataset);
const auto var = variance(dataset, mu);
const auto std = sqrt(var);
const auto A = std::min(iqr/1.34, std);
const auto A = std::min(iqr / 1.34, std);
h = 0.9 * A * pow(dataset.size(), -0.2);
}

Expand All @@ -239,16 +239,16 @@ struct KDEParams {
PLOG_INFO << "Fitting KDE to data with factor " << h;
}

float K(const float & x) const {
return std::exp(-std::pow(x,2)/2)/sqrt(2*3.141592653589793238463);
float K(const float &x) const {
return std::exp(-std::pow(x, 2) / 2) / sqrt(2 * 3.141592653589793238463);
}

float prob(const float & x) const {
float prob(const float &x) const {
float total_sum = 0;
for (const auto &xi : dataset){
for (const auto &xi: dataset) {
total_sum += K((x - xi) / h);
}
return total_sum/(h*dataset.size());
return total_sum / (h * dataset.size());
}

};
Expand All @@ -267,8 +267,8 @@ class Model {
GammaParams g_neg{10, 0, 0.005};
BetaParams b_pos{6, 4};
BetaParams b_neg{6, 40};
KDEParams k_pos{default_pos_data,0.1};
KDEParams k_neg{default_neg_data,0.001};
KDEParams k_pos{default_pos_data, 0.1};
KDEParams k_neg{default_neg_data, 0.001};
public:
Model() = default;

Expand Down Expand Up @@ -308,7 +308,8 @@ class Model {
g_neg.fit_loc(training_data.neg);
//auto ad = g_neg.calculate_anderson_darling(training_data.neg);
PLOG_INFO << "Model " << +id << " using default for g_neg data with Gamma (shape:" << g_neg.shape
<< ", loc: " << g_neg.loc << ", scale: " << g_neg.scale << ")";//. Anderson-darling statistic is " << ad;
<< ", loc: " << g_neg.loc << ", scale: " << g_neg.scale
<< ")";//. Anderson-darling statistic is " << ad;
}
}

Expand Down Expand Up @@ -401,6 +402,7 @@ class StatsModel {
uint32_t min_length_;
float min_compression_;
int8_t confidence_threshold_;
float confidence_probability_threshold_;
uint8_t min_hits_;
float host_unique_prop_lo_threshold_;
float min_proportion_difference_;
Expand Down Expand Up @@ -429,7 +431,6 @@ class StatsModel {
min_length_(opt.min_length),
min_compression_(opt.min_compression),
confidence_threshold_(opt.confidence_threshold),
min_hits_(opt.min_hits),
min_proportion_difference_(opt.min_proportion_difference) {
for (auto i = 0; i < summary.num_categories(); ++i) {
models_.emplace_back(Model(i, opt.dist));
Expand All @@ -444,9 +445,10 @@ class StatsModel {
min_length_(opt.min_length),
min_compression_(opt.min_compression),
confidence_threshold_(opt.confidence_threshold),
confidence_probability_threshold_(opt.confidence_probability_threshold),
host_unique_prop_lo_threshold_(opt.host_unique_prop_lo_threshold),
min_proportion_difference_(opt.min_proportion_difference),
min_prob_difference_(opt.min_prob_difference){
min_prob_difference_(opt.min_prob_difference) {
for (auto i = 0; i < summary.num_categories(); ++i) {
models_.emplace_back(Model(i, opt.dist));
training_data_.emplace_back(TrainingData(opt, i));
Expand Down Expand Up @@ -496,6 +498,10 @@ class StatsModel {
return confidence_threshold_;
}

float confidence_probability_threshold() const {
return confidence_probability_threshold_;
}

// Returns the stored min_hits_ value.
// NOTE(review): of the two constructor initializer lists visible in this view, only one
// contains `min_hits_(opt.min_hits)`. If a constructor leaves min_hits_ uninitialized,
// this getter returns an indeterminate value - confirm min_hits_ is always initialized.
uint8_t min_num_hits() const {
return min_hits_;
}
Expand Down
35 changes: 17 additions & 18 deletions include/counts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,44 +10,43 @@
// Extremely simple Counts class - represents a lower-triangular square matrix including the diagonal,
// expecting [row,col]==[col,row] so each pair can be stored once, not twice
template<class T>
class Counts
{
class Counts {
public:
Counts(){};
Counts(size_t size)
{
Counts() {};

Counts(size_t size) {
assert(size > 0);
mRows = size;
mData.resize(mRows * (mRows+1) / 2);
mData.resize(mRows * (mRows + 1) / 2);
mData.shrink_to_fit();
}
void set_size(size_t size)
{

void set_size(size_t size) {
assert(size > 0);
mRows = size;
mData.resize(mRows * (mRows+1) / 2);
mData.resize(mRows * (mRows + 1) / 2);
mData.shrink_to_fit();
}
// Mutable access to the count stored for the unordered pair (row, col).
// Only the lower triangle (row >= col) is stored, packed row by row, so the
// element lives at index row * (row + 1) / 2 + col. When the caller passes an
// upper-triangle coordinate (row < col) the two indices are exchanged first.
// Both indices must be < rows(); this is checked with assert only.
T &operator()(size_t row, size_t col) {
    assert(row < mRows);
    assert(col < mRows);
    if (row < col) {
        // Explicit swap: a comma expression such as `col, row = row, col;`
        // evaluates to a no-op in C++ and would leave the indices unswapped,
        // making (row, col) alias a different cell or index past the buffer.
        const size_t tmp = row;
        row = col;
        col = tmp;
    }
    return mData[row * (row + 1) / 2 + col];
}
// Read-only access to the count stored for the unordered pair (row, col).
// Mirrors the mutable overload: only the lower triangle (row >= col) is stored,
// at index row * (row + 1) / 2 + col, so an upper-triangle coordinate
// (row < col) is mapped to its mirror (col, row) before indexing.
// Both indices must be < rows(); this is checked with assert only.
const T &operator()(size_t row, size_t col) const {
    // size_t is unsigned, so only the upper bounds need checking - and each
    // index needs its own check (col, not row twice).
    assert(row < mRows);
    assert(col < mRows);
    if (row < col) {
        const size_t tmp = row;
        row = col;
        col = tmp;
    }
    return mData[row * (row + 1) / 2 + col];
}
size_t rows() const noexcept
{

size_t rows() const noexcept {
return mRows;
}

protected:
size_t mRows;
std::vector<T> mData;
Expand Down
37 changes: 18 additions & 19 deletions include/dehost_arguments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,40 +10,39 @@ struct DehostArguments {
// IO options
std::filesystem::path read_file;
std::filesystem::path read_file2;
bool is_paired { false };
bool is_paired{false};
std::string db;

// Output options
bool run_extract {false};
bool run_extract{false};
std::string category_to_extract;
std::string prefix;
std::unordered_map<uint8_t, std::vector<std::filesystem::path>> extract_category_to_file;

uint8_t chunk_size { 100 };
uint8_t chunk_size{100};

// Stats options
float lo_hi_threshold {0.15};
uint16_t num_reads_to_fit {5000};
float lo_hi_threshold{0.15};
uint16_t num_reads_to_fit{5000};
std::string dist{"kde"};

// thresholds for filtering
float min_quality { 15.0 };
uint32_t min_length { 140 };
float min_compression {0};
uint8_t confidence_threshold{2};
float host_unique_prop_lo_threshold{ 0.05 };
float min_proportion_difference { 0.04 };
float min_prob_difference{ 0 };

float min_quality{15.0};
uint32_t min_length{140};
float min_compression{0};
uint8_t confidence_threshold{7};
float confidence_probability_threshold{0};
float host_unique_prop_lo_threshold{0.05};
float min_proportion_difference{0.04};
float min_prob_difference{0};


// General options
std::string log_file {"charon.log"};
uint8_t threads { 1 };
uint8_t verbosity { 0 };
std::string log_file{"charon.log"};
uint8_t threads{1};
uint8_t verbosity{0};

std::string to_string()
{
std::string to_string() {
std::string ss;

ss += "\n\nDehost Arguments:\n\n";
Expand All @@ -63,12 +62,12 @@ struct DehostArguments {
ss += "\tmin_quality:\t\t\t" + std::to_string(min_quality) + "\n";
ss += "\tmin_compression:\t\t" + std::to_string(min_compression) + "\n";
ss += "\tconfidence_threshold:\t\t" + std::to_string(confidence_threshold) + "\n";
ss += "\tconfidence_probability_threshold:\t\t" + std::to_string(confidence_probability_threshold) + "\n";
ss += "\thost_unique_prop_lo_threshold:\t" + std::to_string(host_unique_prop_lo_threshold) + "\n";
ss += "\tmin_proportion_difference:\t" + std::to_string(min_proportion_difference) + "\n";
ss += "\tmin_prob_difference:\t\t" + std::to_string(min_prob_difference) + "\n\n";



ss += "\tlog_file:\t\t\t" + log_file + "\n";
ss += "\tthreads:\t\t\t" + std::to_string(threads) + "\n";
ss += "\tverbosity:\t\t\t" + std::to_string(verbosity) + "\n\n";
Expand Down
Loading