Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 26 additions & 3 deletions include/sdsl/lcp_construct.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
\brief lcp_construct.hpp contains a space and time efficient construction method for lcp arrays
\author Simon Gog
*/

#ifndef INCLUDED_SDSL_LCP_CONSTRUCT
#define INCLUDED_SDSL_LCP_CONSTRUCT

Expand All @@ -28,6 +29,7 @@
#include "testutils.hpp"
#include "isa_construct.hpp"
#include "bwt_construct.hpp"
#include "wt_huff.hpp"

#include <iostream>
#include <stdexcept>
Expand Down Expand Up @@ -116,9 +118,6 @@ void push_back_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t
m_list[c].push_back(i);
}

// only phase 1 of the new algorithm
bool construct_lcp_simple_5n(tMSS& file_map, const std::string& dir, const std::string& id);

// only phase 2 of the new algorithm
// TODO: assert n > 0
bool construct_lcp_simple2_9n(tMSS& file_map, const std::string& dir, const std::string& id);
Expand Down Expand Up @@ -153,6 +152,30 @@ bool construct_lcp_goPHI(tMSS& file_map, const std::string& dir, const std::stri
*/
bool construct_lcp_go2(tMSS& file_map, const std::string& dir, const std::string& id);

//! 2.5n byte variant of the algorithm of Beller et al. (SPIRE 2011, "Computing the Longest Common Prefix Array Based on the Burrows-Wheeler Transform")
/*! The algorithm computes the lcp array and stores it to disk. It needs only the Burrows and Wheeler transform.
* \param file_map A map which contains the filenames of previously computed structures (like the Burrows and Wheeler transform)
* \param dir Directory where the lcp array should be stored.
* \param id Id for the file name of the lcp array.
* \par Time complexity
* Usually \f$ \Order{n \log{\sigma}} \f$
* \par Space complexity
* Usually less than \f$ 2.5n \f$ bytes
*/
bool construct_lcp_bwt_based(tMSS& file_map, const std::string& dir, const std::string& id);

//! 1.5n byte variant of the algorithm of Beller et al. (Journal of Discrete Algorithms ISSN 1570-8667, 10.1016/j.jda.2012.07.007, "Computing the Longest Common Prefix Array Based on the Burrows-Wheeler Transform")
/*! The algorithm computes the lcp array and stores it to disk. It needs only the Burrows and Wheeler transform.
* \param file_map A map which contains the filenames of previously computed structures (like the Burrows and Wheeler transform)
* \param dir Directory where the lcp array should be stored.
* \param id Id for the file name of the lcp array.
* \par Time complexity
* \f$ \Order{n \log{\sigma}} \f$
* \par Space complexity
* Usually not more than \f$ 1.5n \f$ bytes
*/
bool construct_lcp_bwt_based2(tMSS& file_map, const std::string& dir, const std::string& id);

void lcp_info(tMSS& file_map);

}// end namespace
Expand Down
2 changes: 1 addition & 1 deletion include/sdsl/sorted_multi_stack_support.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
\author Simon Gog
*/
#ifndef INCLUDED_SDSL_SORTED_MULTI_STACK_SUPPORT
#define INCLUDED_SDSL_SORTED_MUTLI_STACK_SUPPORT
#define INCLUDED_SDSL_SORTED_MULTI_STACK_SUPPORT

#include "int_vector.hpp"
#include "bitmagic.hpp"
Expand Down
1 change: 1 addition & 0 deletions include/sdsl/uint128_t.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ namespace sdsl

typedef unsigned int uint128_t __attribute__((mode(TI)));

inline
std::ostream& operator<<(std::ostream& os, const uint128_t& x)
{
uint64_t X[2] = {(uint64_t)(x >> 64), (uint64_t)x};
Expand Down
1 change: 1 addition & 0 deletions include/sdsl/uint256_t.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ class uint256_t
}
};

inline
std::ostream& operator<<(std::ostream& os, const uint256_t& x)
{
uint64_t X[4] = {(uint64_t)(x.m_high >> 64), (uint64_t)x.m_high, x.m_mid, x.m_lo};
Expand Down
60 changes: 60 additions & 0 deletions include/sdsl/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,26 @@ typename int_vector_type::size_type get_onezero_bits(const int_vector_type& v);
template <class int_vector_type>
typename int_vector_type::size_type get_zeroone_bits(const int_vector_type& v);

//! Get the smallest position \f$i\geq idx\f$ where a bit is set
/*! \param v The int_vector in which the bit is searched
* \param idx The start position for the search \f$ 0\leq idx < v.bit_size()\f$
* \return The smallest position greater than or equal to idx where the corresponding bit is 1, or v.bit_size() if no such position exists
* \par Time complexity
* \f$ \Order{n} \f$
*/
template <class int_vector_type>
typename int_vector_type::size_type next_bit(const int_vector_type& v, uint64_t idx);

//! Get the greatest position \f$i\leq idx\f$ where a bit is set
/*! \param v The int_vector in which the bit is searched
* \param idx The start position for the search \f$ 0\leq idx < v.bit_size()\f$
* \return The greatest position smaller than or equal to idx where the corresponding bit is 1, or v.bit_size() if no such position exists
* \par Time complexity
* \f$ \Order{n} \f$
*/
template <class int_vector_type>
typename int_vector_type::size_type prev_bit(const int_vector_type& v, uint64_t idx);

//! Load a data structure from a file.
/*! The data structure has to provide a load function.
* \param v Data structure to load.
Expand Down Expand Up @@ -614,6 +634,46 @@ typename int_vector_type::size_type util::get_zeroone_bits(const int_vector_type
return result;
}

// Forward scan for a set bit: first inspect the remainder of the 64-bit word
// that contains idx, then walk the following words one by one.
template <class int_vector_type>
typename int_vector_type::size_type util::next_bit(const int_vector_type& v, uint64_t idx)
{
    uint64_t word_idx = idx >> 6;          // index of the 64-bit word holding idx
    uint64_t head = v.data()[word_idx];
    head >>= (idx & 0x3F);                 // drop the bits located below idx
    if (head) {
        // A set bit exists at or after idx inside the same word.
        return idx + bit_magic::r1BP(head);
    }
    // Nothing left in the first word: look at each subsequent word whose
    // first bit position is still smaller than v.bit_size().
    for (++word_idx; (word_idx << 6) < v.bit_size(); ++word_idx) {
        if (v.data()[word_idx]) {
            return (word_idx << 6) | bit_magic::r1BP(v.data()[word_idx]);
        }
    }
    // No set bit found.
    return v.bit_size();
}

// Backward scan for a set bit: first inspect the part of the 64-bit word
// containing idx that lies at or below idx, then walk the preceding words.
template <class int_vector_type>
typename int_vector_type::size_type util::prev_bit(const int_vector_type& v, uint64_t idx)
{
    uint64_t word_idx = idx >> 6;                  // index of the 64-bit word holding idx
    const uint64_t shift = 63 - (idx & 0x3F);      // number of bits located above idx in that word
    uint64_t head = v.data()[word_idx];
    head <<= shift;                                // push the bits above idx out of the word
    if (head) {
        // Undo the shift to translate the in-word position back to a global one.
        return bit_magic::l1BP(head) + (word_idx << 6) - shift;
    }
    // Walk towards word 0. Decrementing the unsigned index below 0 wraps around,
    // so (word_idx << 6) is no longer smaller than v.bit_size() and the loop stops.
    for (--word_idx; (word_idx << 6) < v.bit_size(); --word_idx) {
        if (v.data()[word_idx]) {
            return (word_idx << 6) | bit_magic::l1BP(v.data()[word_idx]);
        }
    }
    // No set bit found.
    return v.bit_size();
}

template<typename T>
std::string util::to_string(const T& t)
{
Expand Down
94 changes: 53 additions & 41 deletions lib/bwt_construct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,55 +107,67 @@ bool construct_bwt2(tMSS& file_map, const std::string& dir, const std::string& i
{
typedef int_vector<>::size_type size_type;
write_R_output("csa", "construct BWT", "begin", 1, 0);
// if( file_map.find("bwt") == file_map.end() ){ // if bwt is not already on disk => calculate it
int_vector_file_buffer<> sa_buf(file_map["sa"].c_str());
const size_type n = sa_buf.int_vector_size;
if( file_map.find("bwt") == file_map.end() ){ // if bwt is not already on disk => calculate it
std::string bwt_file_name = dir+"bwt_"+id;
std::ifstream bwt_in(bwt_file_name.c_str());
// check if bwt is already on disk => register it
if (bwt_in) {
file_map["bwt"] = bwt_file_name;
bwt_in.close();
return true;
}
int_vector_file_buffer<> sa_buf(file_map["sa"].c_str());
const size_type n = sa_buf.int_vector_size;

if (n < 3)
return construct_bwt(file_map, dir, id);
if (n < 3)
return construct_bwt(file_map, dir, id);

unsigned char* text = NULL;
int_vector_file_buffer<8> text_buf(file_map["text"].c_str());
util::load_from_int_vector_buffer(text, text_buf);
unsigned char* text = NULL;
int_vector_file_buffer<8> text_buf(file_map["text"].c_str());
util::load_from_int_vector_buffer(text, text_buf);

size_type cnt_c[257] = {0}; // counter for each character in the text
size_type cnt_cc[257] = {0}; // prefix sum of the counter cnt_c
// unsigned char alphabet[257] = {0};
// uint8_t sigma = 0;
size_type cnt_c[257] = {0}; // counter for each character in the text
size_type cnt_cc[257] = {0}; // prefix sum of the counter cnt_c
// unsigned char alphabet[257] = {0};
// uint8_t sigma = 0;

write_R_output("csa", "construct C", "begin", 1, 0);
for (size_type i=0; i<n; ++i) { // initialize cnt_c
++cnt_c[text[i]+1];
}
write_R_output("csa", "construct C", "end", 1, 0);
for (int i=1; i<257; ++i) { // calculate sigma and initailize cnt_cc
// if( cnt_c[i] > 0 ){ alphabet[sigma++] = (unsigned char)(i-1); }
cnt_cc[i] = cnt_c[i] + cnt_cc[i-1];
}
// alphabet[sigma] = '\0';

size_type to_add[2] = {-1,n-1};

int_vector<8> bwt(n,0);
sa_buf.reset();
for (size_type i=0, sai, r=0, r_sum=0 ; r_sum < n;) {
for (; i < r_sum+r; ++i) {
uint8_t bwti;
sai = sa_buf[i-r_sum];
if (bwt[i]) { // if the current BWT entry is already done
bwti = bwt[i];
} else {
bwti = bwt[i] = text[sai+to_add[sai==0]];
size_type lf = cnt_cc[bwti];
if (lf > i and sai > 1) {
bwt[lf] = text[sai-2];
write_R_output("csa", "construct C", "begin", 1, 0);
for (size_type i=0; i<n; ++i) { // initialize cnt_c
++cnt_c[text[i]+1];
}
write_R_output("csa", "construct C", "end", 1, 0);
for (int i=1; i<257; ++i) { // calculate sigma and initailize cnt_cc
// if( cnt_c[i] > 0 ){ alphabet[sigma++] = (unsigned char)(i-1); }
cnt_cc[i] = cnt_c[i] + cnt_cc[i-1];
}
// alphabet[sigma] = '\0';

size_type to_add[2] = {-1,n-1};

int_vector<8> bwt(n,0);
sa_buf.reset();
for (size_type i=0, sai, r=0, r_sum=0 ; r_sum < n;) {
for (; i < r_sum+r; ++i) {
uint8_t bwti;
sai = sa_buf[i-r_sum];
if (bwt[i]) { // if the current BWT entry is already done
bwti = bwt[i];
} else {
bwti = bwt[i] = text[sai+to_add[sai==0]];
size_type lf = cnt_cc[bwti];
if (lf > i and sai > 1) {
bwt[lf] = text[sai-2];
}
}
++cnt_cc[bwti]; // update counter and therefore the LF information
}
++cnt_cc[bwti]; // update counter and therefore the LF information
r_sum += r; r = sa_buf.load_next_block();
}
if(!util::store_to_file(bwt, bwt_file_name.c_str())){
return false;
}
r_sum += r; r = sa_buf.load_next_block();
file_map["bwt"] = bwt_file_name;
}
// }
write_R_output("csa", "construct BWT", "end", 1, 0);
return true;
}
Expand Down
Loading