diff --git a/include/sdsl/lcp_construct.hpp b/include/sdsl/lcp_construct.hpp index 4222d31..0acb761 100644 --- a/include/sdsl/lcp_construct.hpp +++ b/include/sdsl/lcp_construct.hpp @@ -18,6 +18,7 @@ \brief lcp_construct.hpp contains a space and time efficient construction method for lcp arrays \author Simon Gog */ + #ifndef INCLUDED_SDSL_LCP_CONSTRUCT #define INCLUDED_SDSL_LCP_CONSTRUCT @@ -28,6 +29,7 @@ #include "testutils.hpp" #include "isa_construct.hpp" #include "bwt_construct.hpp" +#include "wt_huff.hpp" #include #include @@ -116,9 +118,6 @@ void push_back_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t m_list[c].push_back(i); } -// only phase 1 of the new algorithm -bool construct_lcp_simple_5n(tMSS& file_map, const std::string& dir, const std::string& id); - // only phase 2 of the new algorithm // TODO: assert n > 0 bool construct_lcp_simple2_9n(tMSS& file_map, const std::string& dir, const std::string& id); @@ -153,6 +152,30 @@ bool construct_lcp_goPHI(tMSS& file_map, const std::string& dir, const std::stri */ bool construct_lcp_go2(tMSS& file_map, const std::string& dir, const std::string& id); +//! 2.5n byte variant of the algorithm of Beller et al. (SPIRE 2011, "Computing the Longest Common Prefix Array Based on the Burrows-Wheeler Transform") +/*! The algorithm computes the lcp array and stores it to disk. It needs only the Burrows and Wheeler transform. + * \param file_map A map which contains the filenames of previous computed structures (like Burrows and Wheeler transform) + * \param dir Directory where the lcp array should be stored. + * \param id Id for the file name of the lcp array. + * \par Time complexity + * Usually \f$ \Order{n \log{\sigma}} \f$ + * \par Space complexity + * Usually less than \f$ 2.5n \f$ bytes + */ +bool construct_lcp_bwt_based(tMSS& file_map, const std::string& dir, const std::string& id); + +//! 1.5n byte variant of the algorithm of Beller et al. (Journal of Discrete Algorithms ISSN 1570-8667, 10.1016/j.jda.2012.07.007, "Computing the Longest Common Prefix Array Based on the Burrows-Wheeler Transform") +/*! The algorithm computes the lcp array and stores it to disk. It needs only the Burrows and Wheeler transform. + * \param file_map A map which contains the filenames of previous computed structures (like Burrows and Wheeler transform) + * \param dir Directory where the lcp array should be stored. + * \param id Id for the file name of the lcp array. + * \par Time complexity + * \f$ \Order{n \log{\sigma}} \f$ + * \par Space complexity + * Usually not more than \f$ 1.5n \f$ bytes + */ +bool construct_lcp_bwt_based2(tMSS& file_map, const std::string& dir, const std::string& id); + void lcp_info(tMSS& file_map); }// end namespace diff --git a/include/sdsl/sorted_multi_stack_support.hpp b/include/sdsl/sorted_multi_stack_support.hpp index a6f7e02..13e348e 100644 --- a/include/sdsl/sorted_multi_stack_support.hpp +++ b/include/sdsl/sorted_multi_stack_support.hpp @@ -20,7 +20,7 @@ \author Simon Gog */ #ifndef INCLUDED_SDSL_SORTED_MULTI_STACK_SUPPORT -#define INCLUDED_SDSL_SORTED_MUTLI_STACK_SUPPORT +#define INCLUDED_SDSL_SORTED_MULTI_STACK_SUPPORT #include "int_vector.hpp" #include "bitmagic.hpp" diff --git a/include/sdsl/uint128_t.hpp b/include/sdsl/uint128_t.hpp index ae61c99..a60644d 100644 --- a/include/sdsl/uint128_t.hpp +++ b/include/sdsl/uint128_t.hpp @@ -29,6 +29,7 @@ namespace sdsl typedef unsigned int uint128_t __attribute__((mode(TI))); +inline std::ostream& operator<<(std::ostream& os, const uint128_t& x) { uint64_t X[2] = {(uint64_t)(x >> 64), (uint64_t)x}; diff --git a/include/sdsl/uint256_t.hpp b/include/sdsl/uint256_t.hpp index 3b5bf8e..28bf5d4 100644 --- a/include/sdsl/uint256_t.hpp +++ b/include/sdsl/uint256_t.hpp @@ -232,6 +232,7 @@ class uint256_t } }; +inline std::ostream& operator<<(std::ostream& os, const uint256_t& x) { uint64_t X[4] = {(uint64_t)(x.m_high >> 64), (uint64_t)x.m_high, x.m_mid, x.m_lo}; diff --git a/include/sdsl/util.hpp b/include/sdsl/util.hpp index 02561f0..f854915 100644 --- a/include/sdsl/util.hpp +++ b/include/sdsl/util.hpp @@ -131,6 +131,26 @@ typename int_vector_type::size_type get_onezero_bits(const int_vector_type& v); template typename int_vector_type::size_type get_zeroone_bits(const int_vector_type& v); +//! Get the smallest position \f$i\geq idx\f$ where a bit is set +/*! \param v The int_vector in which the bit is searched + * \param idx The start position for the search \f$ 0\leq idx < v.bit_size()\f$ + * \return The smallest position greater or equal to idx, where corresponding bit is 1 or v.bit_size() if no such position exists + * \par Time complexity + * \f$ \Order{n} \f$ + */ +template +typename int_vector_type::size_type next_bit(const int_vector_type& v, uint64_t idx); + +//! Get the greatest position \f$i\leq idx\f$ where a bit is set +/*! \param v The int_vector in which the bit is searched + * \param idx The start position for the search \f$ 0\leq idx < v.bit_size()\f$ + * \return The greatest position smaller or equal to idx, where corresponding bit is 1 or v.bit_size() if no such position exists + * \par Time complexity + * \f$ \Order{n} \f$ +*/ +template +typename int_vector_type::size_type prev_bit(const int_vector_type& v, uint64_t idx); + //! Load a data structure from a file. /*! The data structure has to provide a load function. * \param v Data structure to load. @@ -614,6 +634,46 @@ typename int_vector_type::size_type util::get_zeroone_bits(const int_vector_type return result; } +template +typename int_vector_type::size_type util::next_bit(const int_vector_type& v, uint64_t idx) +{ + uint64_t pos = idx>>6; + uint64_t node = v.data()[pos]; + node >>= (idx&0x3F); + if(node) { + return idx+bit_magic::r1BP(node); + } else { + ++pos; + while((pos<<6) < v.bit_size()) { + if(v.data()[pos]) { + return (pos<<6)|bit_magic::r1BP(v.data()[pos]); + } + ++pos; + } + return v.bit_size(); + } +} + +template +typename int_vector_type::size_type util::prev_bit(const int_vector_type& v, uint64_t idx) +{ + uint64_t pos = idx>>6; + uint64_t node = v.data()[pos]; + node <<= 63-(idx&0x3F); + if(node) { + return bit_magic::l1BP(node)+(pos<<6)-(63-(idx&0x3F)); + } else { + --pos; + while((pos<<6) < v.bit_size() ) { + if(v.data()[pos]) { + return (pos<<6)|bit_magic::l1BP(v.data()[pos]); + } + --pos; + } + return v.bit_size(); + } +} + template std::string util::to_string(const T& t) { diff --git a/lib/bwt_construct.cpp b/lib/bwt_construct.cpp index f6d094e..4944aab 100644 --- a/lib/bwt_construct.cpp +++ b/lib/bwt_construct.cpp @@ -107,55 +107,67 @@ bool construct_bwt2(tMSS& file_map, const std::string& dir, const std::string& i { typedef int_vector<>::size_type size_type; write_R_output("csa", "construct BWT", "begin", 1, 0); -// if( file_map.find("bwt") == file_map.end() ){ // if bwt is not already on disk => calculate it - int_vector_file_buffer<> sa_buf(file_map["sa"].c_str()); - const size_type n = sa_buf.int_vector_size; + if( file_map.find("bwt") == file_map.end() ){ // if bwt is not already on disk => calculate it + std::string bwt_file_name = dir+"bwt_"+id; + std::ifstream bwt_in(bwt_file_name.c_str()); + // check if bwt is already on disk => register it + if (bwt_in) { + file_map["bwt"] = bwt_file_name; + bwt_in.close(); + return true; + } + int_vector_file_buffer<> sa_buf(file_map["sa"].c_str()); + const size_type n = sa_buf.int_vector_size; - if (n < 3) - return construct_bwt(file_map, dir, id); + if (n < 3) + return construct_bwt(file_map, dir, id); - unsigned char* text = NULL; - int_vector_file_buffer<8> text_buf(file_map["text"].c_str()); - util::load_from_int_vector_buffer(text, text_buf); + unsigned char* text = NULL; + int_vector_file_buffer<8> text_buf(file_map["text"].c_str()); + util::load_from_int_vector_buffer(text, text_buf); - size_type cnt_c[257] = {0}; // counter for each character in the text - size_type cnt_cc[257] = {0}; // prefix sum of the counter cnt_c -// unsigned char alphabet[257] = {0}; -// uint8_t sigma = 0; + size_type cnt_c[257] = {0}; // counter for each character in the text + size_type cnt_cc[257] = {0}; // prefix sum of the counter cnt_c +// unsigned char alphabet[257] = {0}; +// uint8_t sigma = 0; - write_R_output("csa", "construct C", "begin", 1, 0); - for (size_type i=0; i 0 ){ alphabet[sigma++] = (unsigned char)(i-1); } - cnt_cc[i] = cnt_c[i] + cnt_cc[i-1]; - } -// alphabet[sigma] = '\0'; - - size_type to_add[2] = {-1,n-1}; - - int_vector<8> bwt(n,0); - sa_buf.reset(); - for (size_type i=0, sai, r=0, r_sum=0 ; r_sum < n;) { - for (; i < r_sum+r; ++i) { - uint8_t bwti; - sai = sa_buf[i-r_sum]; - if (bwt[i]) { // if the current BWT entry is already done - bwti = bwt[i]; - } else { - bwti = bwt[i] = text[sai+to_add[sai==0]]; - size_type lf = cnt_cc[bwti]; - if (lf > i and sai > 1) { - bwt[lf] = text[sai-2]; + write_R_output("csa", "construct C", "begin", 1, 0); + for (size_type i=0; i 0 ){ alphabet[sigma++] = (unsigned char)(i-1); } + cnt_cc[i] = cnt_c[i] + cnt_cc[i-1]; + } +// alphabet[sigma] = '\0'; + + size_type to_add[2] = {-1,n-1}; + + int_vector<8> bwt(n,0); + sa_buf.reset(); + for (size_type i=0, sai, r=0, r_sum=0 ; r_sum < n;) { + for (; i < r_sum+r; ++i) { + uint8_t bwti; + sai = sa_buf[i-r_sum]; + if (bwt[i]) { // if the current BWT entry is already done + bwti = bwt[i]; + } else { + bwti = bwt[i] = text[sai+to_add[sai==0]]; + size_type lf = cnt_cc[bwti]; + if (lf > i and sai > 1) { + bwt[lf] = text[sai-2]; + } } + ++cnt_cc[bwti]; // update counter and therefore the LF information } - ++cnt_cc[bwti]; // update counter and therefore the LF information + r_sum += r; r = sa_buf.load_next_block(); + } + if(!util::store_to_file(bwt, bwt_file_name.c_str())){ + return false; } - r_sum += r; r = sa_buf.load_next_block(); + file_map["bwt"] = bwt_file_name; } -// } write_R_output("csa", "construct BWT", "end", 1, 0); return true; } diff --git a/lib/lcp_construct.cpp b/lib/lcp_construct.cpp index 8725838..78a5344 100644 --- a/lib/lcp_construct.cpp +++ b/lib/lcp_construct.cpp @@ -384,135 +384,6 @@ uint8_t buffered_char_queue::pop_front() return x; } -bool construct_lcp_simple_5n(tMSS& file_map, const std::string& dir, const std::string& id) -{ - typedef int_vector<>::size_type size_type; - write_R_output("lcp","construct LCP", "begin", 1, 0); - construct_bwt(file_map, dir, id); - - int_vector_file_buffer<> sa_buf(file_map["sa"].c_str()); // initialize buffer for suffix array - sa_buf.load_next_block(); - size_type sai_1 = sa_buf[0]; // store value of sa[i-1] - int_vector_file_buffer<8> bwt_buf(file_map["bwt"].c_str()); // initialize buffer of bwt - size_type r = bwt_buf.load_next_block(); - uint8_t bwti_1 = bwt_buf[0]; // store value of BWT[i-1] - int_vector<8> text; - util::load_from_file(text, file_map["text"].c_str()); - - const size_type n = sa_buf.int_vector_size; - - size_type cnt_c[257] = {0}; // counter for each character in the text - size_type cnt_cc[257] = {0}; // prefix sum of the counter cnt_c - size_type prev_occ_in_bwt[256] = {0}; // position of the previous occurence of each character c in the bwt - for (size_type i=0; i 0) { - alphabet[sigma++] = (unsigned char)(i-1); - } - cnt_cc[i] = cnt_c[i] + cnt_cc[i-1]; - } - for (size_type i=0; i<256; ++i) prev_occ_in_bwt[i] = (size_type)-1; // initialze the array with -1 - - int_vector<> lcp(n, 0, sa_buf.int_width); - lcp[ cnt_cc[bwti_1]++ ] = 0; // lcp[ LF[0] ] = 0 - - const size_type update_stack = 1024; - - int_vector<64> rmq_stack(2*(update_stack + sigma + 8)); // initialize stack for (update_stack+sigma+8) elements representing (position, value) - rmq_stack[0] = 0; rmq_stack[1] = 0; // first element (-1, -1) - rmq_stack[2] = 1; rmq_stack[3] = 0; // second element (0, -1) - size_type rmq_end=3; // index of the value of the topmost element - const size_type rmq_limit = rmq_stack.size()-4; - uint8_t cur_c = alphabet[1]; - size_type comps = 0; - - size_type queries[257] = {0}; - for (size_type i=1, sai, r_sum=0, cur_c_idx=1, cur_c_cnt=cnt_c[alphabet[1]+1]; r_sum < n;) { - for (; i < r_sum+r; ++i, --cur_c_cnt) { - uint8_t bwti = bwt_buf[i-r_sum]; - sai = sa_buf[i-r_sum]; - size_type lf = cnt_cc[bwti]; - if (!cur_c_cnt) {// cur_c_cnt==0, if there is no more occurence of the current character - if (cur_c_cnt < sigma) { - cur_c_cnt = cnt_c[(cur_c=alphabet[++cur_c_idx])+1]; - } - } - size_type l=0; - if (i >= cnt_cc[cur_c]) { // if the current lcp entry is not already done - if (lf < i) { - l = lcp[lf] ? lcp[lf]-1 : 0; // l = LCP[LF[i]]-1; l < m+1 - if (bwti == bwti_1) - goto calculated_l; - } - while (text[sai_1+l] == text[sai+l]) { - ++l; - ++comps; - } -// ++comps; -calculated_l: - lcp[i] = l; - } else { // if already done - l = lcp[i]; // load LCP value - } - // begin update rmq_stack - size_type x = l+1; - size_type j = rmq_end; - while (x <= rmq_stack[j]) j-=2; // pop all elements with value >= l - rmq_stack[++j] = i+1; // push position i - rmq_stack[++j] = x; // push value l - rmq_end = j; // update index of the value of the topmost element - if (lf > i) { // if LF[i] > i, we can calculate LCP[LF[i]] in constant time with rmq - // rmq query for lcp-values in the interval I=[prev_occ_in_bwt[BWT[i]]+1..i] - // rmq is linear in the stack size; can also be implemented with binary search on the stack - size_type x_pos = prev_occ_in_bwt[bwti]+2; - size_type j = rmq_end-3; - while (x_pos <= rmq_stack[j]) j-=2; // search smallest value in the interval I - lcp[lf] = rmq_stack[j+3]; - } - prev_occ_in_bwt[bwti] = i; // update previous position information for character BWT[i] - ++cnt_cc[bwti]; // update counter and therefore the LF information - sai_1 = sai; // update SA[i-1] - bwti_1 = bwti; // update BWT[i-1] - if (rmq_end > rmq_limit) { -// std::cout<<"stack is too big (i="<= rmq_stack[jj] - rmq_stack[++new_rmq_end] = rmq_stack[j]; - rmq_stack[++new_rmq_end] = rmq_stack[j+1]; - } - } -// std::cout<<"rmq_end = "<::size_type size_type; @@ -1687,6 +1558,620 @@ bool construct_lcp_go2(tMSS& file_map, const std::string& dir, const std::string return true; } +//! Merges the LCP array \f$ lcp_small\f$ in memory () and the LCP array on harddisk to one LCP array on harddisk +/*! + * \param lcp_small First LCP array which should be merged + * \param finished_filename Path to second LCP array which should be merged + * \param finished_valid Array which values of the second LCP array are valid + * \param out_filename Path where resulting LCP array should be stored + * \param buffer_size Size of the buffer in byte + * \param N length of input + * \param LCP_value current LCP value + * \param lcp_value_offset: Largest LCP value in LCP array, that was written on harddisk + */ +void mergeToHD(int_vector<> &lcp_small, string finished_filename, bit_vector &finished_valid, string out_filename, int_vector<>::size_type buffer_size, unsigned long long N, unsigned long long LCP_value, unsigned long long lcp_value_offset) +{ + typedef int_vector<>::size_type size_type; + // open finshed array + int_vector_file_buffer<> finished_lcp_buffer(finished_filename.c_str(), buffer_size); + + // open output array + uint8_t int_width = bit_magic::l1BP(LCP_value-1)+1; + size_type bit_size = N*int_width; // Size of output file + size_type wb = 0; // Number of bits that were already written + int_vector<> out_buf(buffer_size, 0, int_width); // Outputbuffer + std::ofstream lcp_out_buf( (out_filename).c_str(), std::ios::binary | std::ios::trunc | std::ios::out ); + lcp_out_buf.write((char *) &(bit_size), sizeof(bit_size)); // Write length of vector + lcp_out_buf.write((char *) &(int_width), sizeof(int_width)); // Write int-width of vector + + // Write values into buffer + for(size_type i=0, r_sum=0, calc_idx=0, r=0, finished_lcp_buffer_valid=0; r_sum < N; ) { + // Copy next r values into buffer + for( ; i < r_sum+r; ++i) { + // Load finished_buffer + if(!finished_lcp_buffer_valid) { + finished_lcp_buffer_valid = finished_lcp_buffer.load_next_block(); + } + --finished_lcp_buffer_valid; + + // If values was already calculated + if(finished_valid[i]) { + // Copy value + out_buf[i-r_sum] = finished_lcp_buffer[i-r_sum]; + } else { + // If values was now calculated + if(lcp_small[calc_idx]) { + // Insert value + out_buf[i-r_sum] = lcp_small[calc_idx]+lcp_value_offset; + finished_valid[i] = true; + } else { + // Insert nothing + out_buf[i-r_sum] = 0; + } + ++calc_idx; + } + } + + // Write next r values from buffer to harddisk + if(r>0) { + size_type cur_wb = (r*out_buf.get_int_width()+7)/8; + lcp_out_buf.write((const char*)out_buf.data(), cur_wb ); + wb += cur_wb; + } + + // Count how many values were written and how many values will be written next + r_sum += r; + r = N-r_sum; + if(r>buffer_size) { + r = buffer_size; + } + } + // Close file + if(wb%8) { + lcp_out_buf.write("\0\0\0\0\0\0\0\0", 8-wb%8); + } + lcp_out_buf.close(); + + // Reset + util::set_zero_bits(lcp_small); + return; +} + +bool construct_lcp_bwt_based(tMSS& file_map, const std::string& dir, const std::string& id) +{ + typedef int_vector<>::size_type size_type; + write_R_output("lcp","construct LCP ","begin", 1, 0); + size_type buffer_size=1000000; // Size of the buffer + string outputfilenameschema = dir+"lcp_"+id+"_tmp_"; // Pattern for the temporary LCP arrays + + // create WaveletTree + write_R_output("bwt","load huffman WT ","begin", 0, 0); + int_vector_file_buffer<8> bwt_buf(file_map["bwt"].c_str(), buffer_size); + size_type N = bwt_buf.int_vector_size; // Input size + wt_huff, select_support_dummy, select_support_dummy> wt_bwt(bwt_buf, N); + write_R_output("bwt","load huffman WT ","end", 0, 0); + + // init + write_R_output("lcp","init ","begin", 0, 0); + size_type lcp_value = 0; // current LCP value + size_type lcp_value_offset = 0; // Largest LCP value in LCP array, that was written on harddisk + size_type phase = 0; // Count how often the LCP array was written on disk + + size_type intervals = 0; // Number of intervals which are currently stored + size_type intervals_new = 0; // Number of new intervals + + std::queue q; // Queue for storing the intervals + vector dict(2); // BitVector for storing the intervals + size_type source = 0, target = 1; // Defines which bitree is source and which is target + char last_used = 'q'; + size_type use_queue_and_wt = N/2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree + // else use dictionary and wavelet tree + + + size_type quantity; // quantity of characters in interval + vector pos2char(wt_bwt.sigma); // list of characters in the interval + vector rank_c_i(wt_bwt.sigma); // number of occurrence of character in [0 .. i-1] + vector rank_c_j(wt_bwt.sigma); // number of occurrence of character in [0 .. j-1] + + // Calculate how many bit are for each lcp value available, to limit the memory usage to 20n bit = 2,5n byte, use at moste 8 bit + size_type bb = (N*20-util::get_size_in_bytes(wt_bwt)*8*1.25-5*N)/N; // 20n - size of wavelet tree * 1.25 for rank support - 8n for bit arrays - n for finished array + if(N*20 < util::get_size_in_bytes(wt_bwt)*8*1.25+5*N) { +#ifdef STUDY_INFORMATIONS + std::cout << "Cannot caluclate LCP-Array with less than 2.5n bytes." << std::endl; +#endif + bb = 6; + } + if(bb>8) { + bb = 8; + } + size_type lcp_value_max = (1ULL<)=" << util::get_size_in_bytes(wt_bwt) << std::endl; +#endif + + // init lcp2 + int_vector<> lcp2(N+1, 0, bb); // LCP array + + // init finished + bit_vector finished(N+1, false); // Bitvector which is true, if corresponding LCP value was already calculated + rank_support_v<> ds_rank_support; // Rank support for bit_vector finished + + // create C-array + vector C(256, 0); // C-Array: C[i] = number of occurrences of characters < i in the input + wt_bwt.interval_symbols(0, N, quantity, pos2char, rank_c_i, rank_c_j); + for(size_type i=0; i Queue","begin", 0, lcp_value); + dict[target].resize(1); + + // copy from bitvector to queue + size_type a2 = util::next_bit(dict[source], 0); + size_type b2 = util::next_bit(dict[source], a2+1); + while( b2 < dict[source].size() ) { + q.push((a2-1)>>1); + q.push(b2>>1); + // get next interval + a2 = util::next_bit(dict[source], b2+1); + b2 = util::next_bit(dict[source], a2+1); + } + dict[source].resize(1); + write_R_output("lcp","BitVector -> Queue","end ", 0, lcp_value); + } + if(intervals >= use_queue_and_wt && last_used == 'q') { + write_R_output("lcp","Queue -> BitVector","begin", 0, lcp_value); + size_type bitarray_length = N+1+64-(N+1)%64; // Length of the bitarray + dict[source].resize((bitarray_length<<1)+10); + util::set_zero_bits(dict[source]); + // copy from queue to bitvector + while(!q.empty()) { + dict[source][ (q.front()<<1)+1 ] = 1; q.pop(); + dict[source][ (q.front()<<1) ] = 1; q.pop(); + } + dict[target].resize((bitarray_length<<1)+10); + util::set_zero_bits(dict[target]); + write_R_output("lcp","Queue -> BitVector","end ", 0, lcp_value); + } + + if(intervals < use_queue_and_wt) { + last_used = 'q'; + intervals_new = 0; + while(intervals) { + // get next interval + size_type a = q.front(); q.pop(); + size_type b = q.front(); q.pop(); + --intervals; + + wt_bwt.interval_symbols(a, b, quantity, pos2char, rank_c_i, rank_c_j); + for(size_type i=0; i>1), (b2>>1), quantity, pos2char, rank_c_i, rank_c_j); + for(size_type i=0; i=lcp_value_max) { + write_R_output("lcp","write to file ","begin", lcp_value, 0); + if(phase) { + mergeToHD(lcp2, outputfilenameschema + util::to_string(phase-1), finished, outputfilenameschema + util::to_string(phase), buffer_size, N, lcp_value, lcp_value_offset); + } else { + lcp2.resize(N); + util::store_to_file(lcp2, (outputfilenameschema + util::to_string(phase)).c_str() ); + lcp2.resize(N+1); + util::set_zero_bits(lcp2); + } + write_R_output("lcp","write to file ","end", lcp_value, 0); + + write_R_output("lcp","resize variables ","begin", 0, 0); + // Create rank support + ds_rank_support.init(&finished); + + // Recalculate lcp_value_max and resize lcp2 + lcp_value_offset = lcp_value_max-1; + size_type remaining_lcp_values = finished.size()-ds_rank_support.rank(finished.size()); + + size_type int_width_new = space_in_bit_for_lcp / remaining_lcp_values; + if(int_width_new > bit_magic::l1BP(N-1)+1){ + int_width_new = bit_magic::l1BP(N-1)+1; + } + lcp_value_max = lcp_value_offset + (1ULL<::size_type size_type; + + size_type N; // Input length + size_type buffer_size=1000000; // Size of the buffer + size_type lcp_value = 0; // current LCP value + string filename_lcp_positions=dir+"lcp_"+id+"_tmp"; + + { // Begin of phase 1: Calculate LCP-Positions-Array + write_R_output("bwt","load huffman WT ","begin", 0, 0); + int_vector_file_buffer<8> bwt_buf(file_map["bwt"].c_str(), buffer_size); + N = bwt_buf.int_vector_size; // Input size + wt_huff, select_support_dummy, select_support_dummy> wt_bwt(bwt_buf, N); + write_R_output("bwt","load huffman WT ","end", 0, 0); + + // Declare needed variables + write_R_output("lcp","init ","begin", 0, 0); + + size_type intervals = 0; // Number of intervals which are currently stored + size_type intervals_new = 0; // Number of new intervals + + std::queue q; // Queue for storing the intervals + vector dict(2); // BitVector for storing the intervals + size_type source = 0, target = 1; // Defines which bitree is source and which is target + char last_used = 'q'; + size_type use_queue_and_wt = N/2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree + // else use dictionary and wavelet tree + + size_type quantity; // quantity of characters in interval + vector pos2char(wt_bwt.sigma); // list of characters in the interval + vector rank_c_i(wt_bwt.sigma); // number of occurrence of character in [0 .. i-1] + vector rank_c_j(wt_bwt.sigma); // number of occurrence of character in [0 .. j-1] + + // External storage of LCP-Positions + bool new_lcp_value = 0; + uint8_t int_width = bit_magic::l1BP(2*N+1)+1; + + size_type bit_size = (N+1)*int_width; // Size of output file in bit + size_type wb = 0; // Number of bits already written + std::ofstream lcp_positions((filename_lcp_positions).c_str(), std::ios::binary | std::ios::trunc | std::ios::out); + lcp_positions.write((char *) &(bit_size), sizeof(bit_size)); // Write length of vector + lcp_positions.write((char *) &(int_width), sizeof(int_width)); // Write int-width of vector + + int_vector<> lcp_tmp_buf(buffer_size, 0, int_width); // Create buffer for lcp_tmp + size_type idx_out_buf = 0; + + bit_vector finished(N+1, 0); // Bitvector which is true, if corresponding LCP value was already calculated + + // Create C-array + vector C(256, 0); // C-Array: C[i] = number of occurrences of characters < i in the input + wt_bwt.interval_symbols(0, N, quantity, pos2char, rank_c_i, rank_c_j); + for(size_type i=0; i=lcp_tmp_buf.size()) { + // Write all values from buffer to harddisk + size_type cur_wb = (idx_out_buf*lcp_tmp_buf.get_int_width()+7)/8; + lcp_positions.write((const char*)lcp_tmp_buf.data(), cur_wb); + wb += cur_wb; + idx_out_buf = 0; + } + finished[0] = true; + + // Save first interval + q.push(0); + q.push(N); + intervals = 1; + + // Calulate LCP-Positions + while(intervals) { + if(intervals < use_queue_and_wt && last_used == 'b') { + write_R_output("lcp","BitVector -> Queue","begin", 0, lcp_value); + dict[target].resize(1); + + // Copy from bitvector to queue + size_type a2 = util::next_bit(dict[source], 0); + size_type b2 = util::next_bit(dict[source], a2+1); + while( b2 < dict[source].size() ) { + q.push((a2-1)>>1); + q.push(b2>>1); + // Get next interval + a2 = util::next_bit(dict[source], b2+1); + b2 = util::next_bit(dict[source], a2+1); + } + dict[source].resize(1); + write_R_output("lcp","BitVector -> Queue","end ", 0, lcp_value); + } + if(intervals >= use_queue_and_wt && last_used == 'q') { + write_R_output("lcp","Queue -> BitVector","begin", 0, lcp_value); + size_type bitarray_length = N+1+64-(N+1)%64; // Length of the bitarray + dict[source].resize((bitarray_length<<1)+10); + util::set_zero_bits(dict[source]); + // Copy from queue to bitvector + while(!q.empty()) { + dict[source][ (q.front()<<1)+1 ] = 1; q.pop(); + dict[source][ (q.front()<<1) ] = 1; q.pop(); + } + dict[target].resize((bitarray_length<<1)+10); + util::set_zero_bits(dict[target]); + write_R_output("lcp","Queue -> BitVector","end ", 0, lcp_value); + } + + if(intervals < use_queue_and_wt) { + last_used = 'q'; + intervals_new = 0; + while(intervals) { + // Get next interval + size_type a = q.front(); q.pop(); + size_type b = q.front(); q.pop(); + --intervals; + + wt_bwt.interval_symbols(a, b, quantity, pos2char, rank_c_i, rank_c_j); + for(size_type i=0; i=lcp_tmp_buf.size()) { + // Write all values from buffer to harddisk + size_type cur_wb = (idx_out_buf*lcp_tmp_buf.get_int_width()+7)/8; + lcp_positions.write((const char*)lcp_tmp_buf.data(), cur_wb); + wb += cur_wb; + idx_out_buf = 0; + } + finished[b_new] = true; + + // Save interval + q.push(a_new); + q.push(b_new); + ++intervals_new; + } + } + } + intervals = intervals_new; + } else { + last_used = 'b'; + intervals = 0; + + // get next interval + size_type a2 = util::next_bit(dict[source], 0); + size_type b2 = util::next_bit(dict[source], a2+1); + + while( b2 < dict[source].size() ) { + wt_bwt.interval_symbols(((a2-1)>>1), (b2>>1), quantity, pos2char, rank_c_i, rank_c_j); + for(size_type i=0; i=lcp_tmp_buf.size()) { + // Write all values from buffer to harddisk + size_type cur_wb = (idx_out_buf*lcp_tmp_buf.get_int_width()+7)/8; + lcp_positions.write((const char*)lcp_tmp_buf.data(), cur_wb); + wb += cur_wb; + idx_out_buf = 0; + } + finished[b_new] = true; + + // Save interval + dict[target][ (a_new<<1)+1] = 1; + dict[target][ (b_new<<1) ] = 1; + ++intervals; + } + } + // get next interval + a2 = util::next_bit(dict[source], b2+1); + b2 = util::next_bit(dict[source], a2+1); + } + // switch source and target + source = 1-source; + target = 1-target; + util::set_zero_bits(dict[target]); + } + ++lcp_value; + new_lcp_value = true; + } + write_R_output("lcp","calc lcp values ","end ", 0, 0); + + // Write remaining values from buffer to harddisk + size_type cur_wb = (idx_out_buf*lcp_tmp_buf.get_int_width()+7)/8; + lcp_positions.write((const char*)lcp_tmp_buf.data(), cur_wb); + wb += cur_wb; + if(wb%8) { + lcp_positions.write("\0\0\0\0\0\0\0\0", 8-wb%8); + } + lcp_positions.close(); + } // End of phase 1 + + { // Begin phase 2: Calculate LCP-Array from the Positions-Array + write_R_output("lcp","reordering ","begin", 0, 0); + + int_vector_file_buffer<> lcp_positions((filename_lcp_positions).c_str(), buffer_size); + + uint8_t int_width = bit_magic::l1BP(lcp_value+1)+1; // How many bits are needed for one lcp_value? + size_type number_of_values = ((8*N)/int_width) & (~(0x7ULL)); // Determine number of lcp-values that can fit in n bytes = 8n bit and is a multiple of 8 + int_vector<> out_buf(number_of_values, 0, int_width); // Create Output Buffer + + // Create lcp_array + string output_filename = dir+"lcp_"+id; + size_type bit_size = N*int_width; // Length of LCP-array in bit + std::ofstream lcp_array(output_filename.c_str(), std::ios::binary | std::ios::trunc | std::ios::out ); + lcp_array.write((char *) &(bit_size), sizeof(bit_size)); // Write length of vector + lcp_array.write((char *) &(int_width), sizeof(int_width)); // Write int-width of vector + + size_type wb = 0; + for(size_type position_begin=0, position_end = number_of_values; position_begin0; position_begin=position_end, position_end+=number_of_values) { +#ifdef STUDY_INFORMATIONS +std::cout << "# fill lcp_values with " << position_begin << " <= position <" << position_end << ", each lcp-value has " << (int)int_width << " bit, lcp_value_max=" << lcp_value << std::endl; +#endif + + size_type lcp_value = 0, values=0; + lcp_positions.reset(); + for(size_type i=0, r_sum=0, r=0; r_sum < N; ) { + for( ; i < r_sum+r; ++i) { + size_type position = lcp_positions[i-r_sum]; + if(position>N) { + position = position-N; + ++lcp_value; + } + if(position_begin <= position and position < position_end) { + out_buf[position-position_begin] = lcp_value; + ++values; + } + } + r_sum += r; + r = lcp_positions.load_next_block(); + } + // Write next values from buffer to harddisk + if(values>0) { + size_type cur_wb = (values*out_buf.get_int_width()+7)/8; + lcp_array.write((const char*)out_buf.data(), cur_wb); + wb += cur_wb; + } + } + // Close file + if(wb%8) { + lcp_array.write("\0\0\0\0\0\0\0\0", 8-wb%8); + } + lcp_array.close(); + file_map["lcp"] = output_filename; + write_R_output("lcp","reordering ","end ", 0, 0); + } // End of phase 2 + remove( filename_lcp_positions.c_str() ); + write_R_output("lcp","construct LCP ","begin", 1, 0); + return true; +} void check_lcp(std::string lcpI, std::string lcpII, std::string id) { diff --git a/test/LcpConstructTest.cpp b/test/LcpConstructTest.cpp new file mode 100644 index 0000000..b62a064 --- /dev/null +++ b/test/LcpConstructTest.cpp @@ -0,0 +1,172 @@ +#include +#include +#include +#include "sdsl/config.hpp" // for CMAKE_SOURCE_DIR +#include "gtest/gtest.h" +#include +#include + +namespace +{ + +typedef sdsl::int_vector<>::size_type size_type; +typedef std::map tMSFP; + + +// The fixture for testing class int_vector. +class LcpConstructTest : public ::testing::Test +{ + protected: + + LcpConstructTest() { + // You can do set-up work for each test here. + } + + virtual ~LcpConstructTest() { + // You can do clean-up work that doesn't throw exceptions here. + } + + // If the constructor and destructor are not enough for setting up + // and cleaning up each test, you can define the following methods: + virtual void SetUp() { + // Code here will be called immediately after the constructor (right + // before each test). + string test_cases_dir = string(SDSL_XSTR(CMAKE_SOURCE_DIR)) + "/test/test_cases"; + test_cases.push_back(test_cases_dir + "/crafted/100a.txt"); +// test_cases.push_back(test_cases_dir + "/crafted/abc_abc_abc.txt"); +// test_cases.push_back(test_cases_dir + "/crafted/abc_abc_abc2.txt"); //TODO check the problem with this one + test_cases.push_back(test_cases_dir + "/crafted/empty.txt"); + test_cases.push_back(test_cases_dir + "/crafted/example01.txt"); + test_cases.push_back(test_cases_dir + "/small/faust.txt"); + test_cases.push_back(test_cases_dir + "/small/zarathustra.txt"); + +// lcp_function["construct_lcp_semi_extern_PHI"] = &sdsl::construct_lcp_semi_extern_PHI; // TODO: Handle empty test case +// lcp_function["construct_lcp_PHI"] = &sdsl::construct_lcp_PHI; TODO: Handle default argument + lcp_function["construct_lcp_simple2_9n"] = &sdsl::construct_lcp_simple2_9n; + lcp_function["construct_lcp_go"] = &sdsl::construct_lcp_go; + lcp_function["construct_lcp_goPHI"] = &sdsl::construct_lcp_goPHI; + lcp_function["construct_lcp_go2"] = &sdsl::construct_lcp_go2; + lcp_function["construct_lcp_bwt_based"] = &sdsl::construct_lcp_bwt_based; + lcp_function["construct_lcp_bwt_based2"] = &sdsl::construct_lcp_bwt_based2; + + + string dir = ""; + sdsl::tMSS tmp_file_map; + for (size_t i=0; i< this->test_cases.size(); ++i) { + string id = "tc_"+sdsl::util::to_string(i); + // Prepare Input + { + sdsl::int_vector_file_buffer<8> text_buf; + text_buf.load_from_plain(this->test_cases[i].c_str()); + text_buf.reset(); + unsigned char *text = NULL; + ASSERT_EQ(true, sdsl::util::load_from_int_vector_buffer(text, text_buf)); + size_type n = text_buf.int_vector_size; + ASSERT_EQ(true, sdsl::util::store_to_file(sdsl::char_array_serialize_wrapper<>((unsigned char*)text,n+1), (dir+"text_"+id).c_str() )); + tmp_file_map["text"] = (dir+"text_"+id).c_str(); + delete [] text; + } + + // Create Suffix-Array + { + sdsl::int_vector_file_buffer<8> text_buf( tmp_file_map["text"].c_str() ); + text_buf.reset(); + size_type n = text_buf.int_vector_size; + + unsigned char *text = NULL; + ASSERT_EQ(true, sdsl::util::load_from_int_vector_buffer(text, text_buf)); + sdsl::int_vector<> sa = sdsl::int_vector<>(n, 0, sdsl::bit_magic::l1BP(n+1)+1); + sdsl::algorithm::calculate_sa(text,n, sa); + assert(sa.size() == n); + ASSERT_EQ(true, sdsl::util::store_to_file(sa, (dir+"sa_"+id).c_str() )); + tmp_file_map["sa"] = dir+"sa_"+id; + { + sa.resize(0); + sdsl::int_vector<> temp; + temp.swap(sa); + } + } + + // Construct BWT + { + ASSERT_EQ(true, sdsl::construct_bwt(tmp_file_map, "", id)); + } + + // Construct LCP-Array + { + ASSERT_EQ(true, sdsl::construct_lcp_kasai(tmp_file_map, dir, "org_" + id)); + } + + // Save needed data structures and delete not needed data structes + file_map["text_"+id] = tmp_file_map["text"]; + tmp_file_map.erase("text"); + file_map["sa_"+id] = tmp_file_map["sa"]; + tmp_file_map.erase("sa"); + file_map["bwt_"+id] = tmp_file_map["bwt"]; + tmp_file_map.erase("bwt"); + file_map["lcp_"+id] = tmp_file_map["lcp"]; + tmp_file_map.erase("lcp"); + sdsl::util::delete_all_files(tmp_file_map); + } + } + + virtual void TearDown() { + // Code here will be called immediately after each test (right + // before the destructor). + sdsl::util::delete_all_files(file_map); + } + + // Objects declared here can be used by all tests in the test case for Foo. + std::vector test_cases; + sdsl::tMSS file_map; + tMSFP lcp_function; +}; + +TEST_F(LcpConstructTest, construct_lcp) +{ + string dir = ""; + for (tMSFP::const_iterator it = this->lcp_function.begin(), end = this->lcp_function.end(); it != end; ++it) { + sdsl::tMSS file_map; + for (size_t i=0; i< this->test_cases.size(); ++i) { +// std::cout << (it->first) << " on test file " << this->test_cases[i] << std::endl; + + // Prepare LCP-Array construction + string id = "tc_"+sdsl::util::to_string(i); + file_map["text"] = this->file_map["text_"+id]; + file_map["sa"] = this->file_map["sa_"+id]; + file_map["bwt"] = this->file_map["bwt_"+id]; + + // Construct LCP-Array + ASSERT_EQ(true, (it->second)(file_map, dir, id)) + << (it->first) << " on test file " << this->test_cases[i] << "was not successfull."; + + // Check LCP-Array + sdsl::int_vector<> lcp1, lcp2; + ASSERT_EQ(true, sdsl::util::load_from_file(lcp1, this->file_map["lcp_"+id].c_str())) + << (it->first) << " on test file " << this->test_cases[i] << " could not load reference lcp array"; + ASSERT_EQ(true, sdsl::util::load_from_file(lcp2, file_map["lcp"].c_str())) + << (it->first) << " on test file " << this->test_cases[i] << " could not load created lcp array"; + ASSERT_EQ(lcp1.size(), lcp2.size()) + << (it->first) << " on test file " << this->test_cases[i] << " size differ"; + for(size_type i=0; ifirst) << " on test file " << this->test_cases[i] << " value differ:" + << " lcp1[" << i << "]=" << lcp1[i] << "!=" << lcp2[i] << "=lcp2["<< i << "]"; + } + + // Clean up everything + file_map.erase("text"); + file_map.erase("sa"); + file_map.erase("bwt"); + sdsl::util::delete_all_files(file_map); + } + } +} + +} // namespace + +int main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}