SDSL 3.0.3
Succinct Data Structure Library
Loading...
Searching...
No Matches
construct_lcp.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef INCLUDED_SDSL_CONSTRUCT_LCP
9#define INCLUDED_SDSL_CONSTRUCT_LCP
10
11#include <algorithm>
12#include <assert.h>
13#include <iostream>
14#include <memory>
15#include <queue>
16#include <stdint.h>
17#include <string>
18#include <utility>
19#include <vector>
20
21#include <sdsl/bits.hpp>
22#include <sdsl/config.hpp>
25#include <sdsl/int_vector.hpp>
27#include <sdsl/io.hpp>
31#include <sdsl/util.hpp>
32#include <sdsl/wt_algorithm.hpp>
33#include <sdsl/wt_huff.hpp>
34
35//#define STUDY_INFORMATIONS
36
37namespace sdsl
38{
39
40// Forward declaration of construction method
41// template <class t_index>
42// void construct(t_index& idx, const std::string& file);
43
45
62template <uint8_t t_width>
64{
65 static_assert(t_width == 0 or t_width == 8,
66 "construct_lcp_kasai: width must be `0` for integer alphabet and `8` for byte alphabet");
67 int_vector<> lcp;
69 construct_isa(config);
70 {
73 {
74 return;
75 }
77 std::ios::in,
78 1000000); // init isa file_buffer
79 int_vector<> sa;
80 if (!load_from_cache(sa, conf::KEY_SA, config))
81 {
82 return;
83 }
84 // use Kasai algorithm to compute the lcp values
85 for (size_type i = 0, j = 0, sa_1 = 0, l = 0, n = isa_buf.size(); i < n; ++i)
86 {
87 sa_1 = isa_buf[i]; // = isa[i]
88 if (sa_1)
89 {
90 j = sa[sa_1 - 1];
91 if (l)
92 --l;
93 assert(i != j);
94 while (text[i + l] == text[j + l])
95 { // i+l < n and j+l < n are not necessary, since text[n]=0 and text[i]!=0 (i<n) and i!=j
96 ++l;
97 }
98 sa[sa_1 - 1] = l; // overwrite sa array with lcp values
99 }
100 else
101 {
102 l = 0;
103 sa[n - 1] = 0;
104 }
105 }
106
107 for (size_type i = sa.size(); i > 1; --i)
108 {
109 sa[i - 1] = sa[i - 2];
110 }
111 sa[0] = 0;
112 lcp = std::move(sa);
113 }
114 store_to_cache(lcp, conf::KEY_LCP, config);
115}
116
118
133template <uint8_t t_width>
135{
136 static_assert(t_width == 0 or t_width == 8,
137 "construct_lcp_PHI: width must be `0` for integer alphabet and `8` for byte alphabet");
139 typedef int_vector<t_width> text_type;
142 size_type n = sa_buf.size();
143
144 assert(n > 0);
145 if (1 == n)
146 { // Handle special case: Input only the sentinel character.
147 int_vector<> lcp(1, 0);
148 store_to_cache(lcp, conf::KEY_LCP, config);
149 return;
150 }
151
152 // (1) Calculate PHI (stored in array plcp)
153 int_vector<> plcp(n, 0, sa_buf.width());
154 for (size_type i = 0, sai_1 = 0; i < n; ++i)
155 {
156 size_type sai = sa_buf[i];
157 plcp[sai] = sai_1;
158 sai_1 = sai;
159 }
160
161 // (2) Load text from disk
162 text_type text;
163 load_from_cache(text, KEY_TEXT, config);
164
165 // (3) Calculate permuted LCP array (text order), called PLCP
166 size_type max_l = 0;
167 for (size_type i = 0, l = 0; i < n - 1; ++i)
168 {
169 size_type phii = plcp[i];
170 while (text[i + l] == text[phii + l])
171 {
172 ++l;
173 }
174 plcp[i] = l;
175 if (l)
176 {
177 max_l = std::max(max_l, l);
178 --l;
179 }
180 }
181 util::clear(text);
182 uint8_t lcp_width = bits::hi(max_l) + 1;
183
184 // (4) Transform PLCP into LCP
185 std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
186 size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
187 int_vector_buffer<> lcp_buf(lcp_file, std::ios::out, buffer_size, lcp_width); // open buffer for lcp
188 lcp_buf[0] = 0;
189 sa_buf.buffersize(buffer_size);
190 for (size_type i = 1; i < n; ++i)
191 {
192 size_type sai = sa_buf[i];
193 lcp_buf[i] = plcp[sai];
194 }
195 lcp_buf.close();
197}
198
200
217{
220 size_type n = sa_buf.size();
221 if (1 == n)
222 {
223 int_vector<> lcp(1, 0);
224 store_to_cache(lcp, conf::KEY_LCP, config);
225 return;
226 }
227 const uint8_t log_q = 6; // => q=64
228 const uint32_t q = 1 << log_q;
229 const uint64_t modq = bits::lo_set[log_q];
230
231 // n-1 is the maximum entry in SA
232 int_vector<64> plcp((n - 1 + q) >> log_q);
233
234 for (size_type i = 0, sai_1 = 0; i < n; ++i)
235 { // we can start at i=0. if SA[i]%q==0
236 // we set PHI[(SA[i]=n-1)%q]=0, since T[0]!=T[n-1]
237 size_type sai = sa_buf[i];
238 if ((sai & modq) == 0)
239 {
240 if ((sai >> log_q) >= plcp.size())
241 {
242 // std::cerr<<"sai="<<sai<<" log_q="<<log_q<<" sai>>log_q="<<(sai>>log_q)<<"
243 // "<<sai_1<<std::endl; std::cerr<<"n="<<n<<" "<<" plcp.size()="<<plcp.size();
244 }
245 plcp[sai >> log_q] = sai_1;
246 }
247 sai_1 = sai;
248 }
249
250 int_vector<8> text;
251 load_from_cache(text, conf::KEY_TEXT, config);
252
253 for (size_type i = 0, j, k, l = 0; i < plcp.size(); ++i)
254 {
255 j = i << log_q; // j=i*q
256 k = plcp[i];
257 while (text[j + l] == text[k + l])
258 ++l;
259 plcp[i] = l;
260 if (l >= q)
261 {
262 l -= q;
263 }
264 else
265 {
266 l = 0;
267 }
268 }
269
270 size_type buffer_size = 4000000; // buffer_size is a multiple of 8!
271 sa_buf.buffersize(buffer_size);
273 std::ios::out,
274 buffer_size,
275 sa_buf.width()); // open buffer for plcp
276
277 for (size_type i = 0, sai_1 = 0, l = 0, sai = 0, iq = 0; i < n; ++i)
278 {
279 /*size_type*/ sai = sa_buf[i];
280 // std::cerr<<"i="<<i<<" sai="<<sai<<std::endl;
281 if ((sai & modq) == 0)
282 { // we have already worked the value out ;)
283 lcp_out_buf[i] = l = plcp[sai >> log_q];
284 }
285 else
286 {
287 /*size_type*/ iq = sai & bits::lo_unset[log_q];
288 l = plcp[sai >> log_q];
289 if (l > (sai - iq))
290 l -= (sai - iq);
291 else
292 l = 0;
293 while (text[sai + l] == text[sai_1 + l])
294 ++l;
295 lcp_out_buf[i] = l;
296 }
297#ifdef CHECK_LCP
298 size_type j = 0;
299 for (j = 0; j < l; ++j)
300 {
301 if (text[sai + j] != text[sai_1 + j])
302 {
303 std::cout << "lcp[" << i << "]=" << l << " is two big! " << j << " is right!"
304 << " sai=" << sai << std::endl;
305 if ((sai & modq) != 0)
306 std::cout << " plcp[sai>>log_q]=" << plcp[sai >> log_q] << " sai-iq=" << sai - iq << " sai=" << sai
307 << " sai-iq=" << sai - iq << std::endl;
308 break;
309 }
310 }
311#endif
312 sai_1 = sai;
313 }
314 lcp_out_buf.close();
316 return;
317}
318
320
338inline void construct_lcp_go(cache_config & config)
339{
341#ifdef STUDY_INFORMATIONS
342 size_type racs = 0; // random accesses to the text
343 size_type matches = 0;
344 size_type comps2 = 0; // comparisons the second phase
345#endif
346 int_vector<8> text;
347 load_from_cache(text, conf::KEY_TEXT, config);
348 int_vector_buffer<> sa_buf(cache_file_name(conf::KEY_SA, config)); // initialize buffer for suffix array
349 const size_type n = sa_buf.size();
350 const size_type m = 254; // LCP[i] == m+1 corresp. to LCP[i]>= m+1; LCP[i] <= m corresp. to LCP[i] was calculated
351
352 if (1 == n)
353 {
354 int_vector<> lcp(1, 0);
355 store_to_cache(lcp, conf::KEY_LCP, config);
356 return;
357 }
358
359 size_type cnt_c[257] = {0}; // counter for each character in the text
360 size_type cnt_cc[257] = {0}; // prefix sum of the counter cnt_c
361 size_type cnt_cc2[257] = {0}; //
362 size_type omitted_c[257] = {0}; // counts the omitted occurrences for the second phase
363 size_type prev_occ_in_bwt[256] = {0}; // position of the previous occurrence of each character c in the bwt
364 for (size_type i = 0; i < 256; ++i)
365 prev_occ_in_bwt[i] = (size_type)-1; // initialize the array with -1
366 unsigned char alphabet[257] = {0};
367 uint8_t sigma = 0;
368
369 tLI m_list[2][256];
370 size_type m_char_count[2] = {0};
371 uint8_t m_chars[2][256] = {{0}, {0}};
372
373 size_type nn = 0; // n' for phase 2
374 // phase 1: calculate lcp_sml; memory consumption: 2n bytes (lcp_sml=n bytes, text=n bytes)
375 {
376
377 int_vector<8> lcp_sml(n,
378 0); // initialize array for small values of first phase; note lcp[0]=0
379
380 for (size_type i = 0; i < n; ++i)
381 { // initialize cnt_c
382 ++cnt_c[text[i] + 1];
383 }
384 for (int i = 1; i < 257; ++i)
385 { // calculate sigma and initailize cnt_cc
386 if (cnt_c[i] > 0)
387 {
388 alphabet[sigma++] = (unsigned char)(i - 1);
389 }
390 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
391 }
392 alphabet[sigma] = '\0';
393 {
394 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // initialize buffer of bwt
395 size_type sai_1 = sa_buf[0]; // store value of sa[i-1]
396 uint8_t bwti_1 = bwt_buf[0]; // store value of BWT[i-1]
397 lcp_sml[cnt_cc[bwti_1]++] = 0; // lcp_sml[ LF[0] ] = 0
398 prev_occ_in_bwt[bwti_1] = 0; // init previous occurence of character BWT[0]
399 ++omitted_c[alphabet[0]]; //
400
401 int_vector<64> rmq_stack(2 * (m + 10)); // initialize stack for m+10 elements representing (position, value)
402 rmq_stack[0] = 0;
403 rmq_stack[1] = 0; // first element (-1, -1)
404 rmq_stack[2] = 1;
405 rmq_stack[3] = 0; // second element (0, -1)
406 size_type rmq_end = 3; // index of the value of the topmost element
407
408 const size_type m_mod2 = m % 2;
409 uint8_t cur_c = alphabet[1];
410 size_type big_val = 0;
411 for (size_type i = 1, sai, cur_c_idx = 1, cur_c_cnt = cnt_c[alphabet[1] + 1]; i < n; ++i, --cur_c_cnt)
412 {
413 uint8_t bwti = bwt_buf[i];
414 sai = sa_buf[i];
415 size_type lf = cnt_cc[bwti];
416 if (!cur_c_cnt)
417 { // cur_c_cnt==0, if there is no more occurence of the current character
418 if (cur_c_cnt < sigma)
419 {
420 cur_c_cnt = cnt_c[(cur_c = alphabet[++cur_c_idx]) + 1];
421 }
422 }
423 size_type l = 0;
424 if (i >= cnt_cc[cur_c])
425 { // if the current lcp entry is not already done TODO: schleife von i bis cnt_cc[cur_c]
426 if (bwti == bwti_1 and lf < i)
427 { // BWT[i]==BWT[i-1]
428 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0; // l = LCP[LF[i]]-1; l < m+1
429 if (l == m)
430 { // if LCP[LF[i]] == m+1; otherwise LCP[LF[i]] < m+1 the result is correct
431 l += (text[sai_1 + m] == text[sai + m]);
432#ifdef STUDY_INFORMATIONS
433 if ((sai_1 ^ sai) >> 6) // if i and phii are in the same cache line
434 ++racs;
435#endif
436 }
437 lcp_sml[i] = l;
438 }
439 else
440 { // BWT[i] != BWT[i-1] or LF[i] > i
441 if (lf < i)
442 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
443#ifdef STUDY_INFORMATIONS
444 if ((sai_1 ^ sai) >> 6) // if i and phii are in the same cache line
445 ++racs;
446#endif
447 while (text[sai_1 + l] == text[sai + l] and l < m + 1)
448 {
449 ++l;
450#ifdef STUDY_INFORMATIONS
451 ++matches;
452#endif
453 }
454 lcp_sml[i] = l;
455 }
456 }
457 else
458 { // if already done
459 l = lcp_sml[i]; // load LCP value
460 }
461 if (l > m)
462 {
463 ++big_val;
464 if (i > 10000 and i < 10500 and big_val > 3000)
465 { // if most of the values are big: switch to PHI algorithm
466 util::clear(text);
467 util::clear(lcp_sml);
468 construct_lcp_PHI<8>(config);
469 return;
470 }
471 }
472 // invariant: l <= m+1
473 // begin update rmq_stack
474 size_type x = l + 1;
475 size_type j = rmq_end;
476 while (x <= rmq_stack[j])
477 j -= 2; // pop all elements with value >= l
478 rmq_stack[++j] = i + 1; // push position i
479 rmq_stack[++j] = x; // push value l
480 rmq_end = j; // update index of the value of the topmost element
481 if (lf > i)
482 { // if LF[i] > i, we can calculate LCP[LF[i]] in constant time with rmq
483 // rmq query for lcp-values in the interval I=[prev_occ_in_bwt[BWT[i]]+1..i]
484 // rmq is linear in the stack size; can also be implemented with binary search on the stack
485 size_type x_pos = prev_occ_in_bwt[bwti] + 2;
486 j = rmq_end - 3;
487 while (x_pos <= rmq_stack[j])
488 j -= 2; // search smallest value in the interval I
489 lcp_sml[lf] =
490 rmq_stack[j + 3] - (rmq_stack[j + 3] == m + 2); // if lcp-value equals m+1, we subtract 1
491 }
492 if (l >= m)
493 {
494 if (l == m)
495 push_front_m_index(nn, cur_c, m_list[m_mod2], m_chars[m_mod2], m_char_count[m_mod2]);
496 ++nn;
497 }
498 else
499 ++omitted_c[cur_c];
500
501 prev_occ_in_bwt[bwti] = i; // update previous position information for character BWT[i]
502 ++cnt_cc[bwti]; // update counter and therefore the LF information
503 sai_1 = sai; // update SA[i-1]
504 bwti_1 = bwti; // update BWT[i-1]
505 }
506 }
507 util::clear(text);
508
509 if (n > 1000 and nn > 5 * (n / 6))
510 { // if we would occupy more space than the PHI algorithm => switch to PHI algorithm
511 util::clear(lcp_sml);
512 construct_lcp_PHI<8>(config);
513 return;
514 }
515 store_to_cache(lcp_sml, "lcp_sml", config);
516 }
517#ifdef STUDY_INFORMATIONS
518 std::cout << "# n=" << n << " nn=" << nn << " nn/n=" << ((double)nn) / n << std::endl;
519#endif
520
521 // phase 2: calculate lcp_big
522 {
523 // std::cout<<"# begin calculating LF' values"<<std::endl;
524 int_vector<> lcp_big(nn,
525 0,
526 bits::hi(n - 1) + 1); // lcp_big first contains adapted LF values and finally the big LCP
527 // values
528 {
529 // initialize lcp_big with adapted LF values
530 bit_vector todo(n, 0); // bit_vector todo indicates which values are >= m in lcp_sml
531 {
532 // initialize bit_vector todo
533 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
534 for (size_type i = 0; i < n; ++i)
535 {
536 if (lcp_sml_buf[i] >= m)
537 {
538 todo[i] = 1;
539 }
540 }
541 }
542
543 cnt_cc2[0] = cnt_cc[0] = 0;
544 for (size_type i = 1, omitted_sum = 0; i < 257; ++i)
545 { // initialize cnt_cc and cnt_cc2
546 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
547 omitted_sum += omitted_c[i - 1];
548 cnt_cc2[i] = cnt_cc[i] - omitted_sum;
549 }
550
551 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
552 for (size_type i = 0, i2 = 0; i < n; ++i)
553 {
554 uint8_t b = bwt_buf[i]; // store BWT[i]
555 size_type lf_i = cnt_cc[b]; // LF[i]
556 if (todo[i])
557 { // LCP[i] is a big value
558 if (todo[lf_i])
559 { // LCP[LF[i]] is a big entry
560 lcp_big[i2] = cnt_cc2[b]; // LF'[i]
561 } /*else{
562 * lcp_big[i2] = 0;
563 }*/
564 ++i2;
565 }
566 if (todo[lf_i])
567 { // LCP[LF[i]] is a big entry
568 ++cnt_cc2[b]; // increment counter for adapted LF
569 }
570 ++cnt_cc[b]; // increment counter for LF
571 }
572 }
573
574 // std::cout<<"# begin initializing bwt2, shift_bwt2, run2"<<std::endl;
575 int_vector<8> bwt2(nn),
576 shift_bwt2(nn); // BWT of big LCP values, and shifted BWT of
577 // big LCP values
578 bit_vector run2(nn + 1); // indicates for each entry i, if i and i-1 are both big LCP values
579 run2[nn] = 0; // index nn is not a big LCP value
580 {
581 // initialize bwt2, shift_bwt2, adj2
582 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
583 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
584 uint8_t b_1 = '\0'; // BWT[i-1]
585 bool is_run = false;
586 for (size_type i = 0, i2 = 0; i < n; ++i)
587 {
588 uint8_t b = bwt_buf[i];
589 if (lcp_sml_buf[i] >= m)
590 {
591 bwt2[i2] = b;
592 shift_bwt2[i2] = b_1;
593 run2[i2] = is_run;
594 is_run = true;
595 ++i2;
596 }
597 else
598 {
599 is_run = false;
600 }
601 b_1 = b;
602 }
603 }
604
605 bit_vector todo2(nn + 1, 1); // init all values with 1, except
606 todo2[nn] = 0; // the last one! (handels case "i < nn")
607
608 // std::cout<<"# begin calculating m-indices"<<std::endl;
609 {
610 // calculate m-indices, (m+1)-indices,... until we are done
611 size_type m2 = m;
612 size_type char_ex[256];
613 for (size_type i = 0; i < 256; ++i)
614 char_ex[i] = nn;
615 size_type char_occ = 0;
616 size_type m_mod2 = m2 % 2, mm1_mod2 = (m2 + 1) % 2;
617 while (m_char_count[m_mod2] > 0)
618 { // while there are m-indices, calculate (m+1)-indices and write m-indices
619 // For all values LCP[i] >= m2 it follows that todo2[i] == 1
620 // list m_list[mm1_mod2][b] is sorted in decreasing order
621 ++m2;
622 mm1_mod2 = (m2 + 1) % 2, m_mod2 = m2 % 2;
623 m_char_count[m_mod2] = 0;
624
625 std::sort(m_chars[mm1_mod2],
626 m_chars[mm1_mod2] + m_char_count[mm1_mod2]); // TODO: ersetzen?
627
628 for (size_type mc = 0; mc < m_char_count[mm1_mod2]; ++mc)
629 { // for every character
630 tLI & mm1_mc_list = m_list[mm1_mod2][m_chars[mm1_mod2][m_char_count[mm1_mod2] - 1 - mc]];
631 // size_type old_i = nn;
632 while (!mm1_mc_list.empty())
633 {
634 size_type i = mm1_mc_list.front(); // i in [0..n-1]
635 mm1_mc_list.pop_front();
636 // For all values LCP[i] >= m-1 it follows that todo2[i] == 1
637 for (size_type k = i; todo2[k]; --k)
638 {
639#ifdef STUDY_INFORMATIONS
640 ++comps2;
641#endif
642 uint8_t b = shift_bwt2[k];
643 if (char_ex[b] != i)
644 {
645 char_ex[b] = i;
646 ++char_occ;
647 }
648 if (!run2[k])
649 break;
650 }
651 for (size_type k = i; todo2[k] and char_occ; ++k)
652 {
653#ifdef STUDY_INFORMATIONS
654 ++comps2;
655#endif
656 uint8_t b = bwt2[k];
657 if (char_ex[b] == i)
658 {
659 size_type p = lcp_big[k];
660 push_back_m_index(p, b, m_list[m_mod2], m_chars[m_mod2], m_char_count[m_mod2]);
661 char_ex[b] = nn;
662 --char_occ;
663 }
664 if (!run2[k + 1])
665 break;
666 }
667 lcp_big[i] = m2 - 1;
668 todo2[i] = 0;
669 // old_i = i;
670 }
671 }
672 }
673 }
674
675 store_to_cache(lcp_big, "lcp_big", config);
676 } // end phase 2
677
678 // std::cout<<"# merge lcp_sml and lcp_big"<<std::endl;
679 // phase 3: merge lcp_sml and lcp_big and save to disk
680 {
681 const size_type buffer_size = 1000000; // buffer_size has to be a multiple of 8!
682 int_vector_buffer<> lcp_big_buf(cache_file_name("lcp_big",
683 config)); // file buffer containing the big LCP values
684 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config),
685 std::ios::in,
686 buffer_size); // file buffer containing the small LCP values
688 std::ios::out,
689 buffer_size,
690 lcp_big_buf.width()); // buffer for the resulting LCP array
691 for (size_type i = 0, i2 = 0; i < n; ++i)
692 {
693 size_type l = lcp_sml_buf[i];
694 if (l >= m)
695 { // if l >= m it is stored in lcp_big
696 l = lcp_big_buf[i2];
697 ++i2;
698 }
699 lcp_buf[i] = l;
700 }
701 lcp_buf.close();
702 }
704
705#ifdef STUDY_INFORMATIONS
706 std::cout << "# racs: " << racs << std::endl;
707 std::cout << "# matches: " << matches << std::endl;
708 std::cout << "# comps2: " << comps2 << std::endl;
709#endif
710 return;
711}
712
714
733{
735 int_vector<8> text;
736 load_from_cache(text, conf::KEY_TEXT, config); // load text from file system
737 int_vector_buffer<> sa_buf(cache_file_name(conf::KEY_SA, config)); // initialize buffer for suffix array
738 const size_type n = sa_buf.size();
739 const size_type m = 254; // LCP[i] == m+1 corresp. to LCP[i]>= m+1; LCP[i] <= m corresp. to LCP[i] was calculated
740
741 if (1 == n)
742 {
743 int_vector<> lcp(1, 0);
744 store_to_cache(lcp, conf::KEY_LCP, config);
745 return;
746 }
747
748 size_type cnt_c[257] = {0}; // counter for each character in the text
749 size_type cnt_cc[257] = {0}; // prefix sum of the counter cnt_c
750 size_type omitted_c[257] = {0}; // counts the omitted occurrences for the second phase
751 size_type prev_occ_in_bwt[256] = {0}; // position of the previous occurrence of each character c in the bwt
752 for (size_type i = 0; i < 256; ++i)
753 prev_occ_in_bwt[i] = (size_type)-1; // initialize the array with -1
754 unsigned char alphabet[257] = {0};
755 uint8_t sigma = 0;
756
757 size_type nn = 0; // n' for phase 2
758 // phase 1: calculate lcp_sml; memory consumption: 2n bytes (lcp_sml=n bytes, text=n bytes)
759 {
760
761 int_vector<8> lcp_sml(n,
762 0); // initialize array for small values of first phase; note lcp[0]=0
763
764 for (size_type i = 0; i < n; ++i)
765 { // initialize cnt_c
766 ++cnt_c[text[i] + 1];
767 }
768 for (int i = 1; i < 257; ++i)
769 { // calculate sigma and initailize cnt_cc
770 if (cnt_c[i] > 0)
771 {
772 alphabet[sigma++] = (unsigned char)(i - 1);
773 }
774 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
775 }
776 alphabet[sigma] = '\0';
777 {
778 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // initialize buffer of bwt
779 size_type sai_1 = sa_buf[0]; // store value of sa[i-1]
780 uint8_t bwti_1 = bwt_buf[0]; // store value of BWT[i-1]
781 lcp_sml[cnt_cc[bwti_1]++] = 0; // lcp_sml[ LF[0] ] = 0
782 prev_occ_in_bwt[bwti_1] = 0; // init previous occurence of character BWT[0]
783 ++omitted_c[alphabet[0]]; //
784
785 int_vector<64> rmq_stack(2 * (m + 10)); // initialize stack for m+10 elements representing (position, value)
786 rmq_stack[0] = 0;
787 rmq_stack[1] = 0; // first element (-1, -1)
788 rmq_stack[2] = 1;
789 rmq_stack[3] = 0; // second element (0, -1)
790 size_type rmq_end = 3; // index of the value of the topmost element
791
792 uint8_t cur_c = alphabet[1];
793 for (size_type i = 1, sai, cur_c_idx = 1, cur_c_cnt = cnt_c[alphabet[1] + 1]; i < n; ++i, --cur_c_cnt)
794 {
795 uint8_t bwti = bwt_buf[i];
796 sai = sa_buf[i];
797 size_type lf = cnt_cc[bwti];
798 if (!cur_c_cnt)
799 { // cur_c_cnt==0, if there is no more occurence of the current character
800 if (cur_c_cnt < sigma)
801 {
802 cur_c_cnt = cnt_c[(cur_c = alphabet[++cur_c_idx]) + 1];
803 }
804 }
805 size_type l = 0;
806 if (i >= cnt_cc[cur_c])
807 { // if the current lcp entry is not already done TODO: schleife von i bis cnt_cc[cur_c]
808 if (bwti == bwti_1 and lf < i)
809 { // BWT[i]==BWT[i-1]
810 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0; // l = LCP[LF[i]]-1; l < m+1
811 if (l == m)
812 { // if LCP[LF[i]] == m+1; otherwise LCP[LF[i]] < m+1 the result is correct
813 l += (text[sai_1 + m] == text[sai + m]);
814 }
815 lcp_sml[i] = l;
816 }
817 else
818 { // BWT[i] != BWT[i-1] or LF[i] > i
819 if (lf < i)
820 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
821 while (text[sai_1 + l] == text[sai + l] and l < m + 1)
822 {
823 ++l;
824 }
825 lcp_sml[i] = l;
826 }
827 }
828 else
829 { // if already done
830 l = lcp_sml[i]; // load LCP value
831 }
832 // invariant: l <= m+1
833 // begin update rmq_stack
834 size_type x = l + 1;
835 size_type j = rmq_end;
836 while (x <= rmq_stack[j])
837 j -= 2; // pop all elements with value >= l
838 rmq_stack[++j] = i + 1; // push position i
839 rmq_stack[++j] = x; // push value l
840 rmq_end = j; // update index of the value of the topmost element
841 if (lf > i)
842 { // if LF[i] > i, we can calculate LCP[LF[i]] in constant time with rmq
843 // rmq query for lcp-values in the interval I=[prev_occ_in_bwt[BWT[i]]+1..i]
844 // rmq is linear in the stack size; can also be implemented with binary search on the stack
845 size_type x_pos = prev_occ_in_bwt[bwti] + 2;
846 j = rmq_end - 3;
847 while (x_pos <= rmq_stack[j])
848 j -= 2; // search smallest value in the interval I
849 lcp_sml[lf] =
850 rmq_stack[j + 3] - (rmq_stack[j + 3] == m + 2); // if lcp-value equals m+1, we subtract 1
851 }
852 if (l > m)
853 {
854 ++nn;
855 }
856 else
857 ++omitted_c[cur_c];
858
859 prev_occ_in_bwt[bwti] = i; // update previous position information for character BWT[i]
860 ++cnt_cc[bwti]; // update counter and therefore the LF information
861 sai_1 = sai; // update SA[i-1]
862 bwti_1 = bwti; // update BWT[i-1]
863 }
864 }
865 store_to_cache(lcp_sml, "lcp_sml", config);
866 }
867
868 // phase 2: calculate lcp_big with PHI algorithm on remaining entries of LCP
869 {
870 int_vector<> lcp_big(0, 0, bits::hi(n - 1) + 1); // nn, 0, bits::hi(n-1)+1);
871 {
872
873 memory_monitor::event("lcp-init-phi-begin");
874 size_type sa_n_1 = 0; // value for SA[n-1]
875 bit_vector todo(n, 0); // bit_vector todo indicates which values are > m in lcp_sml
876 {
877 // initialize bit_vector todo
878 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
879 for (size_type i = 0; i < n; ++i)
880 {
881 if (lcp_sml_buf[i] > m)
882 {
883 todo[sa_buf[i]] = 1;
884 }
885 }
886 sa_n_1 = sa_buf[n - 1];
887 }
888 rank_support_v<> todo_rank(&todo); // initialize rank for todo
889
890 const size_type bot = sa_n_1;
891 int_vector<> phi(nn, bot, bits::hi(n - 1) + 1); // phi
892
893 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
894 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
895 uint8_t b_1 = 0;
896 for (size_type i = 0, sai_1 = 0; i < n; ++i)
897 { // initialize phi
898 uint8_t b = bwt_buf[i]; // store BWT[i]
899 size_type sai = sa_buf[i];
900 if (lcp_sml_buf[i] > m and b != b_1)
901 { // if i is a big irreducable value
902 phi[todo_rank(sai)] = sai_1;
903 } // otherwise phi is equal to bot
904 b_1 = b;
905 sai_1 = sai;
906 }
907 memory_monitor::event("lcp-init-phi-end");
908
909 memory_monitor::event("lcp-calc-plcp-begin");
910 for (size_type i = 0, ii = 0, l = m + 1, p = 0; i < n and ii < nn; ++i)
911 { // execute compact Phi algorithm
912 if (todo[i])
913 {
914 if (i > 0 and todo[i - 1])
915 l = l - 1;
916 else
917 l = m + 1;
918 if ((p = phi[ii]) != bot)
919 {
920 while (text[i + l] == text[p + l])
921 ++l;
922 }
923 phi[ii++] = l;
924 }
925 }
926 memory_monitor::event("lcp-calc-plcp-end");
927 util::clear(text);
928
929 memory_monitor::event("lcp-calc-lcp-begin");
930 lcp_big.resize(nn);
931 for (size_type i = 0, ii = 0; i < n and ii < nn; ++i)
932 {
933 if (lcp_sml_buf[i] > m)
934 {
935 lcp_big[ii++] = phi[todo_rank(sa_buf[i])];
936 }
937 }
938 memory_monitor::event("lcp-calc-lcp-end");
939 }
940 store_to_cache(lcp_big, "lcp_big", config);
941 } // end phase 2
942
943 // std::cout<<"# merge lcp_sml and lcp_big"<<std::endl;
944 // phase 3: merge lcp_sml and lcp_big and save to disk
945 {
946 const size_type buffer_size = 1000000; // buffer_size has to be a multiple of 8!
947 int_vector_buffer<> lcp_big_buf(cache_file_name("lcp_big",
948 config)); // file buffer containing the big LCP values
949 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config),
950 std::ios::in,
951 buffer_size); // file buffer containing the small LCP values
953 std::ios::out,
954 buffer_size,
955 lcp_big_buf.width()); // file buffer for the resulting LCP array
956
957 for (size_type i = 0, i2 = 0; i < n; ++i)
958 {
959 size_type l = lcp_sml_buf[i];
960 if (l > m)
961 { // if l > m it is stored in lcp_big
962 l = lcp_big_buf[i2];
963 ++i2;
964 }
965 lcp_buf[i] = l;
966 }
967 lcp_big_buf.close(true); // close buffer and remove file
968 lcp_sml_buf.close(true); // close buffer and remove file
969 }
971 return;
972}
973
975
990template <typename t_wt = wt_huff<bit_vector, rank_support_v<>, select_support_scan<1>, select_support_scan<0>>>
992{
994 std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
995
996 // create WT
997 memory_monitor::event("lcp-bwt-create-wt-huff-begin");
998 t_wt wt_bwt;
999 construct(wt_bwt, cache_file_name(conf::KEY_BWT, config));
1000 uint64_t n = wt_bwt.size();
1001 memory_monitor::event("lcp-bwt-create-wt-huff-end");
1002
1003 // init
1004 memory_monitor::event("lcp-bwt-init-begin");
1005 size_type lcp_value = 0; // current LCP value
1006 size_type lcp_value_offset = 0; // Largest LCP value in LCP array, that was written on disk
1007 size_type phase = 0; // Count how often the LCP array was written on disk
1008
1009 size_type intervals = 0; // number of intervals which are currently stored
1010 size_type intervals_new = 0; // number of new intervals
1011
1012 std::queue<size_type> q; // Queue for storing the intervals
1013 std::vector<bit_vector> dict(2); // bit_vector for storing the intervals
1014 size_type source = 0, target = 1; // Defines which bit_vector is source and which is target
1015 bool queue_used = true;
1016 size_type use_queue_and_wt = n / 2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree
1017 // else use dictionary and wavelet tree
1018
1019 size_type quantity; // quantity of characters in interval
1020 std::vector<unsigned char> cs(wt_bwt.sigma); // list of characters in the interval
1021 std::vector<size_type> rank_c_i(wt_bwt.sigma); // number of occurrence of character in [0 .. i-1]
1022 std::vector<size_type> rank_c_j(wt_bwt.sigma); // number of occurrence of character in [0 .. j-1]
1023
1024 // Calculate how many bit are for each lcp value available, to limit the memory usage to 20n bit = 2,5n byte, use at
1025 // moste 8 bit
1026 size_type bb = (n * 20 - size_in_bytes(wt_bwt) * 8 * 1.25 - 5 * n)
1027 / n; // 20n - size of wavelet tree * 1.25 for rank support - 5n for bit arrays - n for index_done array
1028 if (n * 20 < size_in_bytes(wt_bwt) * 8 * 1.25 + 5 * n)
1029 {
1030 bb = 6;
1031 }
1032 bb = std::min(bb, (size_type)8);
1033
1034 size_type lcp_value_max = (1ULL << bb) - 1; // Largest LCP value that can be stored into the LCP array
1035 size_type space_in_bit_for_lcp = n * bb; // Space for the LCP array in main memory
1036
1037#ifdef STUDY_INFORMATIONS
1038 std::cout << "# l=" << n << " b=" << (int)bb << " lcp_value_max=" << lcp_value_max
1039 << " size_in_bytes(wt_bwt<v,bs,bs>)=" << size_in_bytes(wt_bwt) << std::endl;
1040#endif
1041
1042 // init partial_lcp
1043 int_vector<> partial_lcp(n, 0, bb); // LCP array
1044
1045 // init index_done
1046 bit_vector index_done(n + 1, false); // bit_vector indicates if entry LCP[i] is amend
1047 rank_support_v<> ds_rank_support; // Rank support for bit_vector index_done
1048
1049 // create C-array
1050 std::vector<size_type> C; // C-Array: C[i] = number of occurrences of characters < i in the input
1051 create_C_array(C, wt_bwt);
1052 memory_monitor::event("lcp-bwt-init-begin-end");
1053 // calculate lcp
1054 memory_monitor::event("lcp-bwt-calc-values-begin");
1055
1056 // calculate first intervals
1057 partial_lcp[0] = 0;
1058 index_done[0] = true;
1059 interval_symbols(wt_bwt, 0, n, quantity, cs, rank_c_i, rank_c_j);
1060 for (size_type i = 0; i < quantity; ++i)
1061 {
1062 unsigned char c = cs[i];
1063 size_type a_new = C[c] + rank_c_i[i];
1064 size_type b_new = C[c] + rank_c_j[i];
1065
1066 // Save LCP value if not seen before
1067 if (!index_done[b_new])
1068 {
1069 if (b_new < n)
1070 partial_lcp[b_new] = lcp_value;
1071 index_done[b_new] = true;
1072 // Save interval
1073 q.push(a_new);
1074 q.push(b_new);
1075 ++intervals;
1076 }
1077 }
1078 ++lcp_value;
1079
1080 // calculate LCP values phase by phase
1081 while (intervals)
1082 {
1083 if (intervals < use_queue_and_wt && !queue_used)
1084 {
1085 memory_monitor::event("lcp-bwt-bitvec2queue-begin");
1086 util::clear(dict[target]);
1087
1088 // copy from bitvector to queue
1089 size_type a2 = util::next_bit(dict[source], 0);
1090 size_type b2 = util::next_bit(dict[source], a2 + 1);
1091 while (b2 < dict[source].size())
1092 {
1093 q.push((a2 - 1) >> 1);
1094 q.push(b2 >> 1);
1095 // get next interval
1096 a2 = util::next_bit(dict[source], b2 + 1);
1097 b2 = util::next_bit(dict[source], a2 + 1);
1098 }
1099 util::clear(dict[source]);
1100 memory_monitor::event("lcp-bwt-bitvec2queue-end");
1101 }
1102 if (intervals >= use_queue_and_wt && queue_used)
1103 {
1104 memory_monitor::event("lcp-bwt-queue2bitvec-begin");
1105 dict[source].resize(2 * (n + 1));
1106
1107 util::set_to_value(dict[source], 0);
1108 // copy from queue to bitvector
1109 while (!q.empty())
1110 {
1111 dict[source][(q.front() << 1) + 1] = 1;
1112 q.pop();
1113 dict[source][(q.front() << 1)] = 1;
1114 q.pop();
1115 }
1116 dict[target].resize(2 * (n + 1));
1117
1118 util::set_to_value(dict[target], 0);
1119 memory_monitor::event("lcp-bwt-queue2bitvec-end");
1120 }
1121
1122 if (intervals < use_queue_and_wt)
1123 {
1124 queue_used = true;
1125 intervals_new = 0;
1126 while (intervals)
1127 {
1128 // get next interval
1129 size_type a = q.front();
1130 q.pop();
1131 size_type b = q.front();
1132 q.pop();
1133 --intervals;
1134
1135 interval_symbols(wt_bwt, a, b, quantity, cs, rank_c_i, rank_c_j);
1136 for (size_type i = 0; i < quantity; ++i)
1137 {
1138 unsigned char c = cs[i];
1139 size_type a_new = C[c] + rank_c_i[i];
1140 size_type b_new = C[c] + rank_c_j[i];
1141
1142 // Save LCP value if not seen before
1143 if (!index_done[b_new] and phase == 0)
1144 {
1145 partial_lcp[b_new] = lcp_value;
1146 index_done[b_new] = true;
1147 // Save interval
1148 q.push(a_new);
1149 q.push(b_new);
1150 ++intervals_new;
1151 }
1152 else if (!index_done[b_new])
1153 {
1154 size_type insert_pos = b_new - ds_rank_support.rank(b_new);
1155 if (!partial_lcp[insert_pos])
1156 {
1157 partial_lcp[insert_pos] = lcp_value - lcp_value_offset;
1158 // Save interval
1159 q.push(a_new);
1160 q.push(b_new);
1161 ++intervals_new;
1162 }
1163 }
1164 }
1165 }
1166 intervals = intervals_new;
1167 }
1168 else
1169 {
1170 queue_used = false;
1171 intervals = 0;
1172
1173 // get next interval
1174 size_type a2 = util::next_bit(dict[source], 0);
1175 size_type b2 = util::next_bit(dict[source], a2 + 1);
1176
1177 while (b2 < dict[source].size())
1178 {
1179 interval_symbols(wt_bwt, ((a2 - 1) >> 1), (b2 >> 1), quantity, cs, rank_c_i, rank_c_j);
1180 for (size_type i = 0; i < quantity; ++i)
1181 {
1182 unsigned char c = cs[i];
1183 size_type a_new = C[c] + rank_c_i[i];
1184 size_type b_new = C[c] + rank_c_j[i];
1185 // Save LCP value if not seen before
1186 if (!index_done[b_new] and phase == 0)
1187 {
1188 partial_lcp[b_new] = lcp_value;
1189 index_done[b_new] = true;
1190 // Save interval
1191 dict[target][(a_new << 1) + 1] = 1;
1192 dict[target][(b_new << 1)] = 1;
1193 ++intervals;
1194 }
1195 else if (!index_done[b_new])
1196 {
1197 size_type insert_pos = b_new - ds_rank_support.rank(b_new);
1198 if (!partial_lcp[insert_pos])
1199 {
1200 partial_lcp[insert_pos] = lcp_value - lcp_value_offset;
1201 // Save interval
1202 dict[target][(a_new << 1) + 1] = 1;
1203 dict[target][(b_new << 1)] = 1;
1204 ++intervals;
1205 }
1206 }
1207 }
1208 // get next interval
1209 a2 = util::next_bit(dict[source], b2 + 1);
1210 b2 = util::next_bit(dict[source], a2 + 1);
1211 }
1212 std::swap(source, target);
1213 util::set_to_value(dict[target], 0);
1214 }
1215 ++lcp_value;
1216 if (lcp_value >= lcp_value_max)
1217 {
1218 memory_monitor::event("lcp-bwt-write-to-file-begin");
1219 if (phase)
1220 {
1221 insert_lcp_values(partial_lcp, index_done, lcp_file, lcp_value, lcp_value_offset);
1222 }
1223 else
1224 {
1225 store_to_file(partial_lcp, lcp_file);
1226 }
1227 memory_monitor::event("lcp-bwt-write-to-file-end");
1228 memory_monitor::event("lcp-bwt-resize-variables-begin");
1229 util::init_support(ds_rank_support, &index_done); // Create rank support
1230
1231 // Recalculate lcp_value_max and resize partial_lcp
1232 lcp_value_offset = lcp_value_max - 1;
1233 size_type remaining_lcp_values = index_done.size() - ds_rank_support.rank(index_done.size());
1234
1235 uint8_t int_width_new =
1236 std::min(space_in_bit_for_lcp / remaining_lcp_values, (size_type)bits::hi(n - 1) + 1);
1237 lcp_value_max = lcp_value_offset + (1ULL << int_width_new);
1238#ifdef STUDY_INFORMATIONS
1239 std::cout << "# l=" << remaining_lcp_values << " b=" << (int)int_width_new
1240 << " lcp_value_max=" << lcp_value_max << std::endl;
1241#endif
1242 // partial_lcp = int_vector<>(remaining_lcp_values, 0, int_width_new);
1243 partial_lcp.width(int_width_new);
1244 partial_lcp.resize(remaining_lcp_values);
1245 partial_lcp.shrink_to_fit();
1246 util::set_to_value(partial_lcp, 0);
1247 ++phase;
1248 memory_monitor::event("lcp-bwt-resize-variables-end");
1249 }
1250 }
1251 memory_monitor::event("lcp-bwt-calc-values-end");
1252
1253 // merge to file
1254 memory_monitor::event("lcp-bwt-merge-to-file-begin");
1255 if (phase)
1256 {
1257 insert_lcp_values(partial_lcp, index_done, lcp_file, lcp_value, lcp_value_offset);
1258 }
1259 else
1260 {
1261 store_to_file(partial_lcp, lcp_file);
1262 }
1264
1265 memory_monitor::event("lcp-bwt-merge-to-file-end");
1266}
1267
1269
1284template <typename t_wt = wt_huff<bit_vector, rank_support_v<>, select_support_scan<1>, select_support_scan<0>>>
1286{
1288
1289 uint64_t n; // Input length
1290 size_type buffer_size = 1000000; // Size of the buffer
1291 size_type lcp_value = 0; // current LCP value
1292 std::string tmp_lcp_file = cache_file_name(conf::KEY_LCP, config) + "_tmp";
1293 // (1) Calculate LCP-Positions-Array: For each lcp_value (in ascending order) all its occurrences (in any order) in
1294 // the lcp array
1295 {
1296 memory_monitor::event("lcp-bwt2-create-wt-huff-begin");
1297 t_wt wt_bwt;
1298 construct(wt_bwt, cache_file_name(conf::KEY_BWT, config));
1299 n = wt_bwt.size();
1300 memory_monitor::event("lcp-bwt2-create-wt-huff-begin");
1301
1302 // Declare needed variables
1303 memory_monitor::event("lcp-bwt2-init-begin");
1304 size_type intervals = 0; // Number of intervals which are currently stored
1305 size_type intervals_new = 0; // Number of new intervals
1306
1307 std::queue<size_type> q; // Queue for storing the intervals
1308 std::vector<bit_vector> dict(2); // bit_vector for storing the intervals
1309 size_type source = 0, target = 1; // Defines which bit_vector is source and which is target
1310 bool queue_used = true; // Defines whether a queue (true) or the bit_vectors (false) was used to store intervals
1311 size_type use_queue_and_wt = n / 2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree
1312 // else use dictionary and wavelet tree
1313
1314 size_type quantity; // quantity of characters in interval
1315 std::vector<unsigned char> cs(wt_bwt.sigma); // list of characters in the interval
1316 std::vector<size_type> rank_c_i(wt_bwt.sigma); // number of occurrence of character in [0 .. i-1]
1317 std::vector<size_type> rank_c_j(wt_bwt.sigma); // number of occurrence of character in [0 .. j-1]
1318
1319 // External storage of LCP-Positions-Array
1320 bool new_lcp_value = false;
1321 uint8_t int_width = bits::hi(n) + 2;
1322 int_vector_buffer<> lcp_positions_buf(tmp_lcp_file,
1323 std::ios::out,
1324 buffer_size,
1325 int_width); // Create buffer for positions of LCP entries
1326 size_type idx_out_buf = 0;
1327 bit_vector index_done(n + 1, 0); // Bitvector which is true, if corresponding LCP value was already calculated
1328
1329 // Create C-array
1330 std::vector<size_type> C; // C-Array: C[i] = number of occurrences of characters < i in the input
1331 create_C_array(C, wt_bwt);
1332 memory_monitor::event("lcp-bwt2-init-end");
1333 // Calculate LCP-Positions-Array
1334 memory_monitor::event("lcp-bwt2-calc-values-begin");
1335
1336 // Save position of first LCP-value
1337 lcp_positions_buf[idx_out_buf++] = 0;
1338 if (new_lcp_value)
1339 {
1340 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1341 new_lcp_value = false;
1342 }
1343 index_done[0] = true;
1344
1345 // calculate first intervals
1346 interval_symbols(wt_bwt, 0, n, quantity, cs, rank_c_i, rank_c_j);
1347 for (size_type i = 0; i < quantity; ++i)
1348 {
1349 unsigned char c = cs[i];
1350 size_type a_new = C[c] + rank_c_i[i];
1351 size_type b_new = C[c] + rank_c_j[i];
1352
1353 // Save LCP value and corresponding interval if not seen before
1354 if (!index_done[b_new])
1355 {
1356 if (b_new < n)
1357 {
1358 // Save position of LCP-value
1359 lcp_positions_buf[idx_out_buf++] = b_new;
1360 }
1361 index_done[b_new] = true;
1362
1363 // Save interval
1364 q.push(a_new);
1365 q.push(b_new);
1366 ++intervals;
1367 }
1368 }
1369 ++lcp_value;
1370 new_lcp_value = true;
1371
1372 // Calculate LCP positions
1373 while (intervals)
1374 {
1375 if (intervals < use_queue_and_wt && !queue_used)
1376 {
1377 memory_monitor::event("lcp-bwt2-bitvec2queue-begin");
1378 util::clear(dict[target]);
1379
1380 // Copy from bitvector to queue
1381 size_type a2 = util::next_bit(dict[source], 0);
1382 size_type b2 = util::next_bit(dict[source], a2 + 1);
1383 while (b2 < dict[source].size())
1384 {
1385 q.push((a2 - 1) >> 1);
1386 q.push(b2 >> 1);
1387 // Get next interval
1388 a2 = util::next_bit(dict[source], b2 + 1);
1389 b2 = util::next_bit(dict[source], a2 + 1);
1390 }
1391 util::clear(dict[source]);
1392 memory_monitor::event("lcp-bwt2-bitvec2queue-end");
1393 }
1394 if (intervals >= use_queue_and_wt && queue_used)
1395 {
1396 memory_monitor::event("lcp-bwt2-queue2bitvec-begin");
1397 dict[source].resize(2 * (n + 1));
1398 util::set_to_value(dict[source], 0);
1399 // Copy from queue to bitvector
1400 while (!q.empty())
1401 {
1402 dict[source][(q.front() << 1) + 1] = 1;
1403 q.pop();
1404 dict[source][(q.front() << 1)] = 1;
1405 q.pop();
1406 }
1407 dict[target].resize(2 * (n + 1));
1408 util::set_to_value(dict[target], 0);
1409 memory_monitor::event("lcp-bwt2-queue2bitvec-end");
1410 }
1411
1412 if (intervals < use_queue_and_wt)
1413 {
1414 queue_used = true;
1415 intervals_new = 0;
1416 while (intervals)
1417 {
1418 // Get next interval
1419 size_type a = q.front();
1420 q.pop();
1421 size_type b = q.front();
1422 q.pop();
1423 --intervals;
1424
1425 interval_symbols(wt_bwt, a, b, quantity, cs, rank_c_i, rank_c_j);
1426 for (size_type i = 0; i < quantity; ++i)
1427 {
1428 unsigned char c = cs[i];
1429 size_type a_new = C[c] + rank_c_i[i];
1430 size_type b_new = C[c] + rank_c_j[i];
1431
1432 // Save LCP value and corresponding interval if not seen before
1433 if (!index_done[b_new])
1434 {
1435 // Save position of LCP-value
1436 lcp_positions_buf[idx_out_buf++] = b_new;
1437 if (new_lcp_value)
1438 {
1439 // Mark new LCP-value
1440 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1441 new_lcp_value = false;
1442 }
1443 index_done[b_new] = true;
1444
1445 // Save interval
1446 q.push(a_new);
1447 q.push(b_new);
1448 ++intervals_new;
1449 }
1450 }
1451 }
1452 intervals = intervals_new;
1453 }
1454 else
1455 {
1456 queue_used = false;
1457 intervals = 0;
1458 // get next interval
1459 size_type a2 = util::next_bit(dict[source], 0);
1460 size_type b2 = util::next_bit(dict[source], a2 + 1);
1461
1462 while (b2 < dict[source].size())
1463 {
1464 interval_symbols(wt_bwt, ((a2 - 1) >> 1), (b2 >> 1), quantity, cs, rank_c_i, rank_c_j);
1465 for (size_type i = 0; i < quantity; ++i)
1466 {
1467 unsigned char c = cs[i];
1468 size_type a_new = C[c] + rank_c_i[i];
1469 size_type b_new = C[c] + rank_c_j[i];
1470 // Save LCP value if not seen before
1471 if (!index_done[b_new])
1472 {
1473 // Save position of LCP-value
1474 lcp_positions_buf[idx_out_buf++] = b_new;
1475 if (new_lcp_value)
1476 {
1477 // Mark new LCP-value
1478 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1479 new_lcp_value = false;
1480 }
1481 index_done[b_new] = true;
1482 // Save interval
1483 dict[target][(a_new << 1) + 1] = 1;
1484 dict[target][(b_new << 1)] = 1;
1485 ++intervals;
1486 }
1487 }
1488 // get next interval
1489 a2 = util::next_bit(dict[source], b2 + 1);
1490 b2 = util::next_bit(dict[source], a2 + 1);
1491 }
1492 std::swap(source, target);
1493 util::set_to_value(dict[target], 0);
1494 }
1495 ++lcp_value;
1496 new_lcp_value = true;
1497 }
1498 memory_monitor::event("lcp-bwt2-calc-values-end");
1499 lcp_positions_buf.close();
1500 }
1501 // (2) Insert LCP entires into LCP array
1502 {
1503 memory_monitor::event("lcp-bwt2-reordering-begin");
1504
1505 int_vector_buffer<> lcp_positions(tmp_lcp_file, std::ios::in, buffer_size);
1506
1507 uint8_t int_width = bits::hi(lcp_value + 1) + 1; // How many bits are needed for one lcp_value?
1508
1509 // Algorithm does r=ceil(int_width/8) runs over LCP-Positions-Array.
1510 // So in each run k>=(n/r) values of the lcp array must be calculated.
1511 // Because k has to be a multiple of 8, we choose number_of_values = (k+16) - ((k+16)%8)
1512 size_type number_of_values = ((n / ((int_width - 1ULL) / 8 + 1) + 16) & (~(0x7ULL)));
1513 std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
1514 int_vector_buffer<> lcp_array(lcp_file,
1515 std::ios::out,
1516 number_of_values * int_width / 8,
1517 int_width); // Create Output Buffer
1518 number_of_values = lcp_array.buffersize() * 8 / int_width;
1519
1520 for (size_type position_begin = 0, position_end = number_of_values; position_begin < n and number_of_values > 0;
1521 position_begin = position_end, position_end += number_of_values)
1522 {
1523#ifdef STUDY_INFORMATIONS
1524 std::cout << "# number_of_values=" << number_of_values << " fill lcp_values with " << position_begin
1525 << " <= position <" << position_end << ", each lcp-value has " << (int)int_width
1526 << " bit, lcp_value_max=" << lcp_value << " n=" << n << std::endl;
1527#endif
1528 lcp_value = 0;
1529 for (size_type i = 0; i < n; ++i)
1530 {
1531 size_type position = lcp_positions[i];
1532 if (position > n)
1533 {
1534 position -= n;
1535 ++lcp_value;
1536 }
1537 if (position_begin <= position and position < position_end)
1538 {
1539 lcp_array[position] = lcp_value;
1540 }
1541 }
1542 }
1543 // Close file
1544 lcp_array.close();
1546 lcp_positions.close(true); // close buffer and remove file
1547 memory_monitor::event("lcp-bwt2-reordering-end");
1548 } // End of phase 2
1549}
1550
1551} // namespace sdsl
1552
1553#endif
bits.hpp contains the sdsl::bits class.
uint64_t size() const
Returns the number of elements currently stored.
void close(bool remove_file=false)
Close the int_vector_buffer.
uint64_t buffersize() const
Returns the buffersize in bytes.
uint8_t width() const
Returns the width of the integers which are accessed via the [] operator.
int_vector< 64 >::size_type size() const noexcept
void shrink_to_fit()
Free unused allocated memory.
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
size_type size() const noexcept
The number of elements in the int_vector.
reference front() noexcept
Returns first element.
void resize(const size_type size)
Resize the int_vector in terms of elements.
static mm_event_proxy event(std::string const &name)
A rank structure proposed by Sebastiano Vigna.
size_type rank(size_type idx) const
Answers rank queries for the supported bit_vector.
construct_isa.hpp contains a space and time efficient construction method for the inverse suffix arra...
int_vector.hpp contains the sdsl::int_vector class.
int_vector_buffer.hpp contains the sdsl::int_vector_buffer class.
io.hpp contains some methods for reading/writing sdsl structures.
memory_tracking.hpp contains two function for allocating and deallocating memory
constexpr char KEY_SA[]
Definition config.hpp:36
constexpr char KEY_TEXT[]
Definition config.hpp:40
constexpr char KEY_LCP[]
Definition config.hpp:43
constexpr char KEY_ISA[]
Definition config.hpp:39
constexpr char KEY_BWT[]
Definition config.hpp:34
Get the smallest position f$i geq idx f$ where a bit is set t_int_vec::size_type next_bit(t_int_vec const &v, uint64_t idx)
void set_to_value(t_int_vec &v, uint64_t k)
Set all entries of int_vector to value k.
Definition util.hpp:597
Namespace for the succinct data structure library.
bool store_to_cache(T const &v, std::string const &key, cache_config &config, bool add_type_hash=false)
Stores the object v as a resource in the cache.
Definition io.hpp:804
void push_back_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
void construct_lcp_kasai(cache_config &config)
Construct the LCP array for text over byte- or integer-alphabet.
void construct_lcp_bwt_based(cache_config &config)
Construct the LCP array out of the BWT (only for byte strings)
std::string cache_file_name(std::string const &key, cache_config const &config)
Returns the file name of the resource.
Definition io.hpp:688
void insert_lcp_values(int_vector<> &partial_lcp, bit_vector &index_done, std::string lcp_file, uint64_t max_lcp_value, uint64_t lcp_value_offset)
Merges a partial LCP array into the LCP array on disk.
void construct_lcp_PHI(cache_config &config)
Construct the LCP array for text over byte- or integer-alphabet.
char const * key_text_trait_impl< 0, T >::KEY_TEXT
Definition config.hpp:132
void register_cache_file(std::string const &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition io.hpp:717
T::size_type size_in_bytes(T const &t)
Get the size of a data structure in bytes.
Definition io.hpp:854
void create_C_array(std::vector< uint64_t > &C, tWT const &wt)
bool load_from_cache(T &v, std::string const &key, cache_config const &config, bool add_type_hash=false)
Definition io.hpp:772
int_vector< 1 > bit_vector
bit_vector is a specialization of the int_vector.
void construct(t_index &idx, std::string file, uint8_t num_bytes=0, bool move_input=false)
Definition construct.hpp:56
void construct_lcp_semi_extern_PHI(cache_config &config)
Construct the LCP array (only for byte strings)
void construct_lcp_goPHI(cache_config &config)
Construct the LCP array (only for byte strings)
void construct_isa(cache_config &config)
void construct_lcp_bwt_based2(cache_config &config)
Construct the LCP array out of the BWT (only for byte strings)
void interval_symbols(t_wt const &wt, typename t_wt::size_type i, typename t_wt::size_type j, typename t_wt::size_type &k, std::vector< typename t_wt::value_type > &cs, std::vector< typename t_wt::size_type > &rank_c_i, std::vector< typename t_wt::size_type > &rank_c_j)
For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c).
bool store_to_file(T const &v, std::string const &file)
Store a data structure to a file.
Definition io.hpp:874
int_vector ::size_type size(range_type const &r)
Size of a range.
void construct_lcp_go(cache_config &config)
Construct the LCP array (only for byte strings)
std::list< int_vector<>::size_type > tLI
void push_front_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
rank_support_v.hpp contains rank_support_v.
select_support_scan.hpp contains classes that support a sdsl::bit_vector with linear time select.
static constexpr uint64_t lo_set[65]
lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.
Definition bits.hpp:194
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition bits.hpp:653
static constexpr uint64_t lo_unset[65]
lo_unset[i] is a 64-bit word with the i least significant bits not set and the high bits set.
Definition bits.hpp:216
Helper class for construction process.
Definition config.hpp:66
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.