64 std::string & filename_sa,
70 uint64_t buffersize = 1024 * 1024 / 8;
73 size_t number_of_lms_strings = 0;
77 std::vector<uint64_t> bkt(sigma, 0);
80 for (
size_t i = 0; i < n; ++i)
82 ++bkt[text[text_offset + i]];
87 for (
size_t c = 0; c < sigma; ++c)
94 for (
size_t c = 1; c < sigma; ++c)
96 bkt[c] = bkt[c - 1] + bkt[c];
100 for (
size_t i = n - 2, was_s_typ = 1; i < n; --i)
102 if (text[text_offset + i] > text[text_offset + i + 1])
106 sa[bkt[text[text_offset + i + 1]]--] = i + 1;
107 ++number_of_lms_strings;
111 else if (text[text_offset + i] < text[text_offset + i + 1])
119 for (
size_t c = 1; c < sigma; ++c)
121 bkt[c] = bkt[c - 1] + c_array[c - 1];
125 for (
size_t i = 0; i < n; ++i)
127 if (sa[i] > 0 and text[text_offset + sa[i]] <= text[text_offset + sa[i] - 1])
129 sa[bkt[text[text_offset + sa[i] - 1]]++] = sa[i] - 1;
136 for (
size_t c = 1; c < sigma; ++c)
138 bkt[c] = bkt[c - 1] + c_array[c];
142 for (
size_t i = n - 1, endpointer = n; i < n; --i)
146 if (text[text_offset + sa[i] - 1] <= text[text_offset + sa[i]])
148 sa[bkt[text[text_offset + sa[i] - 1]]--] = sa[i] - 1;
152 sa[--endpointer] = sa[i];
159 for (
size_t i = n - 2, end = n - 2, was_s_typ = 1; i < n; --i)
161 if (text[text_offset + i] > text[text_offset + i + 1])
165 sa[(i + 1) >> 1] = end - i;
170 else if (text[text_offset + i] < text[text_offset + i + 1])
177 for (
size_t i = n - number_of_lms_strings + 1, cur_pos = 0, cur_len = 0, last_pos = n - 1, last_len = 1; i < n;
181 cur_len = sa[(cur_pos >> 1)];
182 if (cur_len == last_len)
185 while (l < cur_len and text[text_offset + cur_pos + l] == text[text_offset + last_pos + l])
194 sa[(cur_pos >> 1)] = ++name;
201 if (name + 1 < number_of_lms_strings)
204 for (
size_t i = 0, t = n - number_of_lms_strings; i < (n >> 1); ++i)
219 number_of_lms_strings,
220 n - number_of_lms_strings,
224 for (
size_t i = n - 2, endpointer = n - 1, was_s_typ = 1; i < n; --i)
226 if (text[text_offset + i] > text[text_offset + i + 1])
230 sa[endpointer--] = i + 1;
234 else if (text[text_offset + i] < text[text_offset + i + 1])
241 for (
size_t i = 0; i < number_of_lms_strings; ++i)
244 sa[i] = sa[n - number_of_lms_strings + pos];
245 sa[n - number_of_lms_strings + pos] = 0;
252 for (
size_t i = 1; i < number_of_lms_strings; ++i)
254 sa[i] = sa[n - number_of_lms_strings + i];
255 sa[n - number_of_lms_strings + i] = 0;
258 for (
size_t i = number_of_lms_strings; i < (n >> 1); ++i)
270 std::vector<uint64_t> bkt(sigma, 0);
271 for (
size_t c = 1; c < sigma; ++c)
273 bkt[c] = bkt[c - 1] + c_array[c];
277 for (
size_t i = number_of_lms_strings - 1; i < n; --i)
281 sa[bkt[text[text_offset + pos]]--] = pos;
286 for (
size_t c = 1; c < sigma; ++c)
288 bkt[c] = bkt[c - 1] + c_array[c - 1];
292 for (
size_t i = 0; i < n; ++i)
294 if (sa[i] > 0 and text[text_offset + sa[i]] <= text[text_offset + sa[i] - 1])
296 sa[bkt[text[text_offset + sa[i] - 1]]++] = sa[i] - 1;
302 for (
size_t c = 1; c < sigma; ++c)
304 bkt[c] = bkt[c - 1] + c_array[c];
306 for (
size_t i = n - 1; i < n; --i)
308 if (sa[i] > 0 and text[text_offset + sa[i] - 1] <= text[text_offset + sa[i]])
310 sa[bkt[text[text_offset + sa[i] - 1]]--] = sa[i] - 1;
318void _construct_sa_se(int_vector_type & text, std::string filename_sa, uint64_t sigma, uint64_t recursion)
322 uint64_t n = text.size();
324 uint8_t int_width =
bits::hi(n - 1) + 1;
325 uint64_t buffersize = 1024 * 1024 / 8;
330 size_t first_lms_pos = 0;
331 size_t number_of_lms_strings = 0;
332 size_t bkt_s_last = 0, bkt_s_sum = 0, bound_s = 0, bkt_l_sum = 0;
342 uint64_t ci = text[n - 1];
345 for (
size_t i = n - 2; i < n; --i)
352 ++bkt_s[text[i + 1]];
356 lms_pos_b[i + 1] = 1;
357 ++number_of_lms_strings;
358 first_lms_pos = i + 1;
371 bkt_l[0] = C[0] - bkt_s[0];
372 for (
size_t i = 1; i < C.
size(); ++i)
374 bkt_l[i] = C[i] - bkt_s[i];
375 C[i] = C[i] + C[i - 1];
385 size_t right_pointer = 0;
390 size_t left_pointer = 0;
392 for (
size_t i = 0, tmp2 = 0, tmp = 0; i < sigma; ++i)
402 for (
size_t i = n - 2, was_s_typ = 1, ci = text[n - 1]; i < n; --i)
424 int_vector<> lms_strings(number_of_lms_strings, 0, int_width);
425 for (
size_t i = 0; i < lms_positions.
size();)
427 size_t idx = lms_positions[i++];
428 size_t val = lms_positions[i++];
429 lms_strings[idx] = val;
431 lms_positions.
close(
true);
434 for (
size_t i = 0; i < number_of_lms_strings; ++i)
436 left[left_pointer++] = lms_strings[number_of_lms_strings - i - 1];
446 for (
size_t i = 0, tmp = 0; i < sigma; ++i)
449 bkt_l[i] = bkt_l_sum;
451 bkt_lms[i] = bkt_l[i];
454 size_t partsize = bkt_l_sum / parts + 1;
457 std::vector<int_vector_buffer<>> cached_array(parts - 1);
458 for (
size_t i = 0; i < cached_array.size(); ++i)
467 for (
size_t c = 0, pos = 0, offset = 0; c < sigma; ++c)
470 for (; pos < bkt_l[c]; ++pos)
473 if (pos - offset >= partsize)
476 for (
size_t i = 0, cur_part = pos / partsize - 1; i < cached_array[cur_part].size();)
478 size_t src = cached_array[cur_part][i++];
479 size_t val = cached_array[cur_part][i++];
480 array[src - offset] = val;
482 cached_array[pos / partsize - 1].reset();
485 size_t idx = array[pos - offset];
488 right[right_pointer++] = idx;
492 size_t symbol = text[idx - 1];
495 size_t val = idx - 1;
496 size_t src = bkt_l[symbol];
497 bkt_l[symbol] = bkt_l[symbol] + 1;
498 if ((src - offset) / partsize == 0)
500 array[src - offset] = val;
504 size_t part = src / partsize - 1;
505 cached_array[part].push_back(src);
506 cached_array[part].push_back(val);
511 right[right_pointer++] = idx;
516 while (left_pointer < number_of_lms_strings and text[left[left_pointer]] == c)
518 size_t idx = left[left_pointer--];
520 size_t symbol = text[idx];
523 size_t src = bkt_l[symbol];
524 bkt_l[symbol] = bkt_l[symbol] + 1;
525 if ((src - offset) / partsize == 0)
527 array[src - offset] = val;
531 size_t part = src / partsize - 1;
532 cached_array[part].push_back(src);
533 cached_array[part].push_back(val);
537 for (
size_t i = 0; i < cached_array.size(); ++i)
539 cached_array[i].close(
true);
543 for (
size_t i = 0; i < sigma; ++i)
545 bkt_l[i] = bkt_lms[i];
555 bkt_s_last = 0, bkt_s_sum = 0;
556 for (
size_t i = 0; i < sigma; ++i)
558 bkt_s_sum += bkt_s[i];
561 bkt_s[i] = bkt_s_sum;
562 bkt_s_last = bkt_s_sum;
566 bkt_s[i] = bkt_s_sum;
568 bkt_lms[i] = bkt_s[i];
573 for (
size_t i = 0; i < sigma; ++i)
575 if (bkt_s[i] > bkt_s_sum / 2)
577 bkt_s_sum = bkt_s[i];
582 size_t partsize = bound_s / parts + 1;
584 std::vector<int_vector_buffer<>> cached_array(parts - 1);
585 for (
size_t i = 0; i < cached_array.size(); ++i)
593 for (
size_t c = sigma - 1, pos = bkt_s_last - 1, offset = partsize * (parts - 1); c < sigma; --c)
596 for (; pos + 1 > bkt_s[c]; --pos)
602 for (
size_t i = 0, cur_part = offset / partsize; i < cached_array[cur_part].size();)
604 size_t src = cached_array[cur_part][i++];
605 size_t val = cached_array[cur_part][i++];
606 array[src - offset] = val;
608 cached_array[offset / partsize].reset();
611 size_t idx = array[pos - offset];
617 size_t symbol = text[idx];
620 bkt_s[symbol] = bkt_s[symbol] - 1;
622 size_t src = bkt_s[symbol];
625 array[src - offset] = val;
629 size_t part = src / partsize;
630 cached_array[part].push_back(src);
631 cached_array[part].push_back(val);
636 left[left_pointer++] = array[pos - offset];
641 while (right_pointer < number_of_lms_strings and text[right[right_pointer]] == c)
643 size_t idx = right[right_pointer--];
649 size_t symbol = text[idx];
650 bkt_s[symbol] = bkt_s[symbol] - 1;
653 size_t src = bkt_s[symbol];
656 array[src - offset] = val;
660 size_t part = src / partsize;
661 cached_array[part].push_back(src);
662 cached_array[part].push_back(val);
666 for (
size_t i = 0; i < cached_array.size(); ++i)
668 cached_array[i].close(
true);
671 for (
size_t i = 0; i < sigma; ++i)
673 bkt_s[i] = bkt_lms[i];
683 size_t last_end_pos = first_lms_pos, order = number_of_lms_strings - 1;
684 same_lms[number_of_lms_strings - 1] =
true;
685 for (
size_t i = number_of_lms_strings - 2, a = 0, b = 0, last_a = left[number_of_lms_strings - 1];
686 i < number_of_lms_strings;
694 if (end_pos - a == last_end_pos - b)
696 while (a < end_pos and text[a] == text[b])
701 if (text[a] == text[b])
707 last_end_pos = end_pos;
721 text_rec.
resize(number_of_lms_strings);
724 if (recursion == 0 and n / 2 * text_rec.
width() > 8 * n)
726 size_t size_of_part = n / 4 + 3;
727 text_rec.
resize(size_of_part);
730 for (
size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
736 if (left[i] / 2 >= size_of_part)
738 text_rec[(left[i] / 2) - size_of_part] = order;
743 for (
size_t i = 0; i < size_of_part; ++i)
747 text_rec[pos++] = text_rec[i];
752 text_rec.
resize(size_of_part);
755 for (
size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
761 if (left[i] / 2 < size_of_part)
763 text_rec[left[i] / 2] = order;
767 for (
size_t i = 0; i < size_of_part; ++i)
771 text_rec[pos++] = text_rec[i];
774 text_rec.
resize(number_of_lms_strings);
776 for (
size_t i = 0; i < buf.
size(); ++i)
778 text_rec[pos++] = buf[i];
781 text_rec[number_of_lms_strings - 1] = 0;
785 text_rec.
resize(n / 2 + 1);
788 for (
size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
794 text_rec[left[left_pointer--] / 2] = order;
796 for (
size_t i = 0, pos = 0; i < text_rec.
size(); ++i)
800 text_rec[pos++] = text_rec[i];
803 text_rec[number_of_lms_strings - 1] = 0;
804 text_rec.
resize(number_of_lms_strings);
807 util::clear(same_lms);
814 if (text_rec.
size() > order + 1)
825 for (
size_t i = 0; i < number_of_lms_strings; ++i)
827 text_rec[number_of_lms_strings + i] = text_rec[i];
834 number_of_lms_strings,
835 number_of_lms_strings,
840 text_rec.
resize(number_of_lms_strings);
846 isa_rec = std::move(text_rec);
850 if (isa_rec.
size() > 0)
859 util::init_support(lms_select_support, &lms_pos_b);
861 int_vector<> tmp_left(number_of_lms_strings, 0, int_width);
862 for (
size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
864 size_t idx = isa_rec[i];
865 size_t val = lms_select_support.
select(i + 1);
869 util::clear(lms_select_support);
870 util::clear(lms_pos_b);
871 util::clear(isa_rec);
875 for (; left_pointer < number_of_lms_strings; ++left_pointer)
877 left[left_pointer] = tmp_left[number_of_lms_strings - left_pointer - 1];
880 util::clear(tmp_left);
892 util::init_support(lms_select_support, &lms_pos_b);
895 for (uint64_t i = 0; i < sa_rec_buf.
size(); ++i)
897 uint64_t pos = lms_select_support.
select(sa_rec_buf[i] + 1);
898 left[number_of_lms_strings - 1 - left_pointer++] = pos;
900 sa_rec_buf.
close(
true);
914 size_t sa_pointer = 0;
916 size_t partsize = bkt_l_sum / parts + 1;
918 std::vector<int_vector_buffer<>> cached_array(parts - 1);
919 for (
size_t i = 0; i < cached_array.size(); ++i)
928 for (
size_t c = 0, pos = 0, offset = 0; c < sigma; ++c)
931 for (; pos < bkt_l[c]; ++pos)
934 if (pos - offset >= partsize)
937 for (
size_t i = 0, cur_part = pos / partsize - 1; i < cached_array[cur_part].size();)
939 size_t src = cached_array[cur_part][i++];
940 size_t val = cached_array[cur_part][i++];
941 array[src - offset] = val;
943 cached_array[pos / partsize - 1].reset();
945 size_t idx = array[pos - offset];
948 cached_sa[sa_pointer++] = idx;
949 right[right_pointer++] = idx;
953 size_t symbol = text[idx - 1];
954 cached_sa[sa_pointer++] = idx;
957 size_t val = idx - 1;
958 size_t src = bkt_l[symbol];
959 bkt_l[symbol] = bkt_l[symbol] + 1;
960 if ((src - offset) / partsize == 0)
962 array[src - offset] = val;
966 size_t part = src / partsize - 1;
967 cached_array[part].push_back(src);
968 cached_array[part].push_back(val);
973 right[right_pointer++] = idx;
979 while (left_pointer < number_of_lms_strings and text[left[left_pointer]] == c)
981 size_t idx = left[left_pointer--];
987 size_t symbol = text[idx];
989 size_t src = bkt_l[symbol];
990 bkt_l[symbol] = bkt_l[symbol] + 1;
991 if ((src - offset) / partsize == 0)
993 array[src - offset] = val;
997 size_t part = src / partsize - 1;
998 cached_array[part].push_back(src);
999 cached_array[part].push_back(val);
1003 for (
size_t i = 0; i < cached_array.size(); ++i)
1005 cached_array[i].close(
true);
1013 size_t partsize = bound_s / parts + 1;
1016 std::vector<int_vector_buffer<>> cached_array(parts - 1);
1017 for (
size_t i = 0; i < cached_array.size(); ++i)
1026 for (
size_t c = sigma - 1, pos = bkt_s_last - 1, offset = partsize * (parts - 1); c < sigma; --c)
1029 assert(c < C.
size());
1030 sa_pointer = C[c] - 1;
1031 for (; pos + 1 > bkt_s[c]; --pos)
1033 while (pos < offset)
1037 for (
size_t i = 0, cur_part = offset / partsize; i < cached_array[cur_part].size();)
1039 size_t src = cached_array[cur_part][i++];
1040 size_t val = cached_array[cur_part][i++];
1041 assert((src - offset) < array.
size());
1042 array[src - offset] = val;
1044 assert((offset / partsize) < parts - 1);
1045 cached_array[offset / partsize].reset();
1048 assert((pos - offset) < array.
size());
1049 size_t idx = array[pos - offset];
1055 assert((idx) < text.size());
1056 size_t symbol = text[idx];
1061 cached_sa[sa_pointer--] = 0;
1065 cached_sa[sa_pointer--] = idx + 1;
1067 assert((symbol) < bkt_s.size());
1068 bkt_s[symbol] = bkt_s[symbol] - 1;
1070 size_t src = bkt_s[symbol];
1073 assert((src - offset) < array.
size());
1074 array[src - offset] = val;
1078 size_t part = src / partsize;
1079 assert(part < parts - 1);
1080 cached_array[part].push_back(src);
1081 cached_array[part].push_back(val);
1088 cached_sa[sa_pointer--] = 0;
1092 cached_sa[sa_pointer--] = idx + 1;
1097 while (right_pointer < number_of_lms_strings and text[right[right_pointer]] == c)
1099 size_t idx = right[right_pointer--];
1105 size_t symbol = text[idx];
1106 assert((symbol) < bkt_s.size());
1107 bkt_s[symbol] = bkt_s[symbol] - 1;
1110 size_t src = bkt_s[symbol];
1113 assert((src - offset) < array.
size());
1114 array[src - offset] = val;
1118 size_t part = src / partsize;
1119 assert((part) < parts - 1);
1120 cached_array[part].push_back(src);
1121 cached_array[part].push_back(val);
1125 for (
size_t i = 0; i < cached_array.size(); ++i)
1127 cached_array[i].close(
true);