SDSL 3.0.3
Succinct Data Structure Library
Loading...
Searching...
No Matches
qsufsort.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
4/* qsufsort.c
5 * Copyright 1999, N. Jesper Larsson, all rights reserved.
6 *
7 * This file contains an implementation of the algorithm presented in "Faster
8 * Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
9 * Sadakane (sada@is.s.u-tokyo.ac.jp).
10 *
11 * This software may be used freely for any purpose. However, when distributed,
12 * the original source must be clearly stated, and, when the source code is
13 * distributed, the copyright notice must be retained and any alterations in
14 * the code must be clearly marked. No warranty is given regarding the quality
15 of this software.*/
22
23#ifndef INCLUDED_SDSL_QSUFSORT
24#define INCLUDED_SDSL_QSUFSORT
25
26#define DBG_OUT \
27 if (0) \
28 std::cout
29
30#include <algorithm>
31#include <assert.h>
32#include <iostream>
33#include <memory>
34#include <stdexcept>
35#include <stdint.h>
36#include <typeinfo>
37
38#include <sdsl/bits.hpp>
39#include <sdsl/int_vector.hpp>
40#include <sdsl/io.hpp>
41#include <sdsl/util.hpp>
42
43namespace sdsl
44{
45namespace qsufsort
46{
47
48template <class int_vector_type = int_vector<>>
49class sorter;
50
51// void sort(int_iter text, int_iter sa, int64_t n, int64_t k, int64_t l);
52
54
70// TODO: problem when int_width==64!!!
71template <class int_vector_type>
72void construct_sa(int_vector_type & sa, char const * file, uint8_t num_bytes)
73{
75 s.sort(sa, file, num_bytes);
76}
77
78template <class int_vector_type, class t_vec>
79void construct_sa(int_vector_type & sa, t_vec & text)
80{
82 s.sort(sa, text);
83}
84
85template <class int_vector_type>
86class sorter
87{
88 typedef int_vector_type tIV;
89 typedef typename tIV::iterator int_iter;
90 typedef typename tIV::size_type size_type;
91
92private:
93 int_iter m_SA, // group array, ultimately suffix array.
94 m_VV; // inverse array, ultimately inverse of SA.
95 uint64_t m_rr, // number of symbols aggregated by transform.
96 m_hh; // length of already-sorted prefixes.
97 uint8_t m_msb; // most significant bit position starting from 0
98 uint64_t m_msb_mask; // mask for 1ULL<<msb
99
100 inline int64_t to_sign(uint64_t x) const
101 {
102 return x & m_msb_mask ? -((int64_t)(x & ~m_msb_mask)) : x;
103 }
104 // return the absolute value of integer x
105 inline int64_t mark_pos(uint64_t x) const
106 {
107 return (x & ~m_msb_mask);
108 }
109 // mark the number x as negative
110 inline int64_t mark_neg(uint64_t x) const
111 {
112 return x | m_msb_mask;
113 }
114 // check if x is not negative
115 inline bool not_neg(uint64_t x) const
116 {
117 return !(x >> m_msb);
118 }
119 // check if x is negative
120 inline bool is_neg(uint64_t x) const
121 {
122 return x & m_msb_mask;
123 }
124 // returns the key of iterator p at the current sorting depth
125 inline uint64_t key(int_iter const & p) const
126 {
127 return m_VV[*p + m_hh];
128 }
129 // swap the value of two iterators
130 inline void swap(int_iter & p, int_iter & q) const
131 {
132 uint64_t tmp = *p;
133 *p = *q;
134 *q = tmp;
135 }
136 // select the median out of 3 elements
137 inline int_iter const & med3(int_iter const & a, int_iter const & b, int_iter const & c) const
138 {
139 return key(a) < key(b) ? (key(b) < key(c) ? b : (key(a) < key(c) ? c : a))
140 : (key(b) > key(c) ? b : (key(a) > key(c) ? c : a));
141 }
142
143 /* Subroutine for select_sort_split and sort_split. Sets group numbers for a
144 group whose lowest position in m_SA is pl and highest position is pm.*/
145 void update_group(int_iter pl, int_iter pm)
146 {
147 int64_t g = pm - m_SA; /* group number.*/
148 m_VV[*pl] = g; /* update group number of first position.*/
149 if (pl == pm)
150 *pl = mark_neg(1); /* one element, sorted group.*/
151 else
152 do /* more than one element, unsorted group.*/
153 m_VV[*++pl] = g; /* update group numbers.*/
154 while (pl < pm);
155 }
156
157 /* Quadratic sorting method to use for small subarrays. To be able to update
158 group numbers consistently, a variant of selection sorting is used.*/
159 void select_sort_split(int_iter const & p, int64_t n)
160 {
161 int_iter pa, pb, pi, pn;
162 uint64_t f, v;
163
164 pa = p; /* pa is start of group being picked out.*/
165 pn = p + n - 1; /* pn is last position of subarray.*/
166 while (pa < pn)
167 {
168 for (pi = pb = (pa + 1), f = key(pa); pi <= pn; ++pi)
169 if ((v = key(pi)) < f)
170 {
171 f = v; /* f is smallest key found.*/
172 swap(pi, pa); /* place smallest element at beginning.*/
173 pb = pa + 1; /* pb is position for elements equal to f.*/
174 }
175 else if (v == f)
176 { /* if equal to smallest key.*/
177 swap(pi, pb); /* place next to other smallest elements.*/
178 ++pb;
179 }
180 update_group(pa, pb - 1); /* update group values for new group.*/
181 pa = pb; /* continue sorting rest of the subarray.*/
182 }
183 if (pa == pn)
184 { /* check if last part is single element.*/
185 m_VV[*pa] = pa - m_SA;
186 *pa = mark_neg(1); /* sorted group.*/
187 }
188 }
189
190 /* Subroutine for sort_split, algorithm by Bentley & McIlroy.*/
191 uint64_t choose_pivot(int_iter const & p, int64_t n)
192 {
193 int_iter pl, pm, pn;
194 int64_t s;
195
196 pm = p + (n >> 1); /* small arrays, middle element.*/
197 if (n > 7LL)
198 {
199 pl = p;
200 pn = p + n - 1;
201 if (n > 40LL)
202 { /* big arrays, pseudomedian of 9.*/
203 s = n >> 3;
204 pl = med3(pl, pl + s, pl + s + s);
205 pm = med3(pm - s, pm, pm + s);
206 pn = med3(pn - s - s, pn - s, pn);
207 }
208 pm = med3(pl, pm, pn); /* midsize arrays, median of 3.*/
209 }
210 return key(pm);
211 }
212
213 /* Sorting routine called for each unsorted group. Sorts the array of integers
214 * (suffix numbers) of length n starting at p. The algorithm is a ternary-split
215 * quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
216 * Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
217 function is based on Program 7.*/
218 void sort_split(int_iter const & p, int64_t n)
219 {
220 int_iter pa, pb, pc, pd, pl, pm, pn;
221 uint64_t f, v;
222 int64_t s, t;
223
224 if (n < 7)
225 { /* multi-selection sort smallest arrays.*/
226 select_sort_split(p, n);
227 return;
228 }
229
230 v = choose_pivot(p, n);
231 pa = pb = p; // pa: iterator for equal elements
232 pc = pd = p + n - 1; // pc = right border (inclusive)
233 while (1)
234 { /* split-end partition.*/
235 while (pb <= pc && (f = key(pb)) <= v)
236 { // go to the right as long as there are keys smaller equal than v
237 if (f == v)
238 {
239 swap(pa, pb);
240 ++pa; // swap equal chars to the left
241 }
242 ++pb;
243 }
244 while (pc >= pb && (f = key(pc)) >= v)
245 { // go to the left as long as there are keys bigger or equal to v
246 if (f == v)
247 {
248 swap(pc, pd);
249 --pd; // swap equal chars to the right end
250 }
251 --pc;
252 }
253 if (pb > pc)
254 break;
255 swap(pb,
256 pc); // swap element > v (pb) to the third part and element < v (pc) to the second
257 ++pb;
258 --pc;
259 }
260 pn = p + n;
261 if ((s = pa - p) > (t = pb - pa))
262 s = t;
263 for (pl = p, pm = pb - s; s; --s, ++pl, ++pm)
264 swap(pl, pm);
265 if ((s = pd - pc) > (t = pn - pd - 1))
266 s = t;
267 for (pl = pb, pm = pn - s; s; --s, ++pl, ++pm)
268 swap(pl, pm);
269 s = pb - pa;
270 t = pd - pc;
271 if (pa > pb)
272 {
273 if (s > 0)
274 {
275 std::cout << "s=" << s << ">0 but should be <0; n=" << n << std::endl;
276 }
277 }
278 if (pc > pd)
279 {
280 if (t > 0)
281 {
282 std::cout << "t=" << t << ">0 but should be <0; n=" << n << std::endl;
283 }
284 }
285 if (s > 0)
286 sort_split(p, s);
287 update_group(p + s, p + n - t - 1);
288 if (t > 0)
289 sort_split(p + n - t, t);
290 }
291
292 /* Bucketsort for first iteration.
293 *
294 * Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
295 * at least once. x[n] is 0. (This is the corresponding output of transform.) k
296 * must be at most n+1. p is array of size n+1 whose contents are disregarded.
297 *
298 * Output: x is m_VV and p is m_SA after the initial sorting stage of the refined
299 suffix sorting algorithm.*/
300 void bucketsort(int_iter const & x, int_iter const & p, int64_t n, int64_t k)
301 {
302 int_iter pi;
303 int64_t i, d, g;
304 uint64_t c;
305
306 for (pi = p; pi < p + k; ++pi)
307 *pi = mark_neg(1); /* mark linked lists empty.*/
308 for (i = 0; i <= n; ++i)
309 {
310 x[i] = p[c = x[i]]; /* insert in linked list.*/
311 p[c] = i;
312 }
313 for (pi = p + k - 1, i = n; pi >= p; --pi)
314 {
315 d = x[c = *pi]; /* c is position, d is next in list.*/
316 x[c] = g = i; /* last position equals group number.*/
317 if (not_neg(d))
318 { /* if more than one element in group.*/
319 p[i--] = c; /* p is permutation for the sorted x.*/
320 do
321 {
322 d = x[c = d]; /* next in linked list.*/
323 x[c] = g; /* group number in x.*/
324 p[i--] = c; /* permutation in p.*/
325 }
326 while (not_neg(d));
327 }
328 else
329 p[i--] = mark_neg(1); /* one element, sorted group.*/
330 }
331 }
332
333public:
334 /* Transforms the alphabet of x by attempting to aggregate several symbols into
335 * one, while preserving the suffix order of x. The alphabet may also be
336 * compacted, so that x on output comprises all integers of the new alphabet
337 * with no skipped numbers.
338 *
339 * Input: x is an array of size n+1 whose first n elements are positive
340 * integers in the range l...k-1. p is array of size n+1, used for temporary
341 * storage. q controls aggregation and compaction by defining the maximum value
342 * for any symbol during transformation: q must be at least k-l; if q<=n,
343 * compaction is guaranteed; if k-l>n, compaction is never done; if q is
344 * INT_MAX, the maximum number of symbols are aggregated into one.
345 *
346 * Output: Returns an integer j in the range 1...q representing the size of the
347 * new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
348 set to the number of old symbols grouped into one. Only x[n] is 0.*/
349
350 int64_t transform(int_iter const & x, int_iter const & p, int64_t n, int64_t k, int64_t l, int64_t q)
351 {
352 if (!(q >= k - l))
353 {
354 std::cout << "q=" << q << " k-l=" << k - l << std::endl;
355 }
356 assert(q >= k - l);
357 DBG_OUT << "transform(n=" << n << ", k=" << k << ", l=" << l << ", q=" << q << ")" << std::endl;
358 uint64_t bb, cc, dd;
359 int64_t jj;
360 int_iter pi, pj;
361 int s = bits::hi(k - l) + (k > l); /* s is number of bits in old symbol.*/
362 uint8_t len = 0; /* len is for overflow checking.*/
363 m_rr = 0;
364 for (bb = dd = 0; (int)m_rr < n && (int)len < m_msb + 1 - s && (int64_t)(cc = dd << s | (k - l)) <= q;
365 ++m_rr, len += s)
366 {
367 bb = bb << s | (x[m_rr] - l + 1); /* bb is start of x in chunk alphabet.*/
368 dd = cc; /* dd is max symbol in chunk alphabet.*/
369 }
370 DBG_OUT << "m_rr=" << m_rr << std::endl;
371 uint64_t mm = (1ULL << (m_rr - 1) * s) - 1; /* mm masks off top old symbol from chunk.*/
372 x[n] = l - 1; /* emulate zero terminator.*/
373 if ((int64_t)dd <= n)
374 { /* if bucketing possible, compact alphabet.*/
375 for (pi = p; pi <= p + dd; ++pi)
376 *pi = 0; /* zero transformation table.*/
377 for (pi = x + m_rr, cc = bb; pi <= x + n; ++pi)
378 {
379 p[cc] = 1; /* mark used chunk symbol.*/
380 cc = (cc & mm) << s | (*pi - l + 1); /* shift in next old symbol in chunk.*/
381 }
382 for (uint64_t i = 1; i < m_rr; ++i)
383 { /* handle last r-1 positions.*/
384 p[cc] = 1; /* mark used chunk symbol.*/
385 cc = (cc & mm) << s; /* shift in next old symbol in chunk.*/
386 }
387 for (pi = p, jj = 1; pi <= p + dd; ++pi)
388 if (*pi)
389 *pi = jj++; /* j is new alphabet size.*/
390 for (pi = x, pj = x + m_rr, cc = bb; pj <= x + n; ++pi, ++pj)
391 {
392 *pi = p[cc]; /* transform to new alphabet.*/
393 cc = (cc & mm) << s | (*pj - l + 1); /* shift in next old symbol in chunk.*/
394 }
395 while (pi < x + n)
396 { /* handle last r-1 positions.*/
397 *pi++ = p[cc]; /* transform to new alphabet.*/
398 cc = (cc & mm) << s; /* shift right-end zero in chunk.*/
399 }
400 }
401 else
402 { /* bucketing not possible, don't compact.*/
403 for (pi = x, pj = x + m_rr, cc = bb; pj <= x + n; ++pi, ++pj)
404 {
405 *pi = cc; /* transform to new alphabet.*/
406 cc = (cc & mm) << s | (*pj - l + 1); /* shift in next old symbol in chunk.*/
407 }
408 while (pi < x + n)
409 { /* handle last r-1 positions.*/
410 *pi++ = cc; /* transform to new alphabet.*/
411 cc = (cc & mm) << s; /* shift right-end zero in chunk.*/
412 }
413 jj = dd + 1; /* new alphabet size.*/
414 }
415 x[n] = 0; /* end-of-string symbol is zero.*/
416 DBG_OUT << "end transformation jj=" << jj << std::endl;
417 return jj; /* return new alphabet size.*/
418 }
419
420 /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
421 * n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
422 * contents of x[n] is disregarded, the n-th symbol being regarded as
423 end-of-string smaller than all other symbols.*/
424 void sort(int_iter const & x, int_iter const & p, int64_t n, int64_t k, int64_t l)
425 {
426 int_iter pi, pk;
427 m_VV = x; /* set global values.*/
428 m_SA = p;
429 if (n >= k - l)
430 { /* if bucketing possible,*/
431 int64_t j = transform(m_VV, m_SA, n, k, l, n);
432 DBG_OUT << "begin bucketsort j=" << j << std::endl;
433 bucketsort(m_VV, m_SA, n, j); /* bucketsort on first r positions.*/
434 DBG_OUT << "end bucketsort" << std::endl;
435 }
436 else
437 {
438 transform(m_VV, m_SA, n, k, l, m_msb_mask - 1);
439 DBG_OUT << "initialize SA begin" << std::endl;
440 for (int64_t i = 0; i <= n; ++i)
441 m_SA[i] = i; /* initialize I with suffix numbers.*/
442 DBG_OUT << "initialize SA end" << std::endl;
443 m_hh = 0;
444 sort_split(m_SA, n + 1); /* quicksort on first r positions.*/
445 }
446 m_hh = m_rr; /* number of symbols aggregated by transform.*/
447 // while ( is_neg(*m_SA) and mark_pos(*m_SA) <= n) {
448 while (to_sign(*m_SA) >= -n)
449 {
450 // std::cout<<"m_hh="<<m_hh<<std::endl;
451 DBG_OUT << "SA = ";
452 // for(size_t iii=0; iii<=(size_t)n; ++iii){
453 // uint64_t D = *(m_SA+iii);
454 // printf("%c%lld ", is_neg(D)?'-':' ', mark_pos(D));
455 //}
456 DBG_OUT << std::endl;
457 DBG_OUT << "TEXT = ";
458 // for(size_t iii=0; iii<=(size_t)n; ++iii){
459 // uint64_t D = *(m_VV+iii);
460 // printf("%c%lld ", is_neg(D)?'-':' ', mark_pos(D));
461 //}
462 DBG_OUT << std::endl;
463 DBG_OUT << "*m_SA=" << to_sign(*m_SA) << std::endl;
464 // std::cout<<"mark_pos(*m_SA)="<<mark_pos(*m_SA)<<std::endl;
465 pi = m_SA; /* pi is first position of group.*/
466 int64_t sl = 0; /* sl is length of sorted groups.*/
467 DBG_OUT << "m_hh=" << m_hh << std::endl;
468 do
469 {
470 uint64_t s = *pi;
471 if (to_sign(s) < (int64_t)0)
472 {
473 pi += mark_pos(s); /* skip over sorted group.*/
474 sl += mark_pos(s); /* add length to sl.*/
475 }
476 else
477 {
478 if (sl)
479 {
480 *(pi - sl) = mark_neg(sl); /* combine sorted groups before pi.*/
481 sl = 0;
482 }
483 pk = m_SA + m_VV[s] + 1; /* pk-1 is last position of unsorted group.*/
484 sort_split(pi, pk - pi);
485 pi = pk; /* next group.*/
486 }
487 }
488 while ((pi - m_SA) <= n);
489 if (sl) /* if the array ends with a sorted group.*/
490 *(pi - sl) = mark_neg(sl); /* combine sorted groups at end of m_SA.*/
491 m_hh = 2 * m_hh; /* double sorted-depth.*/
492 DBG_OUT << "m_hh=" << m_hh << std::endl;
493 }
494 for (int64_t i = 0; i <= n; ++i)
495 { /* reconstruct suffix array from inverse.*/
496 m_SA[m_VV[i]] = i;
497 }
498 }
499
500 void do_sort(tIV & sa, tIV & x)
501 {
502 assert(x.size() > 0);
503 DBG_OUT << "x.width()=" << (int)x.width() << std::endl;
504 DBG_OUT << "x.size()=" << x.size() << std::endl;
505 DBG_OUT << "sa.width()=" << (int)sa.width() << std::endl;
506 DBG_OUT << "sa.size()=" << sa.size() << std::endl;
507 if (x.size() == 1)
508 {
509 sa = tIV(1, 0);
510 return;
511 }
512
513 int64_t max_symbol = 0, min_symbol = x.width() < 64 ? bits::lo_set[x.width()] : 0x7FFFFFFFFFFFFFFFLL;
514
515 for (size_type i = 0; i < x.size() - 1; ++i)
516 {
517 max_symbol = std::max(max_symbol, (int64_t)x[i]);
518 min_symbol = std::min(min_symbol, (int64_t)x[i]);
519 }
520
521 if (0 == min_symbol)
522 {
523 throw std::logic_error("Text contains 0-symbol. Suffix array can not be constructed.");
524 }
525 if (x[x.size() - 1] > 0)
526 {
527 throw std::logic_error("Last symbol is not 0-symbol. Suffix array can not be constructed.");
528 }
529 DBG_OUT << "sorter: min_symbol=" << min_symbol << std::endl;
530 DBG_OUT << "sorter: max_symbol=" << max_symbol << std::endl;
531
532 int64_t n = x.size() - 1;
533 DBG_OUT << "x.size()-1=" << x.size() - 1 << " n=" << n << std::endl;
534 uint8_t width = std::max(bits::hi(max_symbol) + 2, bits::hi(n + 1) + 2);
535 DBG_OUT << "sorter: width=" << (int)width << " max_symbol_width=" << bits::hi(max_symbol) + 1
536 << " n_width=" << bits::hi(n) << std::endl;
537 util::expand_width(x, width);
538 sa = x;
539 if (sa.width() < x.width())
540 {
541 throw std::logic_error("Fixed size suffix array is to small for the specified text.");
542 return;
543 }
544
545 m_msb = sa.width() - 1;
546 m_msb_mask = 1ULL << m_msb;
547 DBG_OUT << "sorter: m_msb=" << (int)m_msb << " m_msb_mask=" << m_msb_mask << std::endl;
548 sort(x.begin(), sa.begin(), x.size() - 1, max_symbol + 1, min_symbol);
549 }
550
551 void sort(tIV & sa, char const * file_name, uint8_t num_bytes)
552 {
553 DBG_OUT << "sorter: sort(" << file_name << ")" << std::endl;
554 DBG_OUT << "sizeof(int_vector<>::difference_type)=" << sizeof(int_vector<>::difference_type) << std::endl;
555 util::clear(sa); // free space for sa
556 tIV x;
557 if (num_bytes == 0 and typeid(typename tIV::reference) == typeid(uint64_t))
558 {
559 DBG_OUT << "sorter: use int_vector<64>" << std::endl;
560 int_vector<> temp;
561 load_vector_from_file(temp, file_name, num_bytes);
562 x.resize(temp.size());
563 for (size_type i = 0; i < temp.size(); ++i)
564 x[i] = temp[i];
565 }
566 else
567 {
568 load_vector_from_file(x, file_name, num_bytes);
570 }
571 do_sort(sa, x);
572 }
573
574 template <class t_vec>
575 void sort(tIV & sa, t_vec & text)
576 {
577 tIV x;
578 x.resize(text.size());
579 for (size_type i = 0; i < text.size(); ++i)
580 x[i] = text[i];
581 do_sort(sa, x);
582 }
583};
584
585} // end namespace qsufsort
586
587} // end namespace sdsl
588
589#endif
bits.hpp contains the sdsl::bits class.
ptrdiff_t difference_type
int_vector< 64 >::size_type size() const noexcept
int64_t transform(int_iter const &x, int_iter const &p, int64_t n, int64_t k, int64_t l, int64_t q)
Definition qsufsort.hpp:350
void sort(int_iter const &x, int_iter const &p, int64_t n, int64_t k, int64_t l)
Definition qsufsort.hpp:424
void do_sort(tIV &sa, tIV &x)
Definition qsufsort.hpp:500
void sort(tIV &sa, t_vec &text)
Definition qsufsort.hpp:575
void sort(tIV &sa, char const *file_name, uint8_t num_bytes)
Definition qsufsort.hpp:551
int_vector.hpp contains the sdsl::int_vector class.
io.hpp contains some methods for reading/writing sdsl structures.
void construct_sa(int_vector_type &sa, char const *file, uint8_t num_bytes)
Construct a suffix array for the sequence stored in a file.
Definition qsufsort.hpp:72
void expand_width(t_int_vec &v, uint8_t new_width)
Expands the integer width to new_width >= v.width()
Definition util.hpp:530
void bit_compress(t_int_vec &v)
Bit compress the int_vector.
Definition util.hpp:502
Namespace for the succinct data structure library.
bool load_vector_from_file(t_int_vec &v, std::string const &file, uint8_t num_bytes=1, uint8_t max_int_width=64)
from disk.
Definition io.hpp:192
#define DBG_OUT
Definition qsufsort.hpp:26
static constexpr uint64_t lo_set[65]
lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.
Definition bits.hpp:194
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition bits.hpp:653
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.