tlx
Loading...
Searching...
No Matches
multikey_quicksort.hpp
Go to the documentation of this file.
1/*******************************************************************************
2 * tlx/sort/strings/multikey_quicksort.hpp
3 *
4 * Generic multikey quicksort for strings. This is an internal implementation
5 * header, see tlx/sort/strings.hpp for public front-end functions.
6 *
7 * Based on multikey quicksort, a quick sort algorithm for arrays of character
8 * strings by Bentley and Sedgewick.
9 *
10 * J. Bentley and R. Sedgewick. "Fast Algorithms for Sorting and Searching
11 * Strings." In Proceedings of 8th Annual ACM-SIAM Symposium on Discrete
12 * Algorithms, 1997.
13 *
14 * http://www.cs.princeton.edu/~rs/strings/index.html
15 *
16 * Part of tlx - http://panthema.net/tlx
17 *
18 * Copyright (C) 2015-2018 Timo Bingmann <tb@panthema.net>
19 *
20 * All rights reserved. Published under the Boost Software License, Version 1.0
21 ******************************************************************************/
22
23#ifndef TLX_SORT_STRINGS_MULTIKEY_QUICKSORT_HEADER
24#define TLX_SORT_STRINGS_MULTIKEY_QUICKSORT_HEADER
25
27
28#include <algorithm>
29#include <cstddef>
30#include <utility>
31
32namespace tlx {
33
34//! \addtogroup tlx_sort
35//! \{
36
37namespace sort_strings_detail {
38
39template <typename StringSet>
40static inline void vec_swap(
41 typename StringSet::Iterator a, typename StringSet::Iterator b, size_t n) {
42 while (n-- > 0)
43 std::swap(*a++, *b++);
44}
45
46template <typename StringSet>
47static inline typename StringSet::Iterator med3func(
48 const StringSet& ss,
49 typename StringSet::Iterator a, typename StringSet::Iterator b,
50 typename StringSet::Iterator c, size_t depth) {
51 typename StringSet::Char va = ss.get_char(*a, depth);
52 typename StringSet::Char vb = ss.get_char(*b, depth);
53 if (va == vb)
54 return a;
55 typename StringSet::Char vc = ss.get_char(*c, depth);
56 if (vc == va || vc == vb)
57 return c;
58 return va < vb
59 ? (vb < vc ? b : (va < vc ? c : a))
60 : (vb > vc ? b : (va < vc ? a : c));
61}
62
63/*!
64 * Generic multikey quicksort for strings. Based on multikey quicksort, a quick
65 * sort algorithm for arrays of character strings by Bentley and Sedgewick. This
66 * method requires up to O(maxlcp) memory due to the recursion stack and it runs
67 * in expected time O(D + n log n) and worst-case time O(D + n^2).
68 *
69 * J. Bentley and R. Sedgewick. Fast algorithms for sorting and searching
70 * strings. In Proceedings of 8th Annual ACM-SIAM Symposium on Discrete
71 * Algorithms, 1997.
72 */
73template <typename StringPtr>
74static inline void multikey_quicksort(
75 const StringPtr& strptr, size_t depth, size_t memory) {
76
77 typedef typename StringPtr::StringSet StringSet;
78 typedef typename StringSet::Iterator Iterator;
79
80 const StringSet& ss = strptr.active();
81 const Iterator a = ss.begin();
82 size_t n = ss.size();
83
84 // try to estimate the amount of memory in a stack frame
85 static const size_t memory_use =
86 2 * sizeof(size_t) + sizeof(StringSet) + 5 * sizeof(Iterator);
87
88 if (n < 32 || (memory != 0 && memory < memory_use + 1)) {
89 return insertion_sort(strptr, depth, memory);
90 }
91
92 ptrdiff_t r;
93 Iterator pa, pb, pc, pd, pn;
94
95 {
96 Iterator pl = a;
97 Iterator pm = a + (n / 2);
98 pn = a + (n - 1);
99 if (n > 30) {
100 // on big arrays: pseudomedian of 9
101 size_t d = (n / 8);
102 pl = med3func(ss, pl, pl + d, pl + 2 * d, depth);
103 pm = med3func(ss, pm - d, pm, pm + d, depth);
104 pn = med3func(ss, pn - 2 * d, pn - d, pn, depth);
105 }
106 pm = med3func(ss, pl, pm, pn, depth);
107 std::swap(*a, *pm);
108 int pivot = ss.get_char(*a, depth);
109 pa = pb = a + 1;
110 pc = pd = a + n - 1;
111 for ( ; ; ) {
112 while (pb <= pc && (r = static_cast<int>(ss.get_char(*pb, depth)) - pivot) <= 0) {
113 if (r == 0) std::swap(*pa++, *pb);
114 pb++;
115 }
116 while (pb <= pc && (r = static_cast<int>(ss.get_char(*pc, depth)) - pivot) >= 0) {
117 if (r == 0) std::swap(*pc, *pd--);
118 pc--;
119 }
120 if (pb > pc) break;
121 std::swap(*pb++, *pc--);
122 }
123 pn = a + n;
124
125 size_t pe_start_index, pe_end_index;
126 r = std::min(pa - a, pb - pa);
127 vec_swap<StringSet>(a, pb - r, r);
128 pe_start_index = r;
129 r = std::min(pd - pc, pn - pd - 1);
130 pe_end_index = (pn - a) - r;
131 vec_swap<StringSet>(pb, pn - r, r);
132 if (pivot == 0) {
133 for (size_t it = pe_start_index + 1; it < pe_end_index; ++it)
134 strptr.set_lcp(it, depth);
135 }
136 }
137
138 r = pb - pa;
139 if (r > 0) {
140 strptr.set_lcp(a - ss.begin() + r, depth);
141 }
142 if (r > 1) {
143 multikey_quicksort(strptr.sub(a - ss.begin(), r),
144 depth, memory - memory_use);
145 }
146 if (ss.get_char(*(a + r), depth) != 0) {
148 strptr.sub(a - ss.begin() + r, (pa - a) + (pn - pd - 1)),
149 depth + 1, memory - memory_use);
150 }
151 r = pd - pc;
152 if (r > 0) {
153 strptr.set_lcp(a - ss.begin() + n - r, depth);
154 }
155 if ((r = pd - pc) > 1) {
156 multikey_quicksort(strptr.sub(a - ss.begin() + n - r, r),
157 depth, memory - memory_use);
158 }
159}
160
161} // namespace sort_strings_detail
162
163//! \}
164
165} // namespace tlx
166
167#endif // !TLX_SORT_STRINGS_MULTIKEY_QUICKSORT_HEADER
168
169/******************************************************************************/
Objectified string array pointer array.
const StringSet & active() const
return currently active array
StringPtr sub(size_t offset, size_t sub_size) const
Advance (both) pointers by given offset, return sub-array.
void set_lcp(size_t, const LcpType &) const
set the i-th lcp to v and check its value
static void multikey_quicksort(const StringPtr &strptr, size_t depth, size_t memory)
Generic multikey quicksort for strings.
static enable_if<!StringPtr::with_lcp, void >::type insertion_sort(const StringPtr &strptr, size_t depth, size_t)
Generic insertion sort for abstract string sets.
static StringSet::Iterator med3func(const StringSet &ss, typename StringSet::Iterator a, typename StringSet::Iterator b, typename StringSet::Iterator c, size_t depth)
static void vec_swap(typename StringSet::Iterator a, typename StringSet::Iterator b, size_t n)