SDSL 3.0.3
Succinct Data Structure Library
Loading...
Searching...
No Matches
wt_ap.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
9#ifndef INCLUDED_SDSL_WT_AP
10#define INCLUDED_SDSL_WT_AP
11
12#include <algorithm>
13#include <assert.h>
14#include <iosfwd>
15#include <iterator>
16#include <memory>
17#include <stdint.h>
18#include <string>
19#include <tuple>
20#include <type_traits>
21#include <utility>
22#include <vector>
23
24#include <sdsl/bits.hpp>
25#include <sdsl/cereal.hpp>
26#include <sdsl/int_vector.hpp>
28#include <sdsl/io.hpp>
29#include <sdsl/iterators.hpp>
31#include <sdsl/ram_fs.hpp>
35#include <sdsl/util.hpp>
36#include <sdsl/wm_int.hpp>
37#include <sdsl/wt_huff.hpp>
38
40namespace sdsl
41{
42
44
57template <class t_wt_byte = wt_huff<bit_vector, rank_support_v5<>>, class t_wt_int = wm_int<>>
58class wt_ap
59{
60 static_assert(std::is_same<typename index_tag<t_wt_byte>::type, wt_tag>::value,
61 "First template argument has to be a wavelet tree.");
62 static_assert(std::is_same<typename index_tag<t_wt_int>::type, wt_tag>::value,
63 "Second template argument has to be a wavelet tree.");
64
65public:
71 typedef t_wt_byte wt_byte_type;
72 typedef t_wt_int wt_int_type;
75 enum
76 {
77 lex_ordered = 0
78 };
79
80protected:
82 value_type m_sigma = 0; //<- \f$ |\Sigma| \f$
87 std::vector<wt_int_type> m_offset;
88
89private:
90 // retrieves a character's class and offset - if the character exists in the text
91 inline std::tuple<bool, value_type, value_type> try_get_char_class_offset(value_type c) const
92 {
93 if (c >= m_char2class.size())
94 { // c is greater than any symbol in text
95 return std::make_tuple(false, 0, 0);
96 }
97 auto offset_class = m_char2class.inverse_select(c);
98 if (offset_class.second == m_class_cnt)
99 { // c never occurs in text
100 return std::make_tuple(false, 0, 0);
101 }
102 return std::make_tuple(true, offset_class.second, offset_class.first);
103 }
104
105public:
107
110 {}
111
113
118 template <typename t_it>
119 wt_ap(t_it begin, t_it end, std::string tmp_dir = ram_file_name("")) : m_size(std::distance(begin, end))
120 {
121 const uint8_t wt_byte_width = wt_byte_type::alphabet_category::WIDTH;
122 const uint8_t wt_int_width = wt_int_type::alphabet_category::WIDTH;
123
124 // calculate effective sigma and character frequencies
125 value_type max_symbol = 0;
126 std::vector<std::pair<size_type, value_type>> char_freq;
127 value_type pseudo_entries = 0;
128 {
129 auto event = memory_monitor::event("char freq");
130 for (auto it = begin; it != end; ++it)
131 {
132 value_type element = *it;
133 while (element >= max_symbol)
134 {
135 char_freq.emplace_back(0, max_symbol);
136 max_symbol++;
137 pseudo_entries++;
138 }
139 if (char_freq[element].first == 0)
140 {
141 pseudo_entries--;
142 }
143 char_freq[element].first++;
144 }
145 std::sort(char_freq.rbegin(), char_freq.rend());
146 m_sigma = max_symbol - pseudo_entries;
147 }
148
149 m_singleton_class_cnt = std::min(max_symbol, (value_type)bits::hi(m_sigma));
151
152 std::vector<std::pair<std::string, int_vector_buffer<wt_int_width>>> temp_file_offset_buffers;
153
154 // assign character classes
155 int_vector<wt_byte_width> m_char2class_buffer(max_symbol, m_class_cnt, bits::hi(m_class_cnt + 1) + 1);
156 for (value_type i = 0; i < m_singleton_class_cnt; ++i)
157 {
158 m_char2class_buffer[char_freq[i].second] = i;
159 }
160 value_type current_symbol = m_singleton_class_cnt;
161 value_type class_size = 1;
162 {
163 auto event = memory_monitor::event("char2class");
165 {
166 class_size <<= 1;
167 value_type offset = 0;
168 for (; offset < class_size && current_symbol < m_sigma; ++offset, ++current_symbol)
169 {
170 m_char2class_buffer[char_freq[current_symbol].second] = i;
171 }
172
173 std::string temp_file_offset = tmp_dir + "_wt_ap_offset_" + util::to_string(i - m_singleton_class_cnt)
175 temp_file_offset_buffers.emplace_back(temp_file_offset,
176 int_vector_buffer<wt_int_width>(temp_file_offset,
177 std::ios::out,
178 1024 * 1024,
179 bits::hi(offset) + 1));
180 }
181 char_freq.clear();
182 construct_im(m_char2class, m_char2class_buffer);
183 }
184
185 // calculate text-order classes and offsets
186 std::string temp_file_class =
187 tmp_dir + "_wt_ap_class_" + util::to_string(util::pid()) + "_" + util::to_string(util::id());
188 int_vector_buffer<wt_byte_width> class_buffer(temp_file_class,
189 std::ios::out,
190 1024 * 1024,
191 bits::hi(m_class_cnt) + 1);
192 {
193 auto event = memory_monitor::event("write class and offset");
194 for (auto it = begin; it != end; ++it)
195 {
196 value_type ch = *it;
197 value_type cl = m_char2class_buffer[ch];
198 class_buffer.push_back(cl);
199 if (cl >= m_singleton_class_cnt)
200 {
201 value_type offset = m_char2class.rank(ch, cl);
203 temp_file_offset_buffers[cl].second.push_back(offset);
204 }
205 }
206 class_buffer.close();
207 }
208
209 {
210 auto event = memory_monitor::event("class WT");
211 int_vector_buffer<wt_byte_width> class_buffer(temp_file_class);
212 m_class = wt_byte_type(class_buffer.begin(), class_buffer.end(), tmp_dir);
213 }
214 sdsl::remove(temp_file_class);
215 {
216 auto event = memory_monitor::event("offset WTs");
218 for (value_type i = 0; i < m_class_cnt - m_singleton_class_cnt; ++i)
219 {
220 auto & temp_file_offset_buffer = temp_file_offset_buffers[i];
221 temp_file_offset_buffer.second.close();
222 {
223 int_vector_buffer<wt_int_width> offset_buffer(temp_file_offset_buffer.first);
224 m_offset[i] = wt_int_type(offset_buffer.begin(), offset_buffer.end(), tmp_dir);
225 }
226 sdsl::remove(temp_file_offset_buffer.first);
227 }
228 }
229 }
230
241
244 {
245 *this = std::move(wt);
246 }
247
249 wt_ap & operator=(wt_ap const & wt)
250 {
251 if (this != &wt)
252 {
253 wt_ap tmp(wt);
254 *this = std::move(tmp);
255 }
256 return *this;
257 }
258
261 {
262 if (this != &wt)
263 {
264 m_size = wt.m_size;
265 m_sigma = wt.m_sigma;
266 m_singleton_class_cnt = wt.m_singleton_class_cnt;
267 m_class_cnt = wt.m_class_cnt;
268 m_char2class = std::move(wt.m_char2class);
269 m_class = std::move(wt.m_class);
270 m_offset = std::move(wt.m_offset);
271 }
272 return *this;
273 }
274
277 {
278 return m_size;
279 }
280
282 bool empty() const
283 {
284 return m_size == 0;
285 }
286
288
298 {
299 assert(i < size());
300 auto textoffset_class = m_class.inverse_select(i);
301 auto cl = textoffset_class.second;
302 value_type offset =
303 cl < m_singleton_class_cnt ? 0 : m_offset[cl - m_singleton_class_cnt][textoffset_class.first];
304 return m_char2class.select(offset + 1, cl);
305 };
306
308
320 {
321 assert(i <= size());
322 auto success_class_offset = try_get_char_class_offset(c);
323 if (!std::get<0>(success_class_offset))
324 {
325 return 0;
326 }
327 auto cl = std::get<1>(success_class_offset);
328 auto offset = std::get<2>(success_class_offset);
329 size_type count = m_class.rank(i, cl);
330 return cl < m_singleton_class_cnt ? count : m_offset[cl - m_singleton_class_cnt].rank(count, offset);
331 };
332
334
340 std::pair<size_type, value_type> inverse_select(size_type i) const
341 {
342 assert(i < size());
343
344 auto textoffset_class = m_class.inverse_select(i);
345 auto textoffset = textoffset_class.first;
346 auto cl = textoffset_class.second;
347 if (cl < m_singleton_class_cnt)
348 {
349 return std::make_pair(textoffset, m_char2class.select(1, cl));
350 }
351 auto class_result = m_offset[cl - m_singleton_class_cnt].inverse_select(textoffset);
352 return std::make_pair(class_result.first, m_char2class.select(class_result.second + 1, cl));
353 }
354
356
367 {
368 assert(1 <= i and i <= rank(size(), c));
369 auto success_class_offset = try_get_char_class_offset(c);
370 if (!std::get<0>(success_class_offset))
371 {
372 return m_size;
373 }
374 auto cl = std::get<1>(success_class_offset);
375 auto offset = std::get<2>(success_class_offset);
376 size_type text_offset =
377 cl < m_singleton_class_cnt ? i : 1 + m_offset[cl - m_singleton_class_cnt].select(i, offset);
378 return m_class.select(text_offset, cl);
379 };
380
382 size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
383 {
384 structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
385 size_type written_bytes = 0;
386 written_bytes += write_member(m_size, out, child, "size");
387 written_bytes += write_member(m_sigma, out, child, "sigma");
388 written_bytes += write_member(m_singleton_class_cnt, out, child, "singleton_classes");
389 written_bytes += write_member(m_class_cnt, out, child, "classes");
390 written_bytes += m_char2class.serialize(out, child, "char2class");
391 written_bytes += m_class.serialize(out, child, "class");
392 for (value_type i = 0; i < m_offset.size(); ++i)
393 {
394 written_bytes += m_offset[i].serialize(out, child, "offset");
395 }
396 structure_tree::add_size(child, written_bytes);
397 return written_bytes;
398 }
399
401 void load(std::istream & in)
402 {
403 read_member(m_size, in);
404 read_member(m_sigma, in);
407 m_char2class.load(in);
408 m_class.load(in);
410 m_offset.resize(offset_size);
411 for (value_type i = 0; i < offset_size; ++i)
412 {
413 m_offset[i].load(in);
414 }
415 }
416
418 template <typename archive_t>
419 void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
420 {
421 ar(CEREAL_NVP(m_size));
422 ar(CEREAL_NVP(m_sigma));
426 ar(CEREAL_NVP(m_class));
427 ar(CEREAL_NVP(m_offset));
428 }
429
431 template <typename archive_t>
432 void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
433 {
434 ar(CEREAL_NVP(m_size));
435 ar(CEREAL_NVP(m_sigma));
439 ar(CEREAL_NVP(m_class));
440 ar(CEREAL_NVP(m_offset));
441 }
442
444 {
445 return {this, 0};
446 };
448 {
449 return {this, size()};
450 };
452 {
453 return {this, 0};
454 };
456 {
457 return {this, size()};
458 };
459
461 bool operator==(wt_ap const & other) const noexcept
462 {
463 return (m_size == other.m_size) && (m_sigma == other.m_sigma)
464 && (m_singleton_class_cnt == other.m_singleton_class_cnt) && (m_class_cnt == other.m_class_cnt)
465 && (m_char2class == other.m_char2class) && (m_class == other.m_class) && (m_offset == other.m_offset);
466 }
467
469 bool operator!=(wt_ap const & other) const noexcept
470 {
471 return !(*this == other);
472 }
473};
474
475} // end namespace sdsl
476#endif
bits.hpp contains the sdsl::bits class.
cereal.hpp offers cereal support
#define CEREAL_NVP(X)
Definition cereal.hpp:31
void close(bool remove_file=false)
Close the int_vector_buffer.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
A generic vector class for integers of width .
static mm_event_proxy event(std::string const &name)
Generic iterator for a random access container.
Definition iterators.hpp:24
static structure_tree_node * add_child(structure_tree_node *v, std::string const &name, std::string const &type)
static void add_size(structure_tree_node *v, uint64_t value)
A wavelet tree class for integer sequences.
Definition wt_ap.hpp:59
std::vector< wt_int_type > m_offset
Definition wt_ap.hpp:87
wt_byte_type m_class
Definition wt_ap.hpp:86
t_wt_int wt_int_type
Definition wt_ap.hpp:72
iterator begin()
Definition wt_ap.hpp:443
const_iterator end() const
Definition wt_ap.hpp:455
bool operator!=(wt_ap const &other) const noexcept
Inequality operator.
Definition wt_ap.hpp:469
wt_ap(wt_ap const &wt)
Copy constructor.
Definition wt_ap.hpp:232
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Serialise (save) via cereal.
Definition wt_ap.hpp:419
bool operator==(wt_ap const &other) const noexcept
Equality operator.
Definition wt_ap.hpp:461
wt_tag index_category
Definition wt_ap.hpp:73
size_type m_size
Definition wt_ap.hpp:81
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Load via cereal.
Definition wt_ap.hpp:432
size_type const & sigma
Definition wt_ap.hpp:106
wt_ap & operator=(wt_ap &&wt)
Assignment move operator.
Definition wt_ap.hpp:260
wt_ap(wt_ap &&wt)
Move copy constructor.
Definition wt_ap.hpp:243
const_iterator end()
Definition wt_ap.hpp:447
size_type select(size_type i, value_type c) const
Calculates the i-th occurrence of the symbol c in the supported vector.
Definition wt_ap.hpp:366
@ lex_ordered
Definition wt_ap.hpp:77
value_type m_singleton_class_cnt
Definition wt_ap.hpp:83
wt_byte_type m_char2class
Definition wt_ap.hpp:85
wt_ap(t_it begin, t_it end, std::string tmp_dir=ram_file_name(""))
Construct the wavelet tree from a sequence defined by two interators.
Definition wt_ap.hpp:119
bool empty() const
Returns whether the wavelet tree contains no data.
Definition wt_ap.hpp:282
size_type rank(size_type i, value_type c) const
Calculates how many symbols c are in the prefix [0..i-1] of the supported vector.
Definition wt_ap.hpp:319
void load(std::istream &in)
Loads the data structure from the given istream.
Definition wt_ap.hpp:401
int_vector ::difference_type difference_type
Definition wt_ap.hpp:68
int_vector ::size_type size_type
Definition wt_ap.hpp:66
iterator begin() const
Definition wt_ap.hpp:451
value_type operator[](size_type i) const
Recovers the i-th symbol of the original vector.
Definition wt_ap.hpp:297
wt_ap & operator=(wt_ap const &wt)
Assignment operator.
Definition wt_ap.hpp:249
int_alphabet_tag alphabet_category
Definition wt_ap.hpp:74
wt_ap()
Default constructor.
Definition wt_ap.hpp:109
t_wt_byte wt_byte_type
Definition wt_ap.hpp:71
const_iterator iterator
Definition wt_ap.hpp:70
value_type m_class_cnt
Definition wt_ap.hpp:84
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the data structure into the given ostream.
Definition wt_ap.hpp:382
int_vector ::value_type value_type
Definition wt_ap.hpp:67
random_access_const_iterator< wt_ap > const_iterator
Definition wt_ap.hpp:69
value_type m_sigma
Definition wt_ap.hpp:82
size_type size() const
Returns the size of the original vector.
Definition wt_ap.hpp:276
std::pair< size_type, value_type > inverse_select(size_type i) const
Calculates how many occurrences of symbol wt[i] are in the prefix [0..i-1] of the original sequence.
Definition wt_ap.hpp:340
int_vector.hpp contains the sdsl::int_vector class.
int_vector_buffer.hpp contains the sdsl::int_vector_buffer class.
io.hpp contains some methods for reading/writing sdsl structures.
iterators.hpp contains an generic iterator for random access containers.
memory_tracking.hpp contains two function for allocating and deallocating memory
uint64_t id()
uint64_t pid()
std::string to_string(T const &t, int w=1)
Namespace for the succinct data structure library.
int remove(std::string const &)
Remove a file.
Definition ram_fs.hpp:221
std::string ram_file_name(std::string const &file)
Returns the corresponding RAM-file name for file.
Definition ram_fs.hpp:195
size_t write_member(T const &t, std::ostream &out, sdsl::structure_tree_node *v=nullptr, std::string name="")
Definition io.hpp:94
uint64_t count(t_k2_treap const &treap, k2_treap_ns::point_type p1, k2_treap_ns::point_type p2)
Count how many points are in the rectangle (p1,p2)
void read_member(T &t, std::istream &in)
Definition io.hpp:119
void construct_im(t_index &idx, t_data &&data, uint8_t num_bytes=0)
Definition construct.hpp:69
ram_fs.hpp
rank_support_v5.hpp contains rank_support_v5.5
Contains declarations and definitions of data structure concepts.
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition bits.hpp:653
structure_tree.hpp contains a helper class which can represent the memory structure of a class.
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wm_int.hpp contains a specialized class for a wavelet tree for sequences over large alphabets.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.