SDSL 3.0.3
Succinct Data Structure Library
Loading...
Searching...
No Matches
construct.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
9#ifndef INCLUDED_SDSL_CONSTRUCT
10#define INCLUDED_SDSL_CONSTRUCT
11
12#include <iosfwd>
13#include <stdexcept>
14#include <stdint.h>
15#include <string>
16#include <type_traits>
17
18#include <sdsl/config.hpp>
21#include <sdsl/construct_sa.hpp>
22#include <sdsl/int_vector.hpp>
25#include <sdsl/io.hpp>
27#include <sdsl/ram_fs.hpp>
29#include <sdsl/util.hpp>
30
31namespace sdsl
32{
33
34template <class int_vector>
35bool contains_no_zero_symbol(int_vector const & text, std::string const & file)
36{
37 for (int_vector_size_type i = 0; i < text.size(); ++i)
38 {
39 if ((uint64_t)0 == text[i])
40 {
41 throw std::logic_error(std::string("Error: File \"") + file + "\" contains zero symbol.");
42 return false;
43 }
44 }
45 return true;
46}
47
48template <class int_vector>
50{
51 text.resize(text.size() + 1);
52 text[text.size() - 1] = 0;
53}
54
55template <class t_index>
56void construct(t_index & idx, std::string file, uint8_t num_bytes = 0, bool move_input = false)
57{
58 tMSS file_map;
59 cache_config config;
60 if (is_ram_file(file))
61 {
62 config.dir = "@";
63 config.delete_data = move_input;
64 }
65 construct(idx, file, config, num_bytes);
66}
67
68template <class t_index, class t_data>
69void construct_im(t_index & idx, t_data && data, uint8_t num_bytes = 0)
70{
73 construct(idx, tmp_file, num_bytes, std::is_rvalue_reference<t_data &&>::value);
75}
76
78
87template <class t_index>
88void construct(t_index & idx, std::string const & file, cache_config & config, uint8_t num_bytes = 0)
89{
90 // delegate to CSA or CST construction
91 typename t_index::index_category index_tag;
92 construct(idx, file, config, num_bytes, index_tag);
93}
94
95// Specialization for WTs
96template <class t_index>
97void construct(t_index & idx, std::string const & file, cache_config & config, uint8_t num_bytes, wt_tag)
98{
99 auto event = memory_monitor::event("construct wavelet tree");
100 if ((t_index::alphabet_category::WIDTH == 8 and num_bytes <= 1)
101 or (t_index::alphabet_category::WIDTH == 0 and num_bytes != 'd'))
102 {
104 std::ios::in,
105 1024 * 1024,
106 num_bytes * 8,
107 (bool)num_bytes);
108 idx = t_index(text_buf.begin(), text_buf.end(), config.dir);
109 }
110 else
111 {
113 load_vector_from_file(text, file, num_bytes);
114 std::string tmp_key = util::to_string(util::pid()) + "_" + util::to_string(util::id());
115 std::string tmp_file_name = cache_file_name(tmp_key, config);
116 store_to_file(text, tmp_file_name);
117 util::clear(text);
118 {
120 idx = t_index(text_buf.begin(), text_buf.end(), config.dir);
121 }
122 sdsl::remove(tmp_file_name);
123 }
124}
125
126// Specialization for CSAs
127template <class t_index>
128void construct(t_index & idx, std::string const & file, cache_config & config, uint8_t num_bytes, csa_tag)
129{
130 auto event = memory_monitor::event("construct CSA");
131 constexpr uint8_t width = t_index::alphabet_category::WIDTH;
132 char const * KEY_TEXT = key_text_trait<width>::KEY_TEXT;
133 char const * KEY_BWT = key_bwt_trait<width>::KEY_BWT;
135 {
136 auto event = memory_monitor::event("parse input text");
137 // (1) check, if the text is cached
138 if (!cache_file_exists(KEY_TEXT, config))
139 {
140 text_type text;
141 load_vector_from_file(text, file, num_bytes);
142 if (contains_no_zero_symbol(text, file))
143 {
144 if (!is_ram_file(file))
145 {
146 append_zero_symbol(text);
147 store_to_cache(text, KEY_TEXT, config);
148 }
149 else
150 {
151 auto text_mapper = write_out_mapper<width>::create(cache_file_name(KEY_TEXT, config),
152 text.size() + 1,
153 text.width());
154 std::copy(text.begin(), text.end(), text_mapper.begin());
155 text_mapper[text.size()] = 0;
156 }
157 }
158 }
159 register_cache_file(KEY_TEXT, config);
160 }
161 if (config.delete_data)
162 {
163 sdsl::remove(file);
164 }
165 {
166 // (2) check, if the suffix array is cached
167 auto event = memory_monitor::event("SA");
168 if (!cache_file_exists(conf::KEY_SA, config))
169 {
170 construct_sa<t_index::alphabet_category::WIDTH>(config);
171 }
173 }
174 {
175 // (3) construct BWT
176 auto event = memory_monitor::event("BWT");
177 if (!cache_file_exists(KEY_BWT, config))
178 {
179 construct_bwt<t_index::alphabet_category::WIDTH>(config);
180 }
181 register_cache_file(KEY_BWT, config);
182 }
183 {
184 // (4) use BWT to construct the CSA
185 auto event = memory_monitor::event("construct CSA");
186 idx = t_index(config);
187 }
188 if (config.delete_files)
189 {
190 auto event = memory_monitor::event("delete temporary files");
191 util::delete_all_files(config.file_map);
192 }
193}
194
195// Specialization for standalone LCPs
196template <class t_index, uint8_t t_width>
197void construct(t_index & idx, std::string const & file, cache_config & config, uint8_t num_bytes, lcp_tag)
198{
199 auto event = memory_monitor::event("construct compressed LCP");
200 char const * KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;
201 typedef int_vector<t_width> text_type;
202 {
203 // (2) check, if the longest common prefix array is cached
204 auto event = memory_monitor::event("LCP");
205 if (!cache_file_exists(conf::KEY_LCP, config))
206 {
207 {
208 auto event = memory_monitor::event("parse input text");
209 // (1) check, if the text is cached
210 if (!cache_file_exists(KEY_TEXT, config))
211 {
212 text_type text;
213 load_vector_from_file(text, file, num_bytes);
214 if (contains_no_zero_symbol(text, file))
215 {
216 append_zero_symbol(text);
217 store_to_cache(text, KEY_TEXT, config);
218 }
219 }
220 register_cache_file(KEY_TEXT, config);
221 }
222 {
223 // (2) check, if the suffix array is cached
224 auto event = memory_monitor::event("SA");
225 if (!cache_file_exists(conf::KEY_SA, config))
226 {
227 construct_sa<t_width>(config);
228 }
230 }
231 if (t_width == 8)
232 {
234 }
235 else
236 {
237 construct_lcp_PHI<t_width>(config);
238 }
239 }
241 }
242 {
243 auto event = memory_monitor::event("compressed LCP");
244 idx = t_index(config);
245 }
246 if (config.delete_files)
247 {
248 auto event = memory_monitor::event("delete temporary files");
249 util::delete_all_files(config.file_map);
250 }
251}
252
253// Specialization for standalone LCPs
254template <class t_index>
255void construct(t_index & idx, std::string const & file, cache_config & config, uint8_t num_bytes, lcp_tag tag)
256{
257 if (1 == num_bytes)
258 {
259 construct<t_index, 8>(idx, file, config, num_bytes, tag);
260 }
261 else
262 {
263 construct<t_index, 0>(idx, file, config, num_bytes, tag);
264 }
265}
266
267// Specialization for CSTs
268template <class t_index>
269void construct(t_index & idx, std::string const & file, cache_config & config, uint8_t num_bytes, cst_tag)
270{
271 auto event = memory_monitor::event("construct CST");
274 csa_tag csa_t;
275 {
276 // (1) check, if the compressed suffix array is cached
277 typename t_index::csa_type csa;
278 if (!cache_file_exists(std::string(conf::KEY_CSA) + "_" + util::class_to_hash(csa), config))
279 {
280 cache_config csa_config(false, config.dir, config.id, config.file_map);
281 construct(csa, file, csa_config, num_bytes, csa_t);
282 auto event = memory_monitor::event("store CSA");
283 config.file_map = csa_config.file_map;
284 store_to_cache(csa, std::string(conf::KEY_CSA) + "_" + util::class_to_hash(csa), config);
285 }
286 register_cache_file(std::string(conf::KEY_CSA) + "_" + util::class_to_hash(csa), config);
287 }
288 {
289 // (2) check, if the longest common prefix array is cached
290 auto event = memory_monitor::event("LCP");
291 register_cache_file(KEY_TEXT, config);
292 register_cache_file(KEY_BWT, config);
294 if (!cache_file_exists(conf::KEY_LCP, config))
295 {
296 if (t_index::alphabet_category::WIDTH == 8)
297 {
299 }
300 else
301 {
302 construct_lcp_PHI<t_index::alphabet_category::WIDTH>(config);
303 }
304 }
306 }
307 {
308 auto event = memory_monitor::event("CST");
309 idx = t_index(config);
310 }
311 if (config.delete_files)
312 {
313 auto event = memory_monitor::event("delete temporary files");
314 util::delete_all_files(config.file_map);
315 }
316}
317
318} // end namespace sdsl
319#endif
A generic vector class for integers of width .
size_type size() const noexcept
The number of elements in the int_vector.
void resize(const size_type size)
Resize the int_vector in terms of elements.
static mm_event_proxy event(std::string const &name)
static int_vector_mapper< t_width > create(std::string const &key, cache_config &config)
construct_bwt.hpp contains a space and time efficient construction method for the Burrows and Wheeler...
construct_lcp.hpp contains a space and time efficient construction method for lcp arrays
construct_sa.hpp contains an interface to access suffix array construction algorithms
int_vector.hpp contains the sdsl::int_vector class.
int_vector_buffer.hpp contains the sdsl::int_vector_buffer class.
io.hpp contains some methods for reading/writing sdsl structures.
memory_tracking.hpp contains two function for allocating and deallocating memory
constexpr char KEY_CSA[]
Definition config.hpp:37
constexpr char KEY_SA[]
Definition config.hpp:36
constexpr char KEY_LCP[]
Definition config.hpp:43
int remove(std::string const &name)
Remove the file with key name
Definition ram_fs.hpp:75
uint64_t id()
uint64_t pid()
std::string to_string(T const &t, int w=1)
Namespace for the succinct data structure library.
bool contains_no_zero_symbol(int_vector const &text, std::string const &file)
Definition construct.hpp:35
bool store_to_cache(T const &v, std::string const &key, cache_config &config, bool add_type_hash=false)
Stores the object v as a resource in the cache.
Definition io.hpp:804
bool cache_file_exists(std::string const &key, cache_config const &config)
Checks if the resource specified by the key exists in the cache.
Definition io.hpp:733
int remove(std::string const &)
Remove a file.
Definition ram_fs.hpp:221
std::map< std::string, std::string > tMSS
Definition config.hpp:49
std::string tmp_file(cache_config const &config, std::string name_part="")
Returns a name for a temporary file. I.e. the name was not used before.
Definition io.hpp:759
std::string cache_file_name(std::string const &key, cache_config const &config)
Returns the file name of the resource.
Definition io.hpp:688
std::string ram_file_name(std::string const &file)
Returns the corresponding RAM-file name for file.
Definition ram_fs.hpp:195
void register_cache_file(std::string const &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition io.hpp:717
bool load_vector_from_file(t_int_vec &v, std::string const &file, uint8_t num_bytes=1, uint8_t max_int_width=64)
from disk.
Definition io.hpp:192
void construct(t_index &idx, std::string file, uint8_t num_bytes=0, bool move_input=false)
Definition construct.hpp:56
void construct_lcp_semi_extern_PHI(cache_config &config)
Construct the LCP array (only for byte strings)
bool is_ram_file(std::string const &file)
Determines if the given file is a RAM-file.
Definition ram_fs.hpp:176
bool store_to_file(T const &v, std::string const &file)
Store a data structure to a file.
Definition io.hpp:874
uint64_t int_vector_size_type
Definition config.hpp:47
void append_zero_symbol(int_vector &text)
Definition construct.hpp:49
void construct_im(t_index &idx, t_data &&data, uint8_t num_bytes=0)
Definition construct.hpp:69
ram_fs.hpp
Contains declarations and definitions of data structure concepts.
Helper class for construction process.
Definition config.hpp:66
std::string id
Definition config.hpp:71
std::string dir
Definition config.hpp:70
Helper classes to transform width=0 and width=8 to corresponding bwt key.
Definition config.hpp:115
Helper classes to transform width=0 and width=8 to corresponding text key.
Definition config.hpp:96
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.