Halide 20.0.0
Halide compiler and libraries
LoopNest.h
1 /** This file defines the LoopNest, which is our
2  * representation of a Halide schedule, and contains methods to
3  * generate candidates for scheduling as well as extract a
4  * featurization that can be used to cost each candidate. */
5 
6 #ifndef LOOP_NEST_H
7 #define LOOP_NEST_H
8 
9 #include "ASLog.h"
10 #include "CostModel.h"
11 #include "FunctionDAG.h"
12 #include "GPULoopInfo.h"
13 #include "GPUMemInfo.h"
14 #include "PerfectHashMap.h"
15 #include "SearchSpaceOptions.h"
16 #include "Statistics.h"
17 #include "ThreadInfo.h"
18 #include "Tiling.h"
19 #include <set>
20 #include <vector>
21 
22 namespace Halide {
23 namespace Internal {
24 namespace Autoscheduler {
25 
26 template<typename T>
27 using NodeMap = PerfectHashMap<FunctionDAG::Node, T>;
28 
29 template<typename T>
30 using StageMap = PerfectHashMap<FunctionDAG::Node::Stage, T>;
31 
32 enum class GPU_parallelism {
33  Block,
34  Thread,
35  Serial,
36  Simd,
37  Parallelized,
38  None
39 };
40 
41 std::string stringify(GPU_parallelism label);
42 
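// As an illustration (not the actual definition, which lives in the
// implementation file), stringify can be thought of as a switch over the label:
//
//     switch (label) {
//     case GPU_parallelism::Block: return "block";
//     case GPU_parallelism::Thread: return "thread";
//     case GPU_parallelism::Serial: return "serial";
//     case GPU_parallelism::Simd: return "simd";
//     case GPU_parallelism::Parallelized: return "parallelized";
//     case GPU_parallelism::None: return "none";
//     }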
43 // inlined => func is inlined so has no memory store location
44 enum class GPUMemoryType {
45  Global,
46  Shared,
47  Local,
48  Registers,
49  Inlined
50 };
51 
52 bool may_subtile(const Anderson2021Params &params);
53 
54 int64_t get_shared_memory_limit(const Anderson2021Params &params);
55 
56 int64_t get_shared_memory_sm_limit(const Anderson2021Params &params);
57 
58 int64_t get_active_block_hardware_limit(const Anderson2021Params &params);
59 
60 int64_t get_active_warp_hardware_limit(const Anderson2021Params &params);
61 
62 constexpr int64_t get_register_mem_alloc_limit() {
63  return 128;
64 }
65 
66 int get_unroll_limit(const Target &target);
67 
68 bool in_range_zero_one(double x);
69 
70 bool are_valid_thread_extents(const vector<int64_t> &counts);
71 
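// As an illustrative sketch (not the exact implementation), a check of this
// shape would reject extents that exceed the usual CUDA limits of at most
// three thread dimensions and 1024 threads per block:
//
//     if (counts.size() > 3) return false;
//     int64_t total = 1;
//     for (int64_t c : counts) total *= c;
//     return total <= 1024;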
74 
75 bool all(const vector<int> &v);
76 bool accessed_at_constant_indices(const std::vector<int> &unrolled, const FunctionDAG::Edge *e);
77 
78 // We're going to do a tree search over possible schedules to find an
79 // optimal one. A tree search requires a state, and a function that
80 // gives you children of the state (with costs). The following struct
81 // represents the state, which is a partial schedule.
82 //
83 // A partial schedule is a tree. Each node is some portion of the for
84 // loop nest of some Func. If there are no children, it's the
85 // innermost set of loops. If there are children, it's a loop over
86 // tiles of that Func.
87 struct LoopNest {
88  mutable RefCount ref_count;
89 
90  // The extents of this loop. Put another way, the number of tiles,
91  // not the size of each tile.
92  vector<int64_t> size;
93 
94  // The nodes inside the loop body
95  vector<IntrusivePtr<const LoopNest>> children;
96 
97  // Funcs inlined into this inner loop, and the number of times
98  // each is called. Only valid if children is empty.
99  NodeMap<int64_t> inlined;
100 
101  // Funcs stored inside this loop
102  std::set<const FunctionDAG::Node *> store_at;
103 
104  // The total bounds required of any given Func over all iterations
105  // of this loop. In the paper, this is represented using the
106  // little boxes to the left of the loop nest tree figures.
107  mutable NodeMap<Bound> bounds;
108 
109  // The Func this loop nest belongs to
110  const FunctionDAG::Node *node = nullptr;
111 
112  // The stage of the Func
113  const FunctionDAG::Node::Stage *stage = nullptr;
114 
115  // Is this the innermost loop of this func (the SIMD loop)?
116  bool innermost = false;
117 
118  // Are we permitted to tile this loop?
119  bool tileable = false;
120 
121  // Is this the parallel outer loop?
122  bool parallel = false;
123 
124  // What dimension is this Func vectorized over, in terms of the pure args of the Func?
125  int vector_dim = -1;
126 
127  // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them.
128  int vectorized_loop_index = -1;
129 
130  // Apply gpu threads to this loop nest
131  GPU_parallelism gpu_label = GPU_parallelism::None;
132 
133  struct FeatureIntermediates {
135  double num_vectors;
136  double num_scalars;
137  double vector_size;
143  };
144 
145  mutable std::map<uint64_t, StageMap<StageMap<FeatureIntermediates>>> feature_intermediates;
146  mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features;
147 
148  bool is_gpu_serial(const Target &target) const {
149  return target.has_gpu_feature() && gpu_label == GPU_parallelism::Serial;
150  }
151 
152  bool is_gpu_thread(const Target &target) const {
153  return target.has_gpu_feature() && gpu_label == GPU_parallelism::Thread;
154  }
155 
156  bool is_gpu_block(const Target &target) const {
157  return target.has_gpu_feature() && gpu_label == GPU_parallelism::Block;
158  }
159 
160  bool is_scalar() const {
161  return size.empty();
162  }
163 
164  // Given a node f newly inserted into this LoopNest, get the union of thread
165  // counts in each dimension across all siblings of f.
166  vector<int64_t> get_union_thread_counts(const FunctionDAG::Node *f) const;
167 
168  // Given a node f newly inserted into this LoopNest, get the sizes of
169  // all of f's stages and their pure_dim indices.
170  void get_stage_sizes(const FunctionDAG::Node *f,
171  vector<vector<int64_t>> &stage_sizes,
172  vector<vector<int>> &pure_dims,
173  vector<int> &vectorized_indices) const;
174 
175  // Given the loop nest of a stage to parallelize at root, figure out if using odd tile sizes
176  // for the vectorized dimension will allow the resulting thread tiles to be multiples of 32.
177  // If so, include these sizes in the serial loop sizes.
178  void generate_vec_dim_serial_tilings(vector<int> &serial_sizes) const;
179 
180  // Get the loop nests of a newly inserted node, f, that is marked GPU threads. Tiles
181  // the newly inserted loop nests of f into a threads loop outside a serial loop.
182  // v is the vectorized dimension of f. Adds the loop nests created from each tiling option to result.
183  bool add_gpu_thread_tilings(const FunctionDAG::Node *f,
184  const Anderson2021Params &params,
185  const Target &target,
186  int v,
187  vector<IntrusivePtr<const LoopNest>> &result,
188  const vector<int64_t> &max_size);
189 
190  void copy_from(const LoopNest &n);
191  void copy_from_including_features(const LoopNest &n);
192 
193  static void hash_combine(uint64_t &h, uint64_t next) {
194  // From boost
195  h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
196  }
197 
198  // Hash the loop structure and sizes up to a fixed depth. This is
199  // used as the hash function for the coarse-to-fine beam search in
200  // the paper.
201  void structural_hash(uint64_t &h, int depth) const;
202 
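// For example, the beam search can bucket states by this hash, refining the
// depth from one pass to the next (sketch only; `state`, `buckets`, and
// `pass_depth` are hypothetical names):
//
//     uint64_t h = 0;
//     state->root->structural_hash(h, pass_depth);
//     buckets[h].push_back(state);  // states sharing a bucket compete with each other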
203  // How many funcs are scheduled inside this loop level. Used in
204  // the structural hash.
205  size_t funcs_realized_or_inlined() const {
206  size_t count = inlined.size() + store_at.size();
207  for (const auto &c : children) {
208  count += c->funcs_realized_or_inlined();
209  }
210  return count;
211  }
212 
213  // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage.
214  struct Sites {
215  const LoopNest *compute = nullptr; // Its containing compute_at site
216  const LoopNest *store = nullptr; // Its containing store_at site
217  const LoopNest *produce = nullptr; // Its own outermost node
218  const LoopNest *innermost = nullptr; // Its innermost node - usually a SIMD loop
219  const LoopNest *task = nullptr; // The parallel for loop it belongs to
220  const LoopNest *thread = nullptr; // Its containing gpu_thread loop
221  GPUMemoryType gpu_store_memory_type; // global, local, shared?
222  int64_t allocation_size = 0; // Allocation size in bytes
223  bool is_constant_allocation = false; // Does the allocation have constant size?
224  int64_t num_realizations = 0; // Number of times this stage is realized. Only valid for unscheduled producers
225  bool inlined = false; // Is the Func inlined?
226  std::vector<const LoopNest *> inlined_innermosts; // The innermost loop nests at which this Func is inlined
228 
229  bool is_stored_in_global_mem() const {
230  return gpu_store_memory_type == GPUMemoryType::Global;
231  }
232  bool is_stored_in_shared_mem() const {
233  return gpu_store_memory_type == GPUMemoryType::Shared;
234  }
235  bool is_stored_in_local_mem() const {
236  return gpu_store_memory_type == GPUMemoryType::Local;
237  }
238  bool is_stored_in_registers() const {
239  return gpu_store_memory_type == GPUMemoryType::Registers;
240  }
241  };
242 
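// For example, a featurization pass might consult a stage's Sites entry along
// these lines (sketch; `sites` is a StageMap<Sites> and `stage` a pointer to a
// pipeline stage):
//
//     const auto &site = sites.get(stage);
//     if (site.inlined) {
//         // no allocation of its own; cost is folded into its consumers
//     } else if (site.is_stored_in_shared_mem()) {
//         shared_bytes += site.allocation_size;
//     }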
243  GPUMemoryType get_gpu_memory_type(bool in_block,
244  bool in_thread,
245  bool is_inlined = false) const;
246 
247  std::vector<int> unrolled_loops(const Target &target,
248  const LoopNest *parent,
249  const LoopNest *grandparent) const;
250 
251  void get_allocs_that_can_be_promoted_to_registers(const Target &target,
252  StageMap<Sites> &sites,
253  NodeMap<bool> &can_be_promoted_to_registers,
254  const LoopNest *grandparent,
255  const LoopNest *parent) const;
256 
257  bool promote_allocs_to_registers(const Target &target,
258  StageMap<Sites> &sites) const;
259 
260  // Compute all the sites of interest for each pipeline stage
261  void get_sites(const Target &target,
262  StageMap<Sites> &sites,
263  StageMap<int64_t> &shared_mem_alloc_sizes,
264  const LoopNest *task = nullptr,
265  const LoopNest *parent = nullptr,
266  const LoopNest *current_thread_loop = nullptr) const;
267 
268  // A helper for the working_set_at_task feature. Most features are
269  // computed in the recursive pass 'compute_features' below, but
270  // this one must be done in a second separate recursive pass.
271  void set_working_set_at_task_feature(int64_t working_set,
272  StageMap<ScheduleFeatures> *features) const {
273  for (const auto &c : children) {
274  c->set_working_set_at_task_feature(working_set, features);
275  features->get(c->stage).working_set_at_task = working_set;
276  }
277  }
278 
279  bool exceeds_serial_extents_limit(const Target &target,
280  const LoopNest *parent,
281  bool in_threads_loop) const;
282 
283  bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const;
284 
285  bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const;
286 
287  const LoopNest *find_pure_stage_loop_nest(const FunctionDAG::Node *node) const;
288 
289  int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const;
290 
291  int get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const;
292 
293  // Get the stride over "node's" storage for a unit increment in the vectorized loop's
294  // index
295  double storage_stride(const LoadJacobian &jac,
296  int innermost_storage_dim,
297  const FunctionDAG::Node *storage_node,
298  const Bound &store_bounds,
299  const LoopNest &root) const;
300 
301  Strides compute_strides(const LoadJacobian &jac,
302  int innermost_storage_dim,
303  const FunctionDAG::Node *storage_node,
304  const Bound &store_bounds,
305  const ThreadInfo *thread_info,
306  bool verbose = false) const;
307 
308  bool all_strides_exist(const LoadJacobian &jac,
309  const FunctionDAG::Node *storage_node,
310  const LoopNest &root) const;
311 
312  int get_actual_vector_dim(const Bound &store_bounds) const;
313 
314  void compute_gpu_store_features(const LoadJacobian &jac,
315  int consumer_innermost_dim,
316  const FunctionDAG::Node *node,
317  const Bound &consumer_store_bounds,
318  const GPULoopInfo &gpu_loop_info,
319  const std::vector<int64_t> &inner_serial_loop_extents,
320  const Sites &consumer_site,
321  ScheduleFeatures &feat,
322  const LoopNest *parent,
323  const LoopNest &root,
324  GlobalMemInfo &global_mem_loads,
325  SharedMemInfo &shared_mem_loads,
326  LocalMemInfo &local_mem_loads,
327  bool verbose = false) const;
328 
329  bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac,
330  const FunctionDAG::Node *accessed,
331  int innermost_dim,
332  int loop_index) const;
333 
334  bool can_vectorize_store_access(const LoadJacobian &jac,
335  const FunctionDAG::Node *accessed,
336  bool accessed_has_been_scheduled,
337  int innermost_dim,
338  int loop_index,
339  const GPUMemoryType &mem_type) const;
340 
341  int vectorized_load_access_size(const LoadJacobian &jac,
342  const FunctionDAG::Node *accessed,
343  bool accessed_has_been_scheduled,
344  int innermost_dim,
345  const GPUMemoryType &mem_type,
346  bool verbose = false) const;
347 
348  int vectorized_access_size(size_t loop_index,
349  bool verbose = false) const;
350 
351  template<typename T>
352  void compute_num_mem_accesses_per_block(const LoadJacobian &jac,
353  const FunctionDAG::Node *node,
354  const Bound &store_bounds,
355  const ThreadInfo *thread_info,
356  int innermost_dim,
357  double num_requests_per_warp,
358  MemInfoType<T> &mem_info,
359  bool verbose = false) const;
360 
361  std::pair<double, double> compute_local_mem_store_features(const LoadJacobian &jac,
362  int consumer_innermost_dim,
363  const FunctionDAG::Node *node,
364  const Bound &consumer_store_bounds,
365  const LoopNest &root,
366  double serial_loop_extents) const;
367 
368  template<typename T>
369  MemInfoType<T> compute_mem_store_info(const LoadJacobian &jac,
370  int consumer_innermost_dim,
371  const FunctionDAG::Node *node,
372  const Bound &consumer_store_bounds,
373  const ThreadInfo *thread_info,
374  double serial_loop_extents,
375  bool verbose) const;
376 
377  template<typename T>
378  void compute_mem_load_features(const LoadJacobian &jac,
379  int producer_innermost_dim,
380  const FunctionDAG::Node *node,
381  const Bound &producer_store_bounds,
382  bool producer_has_been_scheduled,
383  const ThreadInfo *thread_info,
384  MemInfoType<T> &mem_info,
385  double serial_loop_extents,
386  bool verbose = false) const;
387 
388  double compute_local_mem_stride(double stride,
389  double bytes) const;
390 
391  // Assumes block, serial, thread or block, thread nesting
392  const LoopNest *get_enclosing_block(const LoopNest *parent,
393  const LoopNest *grandparent) const;
394 
395  std::pair<int64_t, int64_t> get_block_and_serial_extents(const LoopNest *block) const;
396 
398 
400 
401  void compute_warp_features(ScheduleFeatures &features,
402  const GPULoopInfo &gpu_loop_info) const;
403 
404  // Assume that when a block is active, all its warps are active
405  void compute_warp_and_block_occupancy(const Anderson2021Params &params,
406  ScheduleFeatures &feat,
407  const GPULoopInfo &gpu_loop_info) const;
408 
409  void compute_shared_mem_occupancy(const Anderson2021Params &params,
410  const Target &target,
411  int64_t total_shared_mem_alloc_size,
412  ScheduleFeatures &feat) const;
413 
414  std::pair<const LoopNest *, const LoopNest *> find_innermost_and_parent() const;
415 
416  int64_t points_accessed_per_thread(const Anderson2021Params &params,
417  const Target &target,
418  const GPULoopInfo &gpu_loop_info,
419  const std::vector<const FunctionDAG::Edge *> &edge_chain,
420  const LoadJacobian &jac,
421  const LoopNest *parent,
422  const LoopNest *grandparent,
423  int64_t n,
424  const ScheduleFeatures &feat,
425  const LoadJacobian &serial_jac,
426  bool producer_has_been_scheduled,
427  int producer_innermost_dim,
428  const GPUMemoryType &mem_type,
429  bool verbose) const;
430 
431  int64_t compute_licm_amortization(const LoopNest *innermost,
432  const LoopNest *parent,
433  const ScheduleFeatures &feat,
434  const LoadJacobian &jac,
435  int producer_dims) const;
436 
437  void memoize_points_computed_minimum(StageMap<ScheduleFeatures> &memoized_features,
438  const StageMap<ScheduleFeatures> *features) const;
439 
440  vector<pair<int, int>> collect_producers(const StageMap<Sites> &sites) const;
441 
442  uint64_t compute_hash_of_producers_stored_at_root(const StageMap<Sites> &sites) const;
443 
444  void collect_stages(std::set<const FunctionDAG::Node::Stage *> &stages) const;
445 
446  void memoize_features(StageMap<ScheduleFeatures> &memoized_features,
447  const StageMap<ScheduleFeatures> *features) const;
448 
449  void compute_working_set_from_features(int64_t *working_set,
450  const StageMap<ScheduleFeatures> *features) const;
451 
452  void recompute_inlined_features(const StageMap<Sites> &sites,
453  StageMap<ScheduleFeatures> *features) const;
454 
455  std::pair<int64_t, bool> compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const;
456 
457  // Do a recursive walk over the loop nest computing features to feed the cost model.
458  void compute_features(const FunctionDAG &dag,
459  const Anderson2021Params &params,
460  const Target &target,
461  const StageMap<Sites> &sites,
462  int64_t instances,
463  int64_t parallelism,
464  const LoopNest *parent,
465  const LoopNest *grandparent,
466  const LoopNest &root,
467  GPULoopInfo gpu_loop_info,
468  bool use_memoized_features,
469  const StageMap<int64_t> &total_shared_mem_alloc_sizes,
470  int64_t *working_set,
471  int64_t *working_set_local_constant,
472  int64_t *working_set_local_dynamic,
473  StageMap<ScheduleFeatures> *features,
474  Statistics &stats,
475  bool verbose = false) const;
476 
477  bool is_root() const {
478  // The root is the sole node without a Func associated with
479  // it.
480  return node == nullptr;
481  }
482 
483  // Set the region required of a Func at this site.
484  const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const {
485  return bounds.emplace(f, b);
486  }
487 
488  // Get the region required of a Func at this site, from which we
489  // know what region would be computed if it were scheduled here,
490  // and what its loop nest would be.
491  const Bound &get_bounds(const FunctionDAG::Node *f) const;
492 
493  // Get the region required of a Func at this site (but only to satisfy the
494  // consumers along the given edge chain), from which we know what region
495  // would be computed if it were scheduled here and what its loop nest
496  // would be.
497  Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f,
498  const vector<const FunctionDAG::Edge *> &edge_chain) const;
499 
500  void dump() const;
501 
502  std::string to_string() const;
503 
504  // Recursively print a loop nest representation to stderr
505  template<typename T>
506  void dump(T &stream, string prefix, const LoopNest *parent) const;
507 
508  // Does this loop nest access the given Func
509  bool calls(const FunctionDAG::Node *f) const;
510 
511  // What is the maximum number of inlined calls to a Func that
512  // occur within this loop. Used to prune states that would
513  // generate too much code.
515 
516  // Does this loop nest access an input buffer? Used to select
517  // tail strategies when splitting loops. We don't want to read
518  // out of bounds on inputs, even if we don't intend to use the
519  // values read. It could create annoying assertion failures for
520  // the user. It's OK to read out of range of the values computed
521  // on internal Funcs though. Allocation bounds inference just pads
522  // out the bounds so that it won't fault.
523  bool accesses_input_buffer() const;
524 
525  // Does this loop nest contain a computation of the given Func.
526  bool computes(const FunctionDAG::Node *f) const;
527 
528  // Above here most methods query the loop nest. Below we have
529  // methods that mutate the loop nest.
530 
531  // Inline a Func into all consumers within this loop.
532  void inline_func(const FunctionDAG::Node *f);
533 
534  // Compute a Func at this site.
535  bool compute_here(const FunctionDAG::Node *f,
536  bool tileable,
537  int v,
538  bool in_threads_loop,
539  const Anderson2021Params &params,
540  const Target &target);
541 
542  // Parallelize this loop according to the given tiling.
543  IntrusivePtr<const LoopNest> parallelize_in_tiles(const vector<int64_t> &tiling,
544  const LoopNest *parent,
545  const Anderson2021Params &params,
546  const Target &target,
547  bool inner_tiling,
548  bool adjust_tiling,
549  bool move_all_rvars_inward = true,
550  const vector<int> &rvars_to_move_inward = {}) const;
551 
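// For example (an illustrative call; `outer_tiling` is a hypothetical tiling
// chosen by the search, and the boolean arguments are arbitrary here):
//
//     IntrusivePtr<const LoopNest> tiled =
//         loop->parallelize_in_tiles(outer_tiling, parent, params, target,
//                                    /*inner_tiling=*/true, /*adjust_tiling=*/false);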
552  int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false,
553  bool in_threads_loop = false) const;
554  int64_t get_total_constant_local_mem_alloc_size() const;
555 
556  // All store_at sites further in than the block level must be fixed-size
557  // allocations. This method checks whether f will require a dynamic
558  // allocation.
559  bool requires_dynamic_allocation(const FunctionDAG::Node *f,
560  const Target &target,
561  bool in_threads_loop) const;
562 
563  // Return all possible ways to compute f in tiles somewhere within
564  // this loop nest.
565  // in_threads_loop tracks whether or not f is going to be placed inside a
566  // loop marked gpu_threads, in which case f's loops cannot be gpu_threads.
567  vector<IntrusivePtr<const LoopNest>> compute_in_tiles(const FunctionDAG::Node *f,
568  const LoopNest *parent,
569  const Anderson2021Params &params,
570  const Target &target,
571  const SearchSpaceOptions &search_space_options,
572  int v,
573  bool in_realization,
574  bool in_threads_loop,
575  bool is_pre_pass,
576  vector<int64_t> union_counts = vector<int64_t>()) const;
577 
578  // Below here we have methods that apply a schedule to a Halide pipeline.
579 
580  // A model of the state of the loop nest of a Func while applying
581  // Halide's scheduling directives.
582 
583  // Note that StageScheduleState is movable-but-not-copyable thanks to its ostringstream member.
584  struct StageScheduleState {
585  // How much parallelism do we need to exploit with this Func?
586  double num_cores = 0;
587 
588  // Which storage dimension is vectorized? We need to reorder it innermost
589  int vector_dim = -1;
590  int vectorized_loop_index = -1;
591 
592  // The various Vars and RVars used for scheduling a Func.
593  struct FuncVar {
594  // The top-level var or rvar this was split off from
595  VarOrRVar orig;
596 
597  // This var.
598  VarOrRVar var;
599 
600  // Source code to access this Var/RVar. Used for printing
601  // valid Halide source for this schedule.
602  string accessor;
603 
604  // Our estimate of the extent of this var. This is exact
605  // when constant_extent flag is true.
606  int64_t extent = 0;
607 
608  // Which index in the symbolic loop nest does this var
609  // belong to.
610  size_t index = 0;
611 
612  // Some flags.
613  bool innermost_pure_dim = false;
614  bool outermost = false;
615  bool parallel = false;
616  bool exists = false;
617  bool pure = false;
618  bool constant_extent = false;
619 
620  bool vectorized = false;
621  bool gpu_threads = false;
622 
623  FuncVar()
624  : orig(Var()),
625  var(Var()) {
626  }
627  };
630  bool parallel = false;
631  bool vectorized = false;
634 
635  // In order from innermost to outermost. Each group of d is one tiling level.
636  vector<FuncVar> vars;
637 
638  // In order from innermost to outermost. Each group of d is one tiling level.
639  vector<FuncVar> ordered_vars;
640  vector<int64_t> gpu_thread_extents;
641 
643  NodeMap<std::vector<std::pair<const LoopNest *, std::vector<const FunctionDAG::Edge *>>>> producers_to_be_staged;
644 
645  // From outermost in
646  vector<StageScheduleState *> ancestors;
647 
648  std::ostringstream schedule_source;
649  };
650 
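// Each directive applied through this struct is typically mirrored into
// schedule_source so that a human-readable schedule can be printed afterwards,
// e.g. (illustrative; `f` is the Func being scheduled and `state` its
// StageScheduleState):
//
//     f.compute_root();
//     state.schedule_source << "\n    .compute_root()";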
651  bool has_constant_region_computed(const FunctionDAG::Node *node) const;
652  bool has_constant_region_required(const FunctionDAG::Node *node) const;
653  bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const;
654  int num_serial_loops(const FunctionDAG::Node::Stage *stage) const;
655  int num_serial_loops() const;
656  bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const;
657 
658  void update_producers_to_be_staged(StageScheduleState &state,
659  const NodeMap<bool> &all_inlined) const;
660  bool region_computed_shrinks(const FunctionDAG::Node *f,
661  const LoopNest *parent) const;
662 
663  // Apply the schedule represented by this loop nest to a Halide pipeline.
664  void apply(LoopLevel here,
665  StageMap<std::unique_ptr<StageScheduleState>> &state_map,
666  double num_cores,
667  int depth,
668  const LoopNest *parent,
669  const LoopNest *compute_site,
670  const Target &target,
671  std::vector<StageScheduleState *> &ancestors,
672  const NodeMap<bool> &all_inlined) const;
673 
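// An illustrative driver only (the real caller builds `root`, `target`,
// `all_inlined`, and `params` elsewhere; `params.parallelism` is assumed here
// to serve as the core count):
//
//     StageMap<std::unique_ptr<StageScheduleState>> state_map;
//     std::vector<StageScheduleState *> ancestors;
//     root->apply(LoopLevel::root(), state_map, params.parallelism, 0,
//                 nullptr, nullptr, target, ancestors, all_inlined);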
674  double max_idle_lane_wastage(const Target &target,
675  GPULoopInfo gpu_loop_info) const;
676 
678 
679  void collect_nodes_that_should_be_inlined(const NodeMap<bool> &nodes_to_freeze,
680  NodeMap<bool> &inlined_nodes) const;
681 
682  void collect_all_inlined(NodeMap<bool> &all_inlined) const;
683 
684  int64_t product_of_self_and_descendants(int loop_index) const;
685  int64_t product_of_descendants(int loop_index) const;
686 
687  void get_stages_computed_in_each_compute_root_loop(StageMap<StageMap<bool>> &descendants,
688  const LoopNest *compute_root_loop_nest = nullptr) const;
689 };
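// Putting the pieces together, one expansion step of the tree search described
// in the comment above this struct looks roughly like the following sketch
// (the real search driver lives outside this header; `f`, `options`, and `v`
// are illustrative):
//
//     auto children = root->compute_in_tiles(f, nullptr, params, target, options, v,
//                                            /*in_realization=*/false,
//                                            /*in_threads_loop=*/false,
//                                            /*is_pre_pass=*/false);
//     for (const auto &child : children) {
//         StageMap<Sites> sites;
//         StageMap<int64_t> shared_mem_alloc_sizes;
//         child->get_sites(target, sites, shared_mem_alloc_sizes);
//         // child->compute_features(...) then produces the featurization that
//         // is handed to the cost model to rank this candidate.
//     }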
690 
691 struct Filter {
692  const LoopNest *loop_nest;
693  bool logging = false;
694 
695  explicit Filter(const LoopNest *loop_nest)
696  : loop_nest{loop_nest},
697  logging{enable_filter_printing()} {
698  if (logging) {
699  std::cerr << "\nState filtered: \n";
700  loop_nest->dump();
701  std::cerr << "Reason: ";
702  }
703  }
704 
705  template<typename T>
706  Filter &operator<<(T &&x) {
707  if (logging) {
708  std::cerr << std::forward<T>(x);
709  }
710  return *this;
711  }
712 
713  static bool enable_filter_printing();
714 };
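// Filter is used to explain why a candidate state was rejected. A typical
// (illustrative) rejection site, where `loop_nest` and `counts` come from the
// surrounding check, looks like:
//
//     if (!are_valid_thread_extents(counts)) {
//         Filter(loop_nest) << "Invalid thread extents\n";
//         return false;
//     }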
715 
716 } // namespace Autoscheduler
717 } // namespace Internal
718 } // namespace Halide
719 
720 #endif // LOOP_NEST_H