State.h
#ifndef STATE_H
#define STATE_H

#include "ASLog.h"
#include "CostModel.h"
#include "DefaultCostModel.h"
#include "Featurization.h"
#include "FunctionDAG.h"
#include "LoopNest.h"
#include "PerfectHashMap.h"
#include <set>
#include <unordered_set>
#include <vector>

namespace Halide {
namespace Internal {
namespace Autoscheduler {

using std::map;
using std::pair;
using std::set;
using std::string;
using std::unordered_set;
using std::vector;

constexpr int kLocalMemoryLimit = 524288;  // 512 KB

// Stack memory limit = Total GPU Memory / (# of SMs * maximum threads per SM)
// = 103232 bytes
// Not all 103232 bytes will be free for allocations, so reduce this by a
// factor to leave some headroom.
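
// For example (illustrative figures, not taken from this header): a GPU with
// 8 GiB of global memory, 40 SMs, and 2048 resident threads per SM gives
//
//     8589934592 / (40 * 2048) = 104857.6 bytes of stack per thread,
//
// which is then reduced by the headroom factor described above.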

struct NoOpMutator {
    void operator()(LoopNest *new_loop_nest) const {
    }
};

template<typename PostCreateMutator>
void deep_copy_loop_nest(LoopNest *new_loop_nest,
                         const LoopNest *new_loop_nest_parent,
                         const IntrusivePtr<const LoopNest> &existing_loop_nest,
                         const PostCreateMutator &post_create_mutator) {
    new_loop_nest->copy_from(*existing_loop_nest);

    for (std::size_t i = 0, N = new_loop_nest->children.size(); i < N; ++i) {
        LoopNest *new_child = new LoopNest;
        new_loop_nest->children[i] = new_child;
        deep_copy_loop_nest(new_child, new_loop_nest, existing_loop_nest->children[i], post_create_mutator);
    }

    post_create_mutator(new_loop_nest);
}

using LoopNestMap = map<const LoopNest *, pair<const LoopNest *, int>>;

template<typename PostCreateMutator>
LoopNest *deep_copy_loop_nest(const IntrusivePtr<const LoopNest> &loop_nest,
                              const PostCreateMutator &post_create_mutator) {
    LoopNest *new_loop_nest = new LoopNest;
    deep_copy_loop_nest(new_loop_nest, nullptr, loop_nest, post_create_mutator);
    return new_loop_nest;
}
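
// Example usage (an illustrative sketch, not part of this header): any
// callable taking a LoopNest * can serve as the PostCreateMutator. Copying a
// nest unchanged uses the NoOpMutator defined above:
//
//     LoopNest *copy = deep_copy_loop_nest(existing_root, NoOpMutator{});
//
// where existing_root is an IntrusivePtr<const LoopNest>. Note that the
// mutator runs on each copied node only after its children have been copied,
// i.e. in post-order.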

struct State {
    mutable RefCount ref_count;
    IntrusivePtr<const LoopNest> root;
    IntrusivePtr<const State> parent;
    double cost = 0;
    std::vector<double> cost_per_stage;
    NodeMap<bool> always_consider_inline;
    int num_decisions_made = 0;
    bool penalized = false;
    string schedule_source;

    State() = default;
    State(const State &) = delete;
    State(State &&) = delete;
    void operator=(const State &) = delete;
    void operator=(State &&) = delete;

    uint64_t structural_hash(int depth) const;

    // Compute the parent and depth of every loop nest node.
    void compute_loop_nest_parents(LoopNestMap &p,
                                   const LoopNest *here,
                                   int depth) const;

    const LoopNest *deepest_common_ancestor(const LoopNestMap &parent,
                                            const LoopNest *a,
                                            const LoopNest *b) const;

    // We use the post_create_mutator so that the loop nests can be modified
    // before they become IntrusivePtr<const LoopNest> children, at which
    // point they can no longer be modified.
    template<typename PostCreateMutator>
    LoopNest *create_feature_root(const PostCreateMutator &post_create_mutator) const {
        LoopNest *new_root = new LoopNest;
        deep_copy_loop_nest<PostCreateMutator>(new_root, nullptr, root, post_create_mutator);
        return new_root;
    }
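
    // Illustrative sketch (hypothetical caller, not part of this header): a
    // feature root that needs no adjustment could be built with the
    // NoOpMutator defined above:
    //
    //     LoopNest *r = some_state.create_feature_root(NoOpMutator{});
    //
    // whereas a mutator like the FeatureLoopNestMutator below is used when
    // GPU loops must be pre-split before featurization.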

    struct FeatureLoopNestMutator {
        const Anderson2021Params &params;
        const Target &target;

        void operator()(LoopNest *new_loop_nest) const;

        // In phase 2, any compute_root loop marked 'none' will be split into
        // blocks, threads, and serial loops. To enable the cost model to make
        // a meaningful prediction on these pre-split loops, we assume a split
        // into blocks and threads with a single full warp (if possible).
        void split_compute_root_loops(LoopNest *loop_nest) const;

        // If a loop nest does not have thread loops, split the outermost
        // serial loops to create thread loops with extents 1.
        void add_outer_thread_loops(LoopNest *loop_nest) const;
    };
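
    // Illustrative example (hypothetical extents): given a compute_root loop
    // of extent 4096 marked 'none', split_compute_root_loops would model it
    // as 128 blocks x 32 threads (a single full warp), with any remainder
    // left serial; add_outer_thread_loops would wrap a nest that has no
    // thread loops in thread loops of extent 1.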

    IntrusivePtr<const LoopNest> get_root_for_features(const Anderson2021Params &params,
                                                       const Target &target) const;

    void set_gpu_store_site(const LoopNestMap &parent,
                            const LoopNest *loop,
                            LoopNest::Sites &site) const;

    bool compute_featurization(const FunctionDAG &dag,
                               const Anderson2021Params &params,
                               const Target &target,
                               StageMap<ScheduleFeatures> *features,
                               Statistics &stats,
                               bool verbose = false) const;

    void save_featurization(const FunctionDAG &dag,
                            const Anderson2021Params &params,
                            const Target &target,
                            std::ostream &out) const;
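
    // Example usage (illustrative; the stream and path are hypothetical):
    //
    //     std::ofstream binfile("features.bin", std::ios::binary);
    //     state.save_featurization(dag, params, target, binfile);
    //
    // writes this schedule's featurization for offline use, e.g. cost-model
    // training.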

    bool contains_store_at(const set<const FunctionDAG::Node *> &outermost_store_at,
                           const IntrusivePtr<const LoopNest> &parent) const;

    // For GPU, only allow store_at root or inside the outermost loop nest.
    // Any store_ats further in will be hoisted and expanded, increasing the
    // amount of shared memory required.
    bool contains_store_at_further_in_than_outermost() const;

    bool exceeds_serial_extents_limit(const Target &target) const;

    int64_t get_shared_mem_alloc_size(const LoopNest *block,
                                      const LoopNest *loop) const;

    bool exceeds_shared_memory_limit(const Anderson2021Params &params,
                                     const Target &target) const;

    bool exceeds_local_memory_limit(const Anderson2021Params &params,
                                    const Target &target) const;

    bool calculate_cost(const FunctionDAG &dag,
                        const Anderson2021Params &params,
                        const Target &target,
                        CostModel *cost_model,
                        Statistics &stats,
                        bool verbose = false);

    // Make a child copy of this state. The loop nest is const (we
    // make mutated copies of it, rather than mutating it), so we can
    // continue to point to the same one, making this a cheap operation.
    IntrusivePtr<State> make_child() const;
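
    // Illustrative sketch of how a search might expand a state (current is
    // hypothetical):
    //
    //     IntrusivePtr<State> child = current->make_child();
    //     child->num_decisions_made++;  // one more scheduling decision taken
    //
    // The shared, immutable loop nest is what makes make_child cheap.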

    void dump() const;

    void fuse_gpu_blocks(LoopNest::StageScheduleState *state,
                         Stage &stage,
                         const vector<VarOrRVar> &parallel_vars,
                         const vector<int64_t> &parallel_extents,
                         const vector<int> &constant_extents) const;

    void mark_gpu_blocks(LoopNest::StageScheduleState *state,
                         Stage &stage,
                         const vector<VarOrRVar> &parallel_vars,
                         const vector<int64_t> &parallel_extents) const;

    bool mark_gpu_threads(LoopNest::StageScheduleState *state,
                          Stage &stage,
                          std::unordered_set<std::string> &new_serial_vars,
                          std::ostringstream &staged_funcs_schedule_source) const;

    bool can_fuse_gpu(const vector<int64_t> &parallel_extents) const;

    // Apply the schedule represented by this state to a Halide
    // Pipeline. Also generate source code for the schedule for the
    // user to copy-paste, freezing this schedule as a permanent artifact.
    void apply_schedule(const FunctionDAG &dag,
                        const Anderson2021Params &params,
                        const Target &target);
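
    // Illustrative usage: once the search settles on a best state,
    //
    //     best->apply_schedule(dag, params, target);
    //     aslog(1) << best->schedule_source;
    //
    // applies the schedule and prints the generated schedule source (aslog is
    // the logging helper from ASLog.h; the verbosity level is arbitrary).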

    bool should_always_consider_inline(const FunctionDAG::Node *node) const;
    void add_to_always_consider_inline_options(const FunctionDAG::Node *node);
    void update_always_consider_inline_options(const FunctionDAG::Node *node);

    const LoopNest *deepest_valid_compute_location(const Anderson2021Params &params,
                                                   const LoopNestMap &parent,
                                                   const FunctionDAG::Node &node,
                                                   const LoopNest *loop,
                                                   const LoopNest *root,
                                                   StageMap<int64_t> &total_shared_mem_alloc_sizes) const;

    int64_t total_loop_extents_of_ancestors(const LoopNestMap &parent,
                                            const LoopNest *loop) const;
};

// A priority queue of states, sorted according to increasing
// cost. Never shrinks, to avoid reallocations.
// Can't use std::priority_queue because it doesn't support unique_ptr.
class StateQueue {
private:
    struct CompareStates {
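        // std::push_heap/std::pop_heap build a max-heap with respect to this
        // comparator, so comparing with '>' yields a min-heap on cost: the
        // cheapest state surfaces first.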
        bool operator()(const IntrusivePtr<State> &a, const IntrusivePtr<State> &b) const {
            return a->cost > b->cost;
        }
    };

    std::vector<IntrusivePtr<State>> storage;
    size_t sz = 0;

public:
    void emplace(IntrusivePtr<State> &&s) {
        if (sz >= storage.size()) {
            storage.resize(std::max(sz * 2, (size_t)64));
        }
        internal_assert(sz < storage.size()) << sz << " " << storage.size() << "\n";
        storage[sz] = std::move(s);
        sz++;
        std::push_heap(storage.begin(), storage.begin() + sz, CompareStates{});
    }

    IntrusivePtr<State> pop() {
        internal_assert(sz <= storage.size()) << sz << " " << storage.size() << "\n";
        std::pop_heap(storage.begin(), storage.begin() + sz, CompareStates{});
        sz--;
        return std::move(storage[sz]);
    }

    const IntrusivePtr<State> &top() {
        return storage[0];
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    void swap(StateQueue &other) noexcept {
        storage.swap(other.storage);
        std::swap(sz, other.sz);
    }

    IntrusivePtr<State> operator[](int idx) const {
        return storage[idx];
    }

    void resort() {
        std::make_heap(storage.begin(), storage.begin() + sz, CompareStates{});
    }

    void clear() {
        for (size_t i = 0; i < sz; i++) {
            storage[i] = IntrusivePtr<State>{};
        }
        sz = 0;
    }
};
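
// Example usage (an illustrative sketch; s0 and s1 are hypothetical states):
//
//     StateQueue q;
//     q.emplace(std::move(s0));            // push candidate states...
//     q.emplace(std::move(s1));
//     IntrusivePtr<State> best = q.pop();  // ...and pop them cheapest-first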

}  // namespace Autoscheduler
}  // namespace Internal
}  // namespace Halide

#endif  // STATE_H