Halide 13.0.2
Halide compiler and libraries
Func.h
Go to the documentation of this file.
1#ifndef HALIDE_FUNC_H
2#define HALIDE_FUNC_H
3
4/** \file
5 *
6 * Defines Func - the front-end handle on a halide function, and related classes.
7 */
8
9#include "Argument.h"
10#include "Expr.h"
11#include "JITModule.h"
12#include "Module.h"
13#include "Param.h"
14#include "Pipeline.h"
15#include "RDom.h"
16#include "Target.h"
17#include "Tuple.h"
18#include "Var.h"
19
20#include <map>
21#include <utility>
22
23namespace Halide {
24
25class OutputImageParam;
26class ParamMap;
27
28/** A class that can represent Vars or RVars. Used for reorder calls
29 * which can accept a mix of either. */
30struct VarOrRVar {
31 VarOrRVar(const std::string &n, bool r)
32 : var(n), rvar(n), is_rvar(r) {
33 }
34 VarOrRVar(const Var &v)
35 : var(v), is_rvar(false) {
36 }
37 VarOrRVar(const RVar &r)
38 : rvar(r), is_rvar(true) {
39 }
40 VarOrRVar(const RDom &r)
41 : rvar(RVar(r)), is_rvar(true) {
42 }
43 template<int N>
45 : var(u), is_rvar(false) {
46 }
47
48 const std::string &name() const {
49 if (is_rvar) {
50 return rvar.name();
51 } else {
52 return var.name();
53 }
54 }
55
58 bool is_rvar;
59};
60
61class ImageParam;
62
63namespace Internal {
64class Function;
65struct Split;
66struct StorageDim;
67} // namespace Internal
68
69/** A single definition of a Func. May be a pure or update definition. */
70class Stage {
71 /** Reference to the Function this stage (or definition) belongs to. */
72 Internal::Function function;
73 Internal::Definition definition;
74 /** Indicate which stage the definition belongs to (0 for initial
75 * definition, 1 for first update, etc.). */
76 size_t stage_index;
77 /** Pure Vars of the Function (from the init definition). */
78 std::vector<Var> dim_vars;
79
80 void set_dim_type(const VarOrRVar &var, Internal::ForType t);
81 void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
82 void split(const std::string &old, const std::string &outer, const std::string &inner,
83 const Expr &factor, bool exact, TailStrategy tail);
84 void remove(const std::string &var);
85 Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);
86
/** Storage dimensions taken from the schedule of the Function this
 * stage belongs to. */
87 const std::vector<Internal::StorageDim> &storage_dims() const {
88 return function.schedule().storage_dims();
89 }
90
91 Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
92
93public:
95 : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
96 internal_assert(definition.defined());
97 definition.schedule().touched() = true;
98
99 dim_vars.reserve(function.args().size());
100 for (const auto &arg : function.args()) {
101 dim_vars.emplace_back(arg);
102 }
103 internal_assert(definition.args().size() == dim_vars.size());
104 }
105
106 /** Return the current StageSchedule associated with this Stage. For
107 * introspection only: to modify schedule, use the Func interface. */
109 return definition.schedule();
110 }
111
112 /** Return a string describing the current var list taking into
113 * account all the splits, reorders, and tiles. */
114 std::string dump_argument_list() const;
115
116 /** Return the name of this stage, e.g. "f.update(2)" */
117 std::string name() const;
118
119 /** Calling rfactor() on an associative update definition of a Func will split
120 * the update into an intermediate which computes the partial results and
121 * replaces the current update definition with a new definition which merges
122 * the partial results. If called on a init/pure definition, this will
123 * throw an error. rfactor() will automatically infer the associative reduction
124 * operator and identity of the operator. If it can't prove the operation
125 * is associative or if it cannot find an identity for that operator, this
126 * will throw an error. In addition, commutativity of the operator is required
127 * if rfactor() is called on the inner dimension but excluding the outer
128 * dimensions.
129 *
130 * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
131 * The rvars not listed in 'preserved' are removed from the original Func and
132 * are lifted to the intermediate Func. The remaining rvars (the ones in
133 * 'preserved') are made pure in the intermediate Func. The intermediate Func's
134 * update definition inherits all scheduling directives (e.g. split, fuse, etc.)
135 * applied to the original Func's update definition. The loop order of the
136 * intermediate Func's update definition is the same as the original, although
137 * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
138 * intermediate Func's init definition from innermost to outermost is the args'
139 * order of the original Func's init definition followed by the new pure Vars.
140 *
141 * The intermediate Func also inherits storage order from the original Func
142 * with the new pure Vars added to the outermost.
143 *
144 * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
145 \code
146 f(x, y) = 0;
147 f(x, y) += g(r.x, r.y);
148 \endcode
149 * into a pipeline like this:
150 \code
151 f_intm(x, y, u) = 0;
152 f_intm(x, y, u) += g(r.x, u);
153
154 f(x, y) = 0;
155 f(x, y) += f_intm(x, y, r.y);
156 \endcode
157 *
158 * This has a variety of uses. You can use it to split computation of an associative reduction:
159 \code
160 f(x, y) = 10;
161 RDom r(0, 96);
162 f(x, y) = max(f(x, y), g(x, y, r.x));
163 f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
164 f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
165 \endcode
166 *
167 * which is equivalent to:
168 \code
169 parallel for u = 0 to 11:
170 for y:
171 for x:
172 f_intm(x, y, u) = -inf
173 parallel for x:
174 for y:
175 parallel for u = 0 to 11:
176 for rxi = 0 to 7:
177 f_intm(x, y, u) = max(f_intm(x, y, u), g(8*u + rxi))
178 for y:
179 for x:
180 f(x, y) = 10
181 parallel for x:
182 for y:
183 for rxo = 0 to 11:
184 f(x, y) = max(f(x, y), f_intm(x, y, rxo))
185 \endcode
186 *
187 */
188 // @{
189 Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
190 Func rfactor(const RVar &r, const Var &v);
191 // @}
192
193 /** Schedule the iteration over this stage to be fused with another
194 * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
195 * be computed AFTER 's' in the innermost fused dimension. There should not
196 * be any dependencies between those two fused stages. If either of the
197 * stages being fused is a stage of an extern Func, this will throw an error.
198 *
199 * Note that the two stages that are fused together should have the same
200 * exact schedule from the outermost to the innermost fused dimension, and
201 * the stage we are calling compute_with on should not have specializations,
202 * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
203 *
204 * Also, if a producer is desired to be computed at the fused loop level,
205 * the function passed to the compute_at() needs to be the "parent". Consider
206 * the following code:
207 \code
208 input(x, y) = x + y;
209 f(x, y) = input(x, y);
210 f(x, y) += 5;
211 g(x, y) = x - y;
212 g(x, y) += 10;
213 f.compute_with(g, y);
214 f.update().compute_with(g.update(), y);
215 \endcode
216 *
217 * To compute 'input' at the fused loop level at dimension y, we specify
218 * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
219 * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
220 * is computed). On the other hand, to compute 'input' at the innermost
221 * dimension of 'f', we specify input.compute_at(f, x) instead of
222 * input.compute_at(g, x) since the x dimension of 'f' is not fused
223 * (only the y dimension is).
224 *
225 * Given the constraints, this has a variety of uses. Consider the
226 * following code:
227 \code
228 f(x, y) = x + y;
229 g(x, y) = x - y;
230 h(x, y) = f(x, y) + g(x, y);
231 f.compute_root();
232 g.compute_root();
233 f.split(x, xo, xi, 8);
234 g.split(x, xo, xi, 8);
235 g.compute_with(f, xo);
236 \endcode
237 *
238 * This is equivalent to:
239 \code
240 for y:
241 for xo:
242 for xi:
243 f(8*xo + xi) = (8*xo + xi) + y
244 for xi:
245 g(8*xo + xi) = (8*xo + xi) - y
246 for y:
247 for x:
248 h(x, y) = f(x, y) + g(x, y)
249 \endcode
250 *
251 * The size of the dimensions of the stages computed_with do not have
252 * to match. Consider the following code where 'g' is half the size of 'f':
253 \code
254 Image<int> f_im(size, size), g_im(size/2, size/2);
255 input(x, y) = x + y;
256 f(x, y) = input(x, y);
257 g(x, y) = input(2*x, 2*y);
258 g.compute_with(f, y);
259 input.compute_at(f, y);
260 Pipeline({f, g}).realize({f_im, g_im});
261 \endcode
262 *
263 * This is equivalent to:
264 \code
265 for y = 0 to size-1:
266 for x = 0 to size-1:
267 input(x, y) = x + y;
268 for x = 0 to size-1:
269 f(x, y) = input(x, y)
270 for x = 0 to size/2-1:
271 if (y <= size/2-1):
272 g(x, y) = input(2*x, 2*y)
273 \endcode
274 *
275 * 'align' specifies how the loop iteration of each dimension of the
276 * two stages being fused should be aligned in the fused loop nests
277 * (see LoopAlignStrategy for options). Consider the following loop nests:
278 \code
279 for z = f_min_z to f_max_z:
280 for y = f_min_y to f_max_y:
281 for x = f_min_x to f_max_x:
282 f(x, y, z) = x + y + z
283 for z = g_min_z to g_max_z:
284 for y = g_min_y to g_max_y:
285 for x = g_min_x to g_max_x:
286 g(x, y, z) = x - y - z
287 \endcode
288 *
289 * If no alignment strategy is specified, the following loop nest will be
290 * generated:
291 \code
292 for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
293 for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
294 for x = f_min_x to f_max_x:
295 if (f_min_z <= z <= f_max_z):
296 if (f_min_y <= y <= f_max_y):
297 f(x, y, z) = x + y + z
298 for x = g_min_x to g_max_x:
299 if (g_min_z <= z <= g_max_z):
300 if (g_min_y <= y <= g_max_y):
301 g(x, y, z) = x - y - z
302 \endcode
303 *
304 * Instead, these alignment strategies:
305 \code
306 g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
307 \endcode
308 * will produce the following loop nest:
309 \code
310 f_loop_min_z = f_min_z
311 f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
312 for z = f_min_z to f_loop_max_z:
313 f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
314 f_loop_max_y = f_max_y
315 for y = f_loop_min_y to f_loop_max_y:
316 for x = f_min_x to f_max_x:
317 if (f_loop_min_z <= z <= f_loop_max_z):
318 if (f_loop_min_y <= y <= f_loop_max_y):
319 f(x, y, z) = x + y + z
320 for x = g_min_x to g_max_x:
321 g_shift_z = g_min_z - f_loop_min_z
322 g_shift_y = g_max_y - f_loop_max_y
323 if (g_min_z <= (z + g_shift_z) <= g_max_z):
324 if (g_min_y <= (y + g_shift_y) <= g_max_y):
325 g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
326 \endcode
327 *
328 * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
329 * of 'g' at dimension z so that its starting value matches that of 'f'.
330 * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
331 * iteration of 'g' at dimension y so that its end value matches that of 'f'.
332 */
333 // @{
334 Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
336 Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
338 // @}
339
340 /** Scheduling calls that control how the domain of this stage is
341 * traversed. See the documentation for Func for the meanings. */
342 // @{
343
344 Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
345 Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
346 Stage &serial(const VarOrRVar &var);
349 Stage &unroll(const VarOrRVar &var);
350 Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
351 Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
352 Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
353 Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
354 const VarOrRVar &xo, const VarOrRVar &yo,
355 const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
357 Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
358 const VarOrRVar &xi, const VarOrRVar &yi,
359 const Expr &xfactor, const Expr &yfactor,
361 Stage &tile(const std::vector<VarOrRVar> &previous,
362 const std::vector<VarOrRVar> &outers,
363 const std::vector<VarOrRVar> &inners,
364 const std::vector<Expr> &factors,
365 const std::vector<TailStrategy> &tails);
366 Stage &tile(const std::vector<VarOrRVar> &previous,
367 const std::vector<VarOrRVar> &outers,
368 const std::vector<VarOrRVar> &inners,
369 const std::vector<Expr> &factors,
371 Stage &tile(const std::vector<VarOrRVar> &previous,
372 const std::vector<VarOrRVar> &inners,
373 const std::vector<Expr> &factors,
375 Stage &reorder(const std::vector<VarOrRVar> &vars);
376
377 template<typename... Args>
378 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
379 reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
380 std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
381 return reorder(collected_args);
382 }
383
384 Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
385 Stage specialize(const Expr &condition);
386 void specialize_fail(const std::string &message);
387
389 Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
390 Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
391
393
395
397 Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
398 Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
399
400 Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
401 Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
402 const VarOrRVar &thread_x, const VarOrRVar &thread_y,
403 DeviceAPI device_api = DeviceAPI::Default_GPU);
404 Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
405 const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
406 DeviceAPI device_api = DeviceAPI::Default_GPU);
407
408 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
410 DeviceAPI device_api = DeviceAPI::Default_GPU);
411
412 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
414 DeviceAPI device_api = DeviceAPI::Default_GPU);
415 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
416 const VarOrRVar &bx, const VarOrRVar &by,
417 const VarOrRVar &tx, const VarOrRVar &ty,
418 const Expr &x_size, const Expr &y_size,
420 DeviceAPI device_api = DeviceAPI::Default_GPU);
421
422 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
423 const VarOrRVar &tx, const VarOrRVar &ty,
424 const Expr &x_size, const Expr &y_size,
426 DeviceAPI device_api = DeviceAPI::Default_GPU);
427
428 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
429 const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
430 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
431 const Expr &x_size, const Expr &y_size, const Expr &z_size,
433 DeviceAPI device_api = DeviceAPI::Default_GPU);
434 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
435 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
436 const Expr &x_size, const Expr &y_size, const Expr &z_size,
438 DeviceAPI device_api = DeviceAPI::Default_GPU);
439
441 Stage &atomic(bool override_associativity_test = false);
442
444
/** Deprecated single-var prefetch of a Func. Forwards to the two-var
 * overload, passing 'var' as both the 'at' and the 'from' argument. */
445 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
446 Stage &prefetch(const Func &f, const VarOrRVar &var, int offset = 1,
447 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
448 return prefetch(f, var, var, offset, strategy);
449 }
/** Deprecated single-var prefetch of an Internal::Parameter. Forwards
 * to the two-var overload, passing 'var' as both the 'at' and the
 * 'from' argument. */
450 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
451 Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &var, int offset = 1,
452 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
453 return prefetch(param, var, var, offset, strategy);
454 }
/** Deprecated single-var prefetch of any object exposing parameter().
 * Forwards image.parameter() to the two-var overload, passing 'var' as
 * both the 'at' and the 'from' argument. */
455 template<typename T>
456 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
457 Stage &prefetch(const T &image, VarOrRVar var, int offset = 1,
458 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
459 return prefetch(image.parameter(), var, var, offset, strategy);
460 }
461 Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
463 Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
465 template<typename T>
466 Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
468 return prefetch(image.parameter(), at, from, std::move(offset), strategy);
469 }
470 // @}
471
472 /** Attempt to get the source file and line where this stage was
473 * defined by parsing the process's own debug symbols. Returns an
474 * empty string if no debug symbols were found or the debug
475 * symbols were not understood. Works on OS X and Linux only. */
476 std::string source_location() const;
477};
478
479// For backwards compatibility, keep the ScheduleHandle name.
481
483
484/** A fragment of front-end syntax of the form f(x, y, z), where x, y,
485 * z are Vars or Exprs. It could be the left hand side of a definition or
486 * an update definition, or it could be a call to a function. We don't know
487 * until we see how this object gets used.
488 */
489class FuncRef {
491 int implicit_placeholder_pos;
492 int implicit_count;
493 std::vector<Expr> args;
494 std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
495
496 /** Helper for function update by Tuple. If the function does not
497 * already have a pure definition, init_val will be used as RHS of
498 * each tuple element in the initial function definition. */
499 template<typename BinaryOp>
500 Stage func_ref_update(const Tuple &e, int init_val);
501
502 /** Helper for function update by Expr. If the function does not
503 * already have a pure definition, init_val will be used as RHS in
504 * the initial function definition. */
505 template<typename BinaryOp>
506 Stage func_ref_update(Expr e, int init_val);
507
508public:
509 FuncRef(const Internal::Function &, const std::vector<Expr> &,
510 int placeholder_pos = -1, int count = 0);
511 FuncRef(Internal::Function, const std::vector<Var> &,
512 int placeholder_pos = -1, int count = 0);
513
514 /** Use this as the left-hand-side of a definition or an update definition
515 * (see \ref RDom).
516 */
518
519 /** Use this as the left-hand-side of a definition or an update definition
520 * for a Func with multiple outputs. */
522
523 /** Define a stage that adds the given expression to this Func. If the
524 * expression refers to some RDom, this performs a sum reduction of the
525 * expression over the domain. If the function does not already have a
526 * pure definition, this sets it to zero.
527 */
528 // @{
532 // @}
533
534 /** Define a stage that adds the negative of the given expression to this
535 * Func. If the expression refers to some RDom, this performs a sum reduction
536 * of the negative of the expression over the domain. If the function does
537 * not already have a pure definition, this sets it to zero.
538 */
539 // @{
543 // @}
544
545 /** Define a stage that multiplies this Func by the given expression. If the
546 * expression refers to some RDom, this performs a product reduction of the
547 * expression over the domain. If the function does not already have a pure
548 * definition, this sets it to 1.
549 */
550 // @{
554 // @}
555
556 /** Define a stage that divides this Func by the given expression.
557 * If the expression refers to some RDom, this performs a product
558 * reduction of the inverse of the expression over the domain. If the
559 * function does not already have a pure definition, this sets it to 1.
560 */
561 // @{
565 // @}
566
567 /* Override the usual assignment operator, so that
568 * f(x, y) = g(x, y) defines f.
569 */
571
572 /** Use this as a call to the function, and not the left-hand-side
573 * of a definition. Only works for single-output Funcs. */
574 operator Expr() const;
575
576 /** When a FuncRef refers to a function that provides multiple
577 * outputs, you can access each output as an Expr using
578 * operator[].
579 */
581
582 /** How many outputs does the function this refers to produce. */
583 size_t size() const;
584
585 /** What function is this calling? Returns, by value, the
 * Internal::Function handle that this FuncRef refers to. */
586 Internal::Function function() const {
587 return func;
588 }
589};
590
591/** Explicit overloads of min and max for FuncRef. These exist to
592 * disambiguate calls to min on FuncRefs when a user has pulled both
593 * Halide::min and std::min into their namespace. */
594// @{
595inline Expr min(const FuncRef &a, const FuncRef &b) {
596 return min(Expr(a), Expr(b));
597}
598inline Expr max(const FuncRef &a, const FuncRef &b) {
599 return max(Expr(a), Expr(b));
600}
601// @}
602
603/** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
604 * z are Vars or Exprs. It could be the left hand side of an update
605 * definition, or it could be a call to a function. We don't know
606 * until we see how this object gets used.
607 */
609 FuncRef func_ref;
610 std::vector<Expr> args; // args to the function
611 int idx; // Index to function outputs
612
613 /** Helper function that generates a Tuple where element at 'idx' is set
614 * to 'e' and the rest are undef. */
615 Tuple values_with_undefs(const Expr &e) const;
616
617public:
618 FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);
619
620 /** Use this as the left-hand-side of an update definition of Tuple
621 * component 'idx' of a Func (see \ref RDom). The function must
622 * already have an initial definition.
623 */
625
626 /** Define a stage that adds the given expression to Tuple component 'idx'
627 * of this Func. The other Tuple components are unchanged. If the expression
628 * refers to some RDom, this performs a sum reduction of the expression over
629 * the domain. The function must already have an initial definition.
630 */
632
633 /** Define a stage that adds the negative of the given expression to Tuple
634 * component 'idx' of this Func. The other Tuple components are unchanged.
635 * If the expression refers to some RDom, this performs a sum reduction of
636 * the negative of the expression over the domain. The function must already
637 * have an initial definition.
638 */
640
641 /** Define a stage that multiplies Tuple component 'idx' of this Func by
642 * the given expression. The other Tuple components are unchanged. If the
643 * expression refers to some RDom, this performs a product reduction of
644 * the expression over the domain. The function must already have an
645 * initial definition.
646 */
648
649 /** Define a stage that divides Tuple component 'idx' of this Func by
650 * the given expression. The other Tuple components are unchanged.
651 * If the expression refers to some RDom, this performs a product
652 * reduction of the inverse of the expression over the domain. The function
653 * must already have an initial definition.
654 */
656
657 /* Override the usual assignment operator, so that
658 * f(x, y)[index] = g(x, y) defines f.
659 */
661
662 /** Use this as a call to Tuple component 'idx' of a Func, and not the
663 * left-hand-side of a definition. */
664 operator Expr() const;
665
666 /** What function is this calling? Delegates to the underlying
 * FuncRef and returns its Internal::Function by value. */
667 Internal::Function function() const {
668 return func_ref.function();
669 }
670
671 /** Return the index of the function output (Tuple component) that
 * this reference selects. */
672 int index() const {
673 return idx;
674 }
675};
676
677namespace Internal {
678class IRMutator;
679} // namespace Internal
680
681/** Helper class for identifying purpose of an Expr passed to memoize.
682 */
684protected:
686 friend class Func;
687
688public:
/** Wrap the given Expr as an eviction key. When no argument is given,
 * the key is a default-constructed Expr. */
689 explicit EvictionKey(const Expr &expr = Expr())
690 : key(expr) {
691 }
692};
693
694/** A halide function. This class represents one stage in a Halide
695 * pipeline, and is the unit by which we schedule things. By default
696 * they are aggressively inlined, so you are encouraged to make lots
697 * of little functions, rather than storing things in Exprs. */
698class Func {
699
700 /** A handle on the internal halide function that this
701 * represents */
703
704 /** When you make a reference to this function with fewer
705 * arguments than it has dimensions, the argument list is bulked
706 * up with 'implicit' vars with canonical names. This lets you
707 * pass around partially applied Halide functions. */
708 // @{
709 std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
710 std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
711 // @}
712
713 /** The imaging pipeline that outputs this Func alone. */
714 Pipeline pipeline_;
715
716 /** Get the imaging pipeline that outputs this Func alone,
717 * creating it (and freezing the Func) if necessary. */
718 Pipeline pipeline();
719
720 // Helper function for recursive reordering support
721 Func &reorder_storage(const std::vector<Var> &dims, size_t start);
722
723 void invalidate_cache();
724
725public:
726 /** Declare a new undefined function with the given name */
727 explicit Func(const std::string &name);
728
729 /** Declare a new undefined function with an
730 * automatically-generated unique name */
732
733 /** Declare a new function with an automatically-generated unique
734 * name, and define it to return the given expression (which may
735 * not contain free variables). */
736 explicit Func(const Expr &e);
737
738 /** Construct a new Func to wrap an existing, already-defined
739 * Function object. */
741
742 /** Construct a new Func to wrap a Buffer. */
743 template<typename T>
745 : Func() {
746 (*this)(_) = im(_);
747 }
748
749 /** Evaluate this function over some rectangular domain and return
750 * the resulting buffer or buffers. Performs compilation if the
751 * Func has not previously been realized and compile_jit has not
752 * been called. If the final stage of the pipeline is on the GPU,
753 * data is copied back to the host before being returned. The
754 * returned Realization should probably be instantly converted to
755 * a Buffer class of the appropriate type. That is, do this:
756 *
757 \code
758 f(x) = sin(x);
759 Buffer<float> im = f.realize(...);
760 \endcode
761 *
762 * If your Func has multiple values, because you defined it using
763 * a Tuple, then casting the result of a realize call to a buffer
764 * or image will produce a run-time error. Instead you should do the
765 * following:
766 *
767 \code
768 f(x) = Tuple(x, sin(x));
769 Realization r = f.realize(...);
770 Buffer<int> im0 = r[0];
771 Buffer<float> im1 = r[1];
772 \endcode
773 *
774 * In Halide formal arguments of a computation are specified using
775 * Param<T> and ImageParam objects in the expressions defining the
776 * computation. The param_map argument to realize allows
777 * specifying a set of per-call parameters to be used for a
778 * specific computation. This method is thread-safe where the
779 * globals used by Param<T> and ImageParam are not. Any parameters
780 * that are not in the param_map are taken from the global values,
781 * so those can continue to be used if they are not changing
782 * per-thread.
783 *
784 * One can explicitly construct a ParamMap and
785 * use its set method to insert Parameter to scalar or Buffer
786 * value mappings:
787 *
788 \code
789 Param<int32_t> p(42);
790 ImageParam img(Int(32), 1);
791 f(x) = img(x) + p;
792
793 Buffer<int32_t> arg_img(10, 10);
794 <fill in arg_img...>
795 ParamMap params;
796 params.set(p, 17);
797 params.set(img, arg_img);
798
799 Target t = get_jit_target_from_environment();
800 Buffer<int32_t> result = f.realize({10, 10}, t, params);
801 \endcode
802 *
803 * Alternatively, an initializer list can be used
804 * directly in the realize call to pass this information:
805 *
806 \code
807 Param<int32_t> p(42);
808 ImageParam img(Int(32), 1);
809 f(x) = img(x) + p;
810
811 Buffer<int32_t> arg_img(10, 10);
812 <fill in arg_img...>
813
814 Target t = get_jit_target_from_environment();
815 Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
816 \endcode
817 *
818 * If the Func cannot be realized into a buffer of the given size
819 * due to scheduling constraints on scattering update definitions,
820 * it will be realized into a larger buffer of the minimum size
821 * possible, and a cropped view at the requested size will be
822 * returned. It is thus not safe to assume the returned buffers
823 * are contiguous in memory. This behavior can be disabled with
824 * the NoBoundsQuery target flag, in which case an error about
825 * writing out of bounds on the output buffer will trigger
826 * instead.
827 *
828 */
829 Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target(),
830 const ParamMap &param_map = ParamMap::empty_map());
831
832 /** Evaluate this function into an existing allocated buffer or
833 * buffers. If the buffer is also one of the arguments to the
834 * function, strange things may happen, as the pipeline isn't
835 * necessarily safe to run in-place. If you pass multiple buffers,
836 * they must have matching sizes. This form of realize does *not*
837 * automatically copy data back from the GPU. */
839 const ParamMap &param_map = ParamMap::empty_map());
840
841 /** For a given size of output, or a given output buffer,
842 * determine the bounds required of all unbound ImageParams
843 * referenced. Communicates the result by allocating new buffers
844 * of the appropriate size and binding them to the unbound
845 * ImageParams.
846 *
847 * See the documentation for Func::realize regarding the
848 * ParamMap. There is one difference in that input Buffer<>
849 * arguments that are being inferred are specified as a pointer to
850 * the Buffer<> in the ParamMap. E.g.
851 *
852 \code
853 Param<int32_t> p(42);
854 ImageParam img(Int(32), 1);
855 f(x) = img(x) + p;
856
857 Target t = get_jit_target_from_environment();
858 Buffer<> in;
859 f.infer_input_bounds({10, 10}, t, { { img, &in } });
860 \endcode
861 * On return, in will be an allocated buffer of the correct size
862 * to evaluate f over a 10x10 region.
863 */
864 // @{
865 void infer_input_bounds(const std::vector<int32_t> &sizes,
866 const Target &target = get_jit_target_from_environment(),
867 const ParamMap &param_map = ParamMap::empty_map());
869 const Target &target = get_jit_target_from_environment(),
870 const ParamMap &param_map = ParamMap::empty_map());
871 // @}
872
873 /** Statically compile this function to llvm bitcode, with the
874 * given filename (which should probably end in .bc), type
875 * signature, and C function name (which defaults to the same name
876 * as this halide function) */
877 //@{
878 void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
879 const Target &target = get_target_from_environment());
880 void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
881 const Target &target = get_target_from_environment());
882 // @}
883
884 /** Statically compile this function to llvm assembly, with the
885 * given filename (which should probably end in .ll), type
886 * signature, and C function name (which defaults to the same name
887 * as this halide function) */
888 //@{
889 void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
890 const Target &target = get_target_from_environment());
891 void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
892 const Target &target = get_target_from_environment());
893 // @}
894
895 /** Statically compile this function to an object file, with the
896 * given filename (which should probably end in .o or .obj), type
897 * signature, and C function name (which defaults to the same name
898 * as this halide function). You probably don't want to use this
899 * directly; call compile_to_static_library or compile_to_file instead. */
900 //@{
901 void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
902 const Target &target = get_target_from_environment());
903 void compile_to_object(const std::string &filename, const std::vector<Argument> &,
904 const Target &target = get_target_from_environment());
905 // @}
906
907 /** Emit a header file with the given filename for this
908 * function. The header will define a function with the type
909 * signature given by the second argument, and a name given by the
910 * third. The name defaults to the same name as this halide
911 * function. You don't actually have to have defined this function
912 * yet to call this. You probably don't want to use this directly;
913 * call compile_to_static_library or compile_to_file instead. */
914 void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
915 const Target &target = get_target_from_environment());
916
917 /** Statically compile this function to text assembly equivalent
918 * to the object file generated by compile_to_object. This is
919 * useful for checking what Halide is producing without having to
920 * disassemble anything, or if you need to feed the assembly into
921 * some custom toolchain to produce an object file (e.g. iOS) */
922 //@{
923 void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
924 const Target &target = get_target_from_environment());
925 void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
926 const Target &target = get_target_from_environment());
927 // @}
928
929 /** Statically compile this function to C source code. This is
930 * useful for providing fallback code paths that will compile on
931 * many platforms. Vectorization will fail, and parallelization
932 * will produce serial code. */
933 void compile_to_c(const std::string &filename,
934 const std::vector<Argument> &,
935 const std::string &fn_name = "",
936 const Target &target = get_target_from_environment());
937
938 /** Write out an internal representation of lowered code. Useful
939 * for analyzing and debugging scheduling. Can emit html or plain
940 * text. */
941 void compile_to_lowered_stmt(const std::string &filename,
942 const std::vector<Argument> &args,
944 const Target &target = get_target_from_environment());
945
946 /** Write out the loop nests specified by the schedule for this
947 * Function. Helpful for understanding what a schedule is
948 * doing. */
950
951 /** Compile to object file and header pair, with the given
952 * arguments. The name defaults to the same name as this halide
953 * function.
954 */
955 void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
956 const std::string &fn_name = "",
957 const Target &target = get_target_from_environment());
958
959 /** Compile to static-library file and header pair, with the given
960 * arguments. The name defaults to the same name as this halide
961 * function.
962 */
963 void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
964 const std::string &fn_name = "",
965 const Target &target = get_target_from_environment());
966
967 /** Compile to static-library file and header pair once for each target;
968 * each resulting function will be considered (in order) via halide_can_use_target_features()
969 * at runtime, with the first appropriate match being selected for subsequent use.
970 * This is typically useful for specializations that may vary unpredictably by machine
971 * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
972 * All targets must have identical arch-os-bits.
973 */
974 void compile_to_multitarget_static_library(const std::string &filename_prefix,
975 const std::vector<Argument> &args,
976 const std::vector<Target> &targets);
977
978 /** Like compile_to_multitarget_static_library(), except that the object files
979 * are all output as object files (rather than bundled into a static library).
980 *
981 * `suffixes` is an optional list of strings to use as the suffix for each object
982 * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
983 * will be used for each suffix.)
984 *
985 * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
986 * will be generated with the filename `${filename_prefix}_wrapper.o`
987 *
988 * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
989 * will be generated with the filename `${filename_prefix}_runtime.o`
990 */
991 void compile_to_multitarget_object_files(const std::string &filename_prefix,
992 const std::vector<Argument> &args,
993 const std::vector<Target> &targets,
994 const std::vector<std::string> &suffixes);
995
996 /** Store an internal representation of lowered code as a self
997 * contained Module suitable for further compilation. */
998 Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
999 const Target &target = get_target_from_environment());
1000
1001 /** Compile and generate multiple target files with single call.
1002 * Deduces target files based on filenames specified in
1003 * output_files map.
1004 */
1005 void compile_to(const std::map<Output, std::string> &output_files,
1006 const std::vector<Argument> &args,
1007 const std::string &fn_name,
1008 const Target &target = get_target_from_environment());
1009
1010 /** Eagerly jit compile the function to machine code. This
1011 * normally happens on the first call to realize. If you're
1012 * running your halide pipeline inside time-sensitive code and
1013 * wish to avoid including the time taken to compile a pipeline,
1014 * then you can call this ahead of time. Default is to use the Target
1015 * returned from Halide::get_jit_target_from_environment()
1016 */
1018
1019 /** Set the error handler function that will be called in the case of
1020 * runtime errors during halide pipelines. If you are compiling
1021 * statically, you can also just define your own function with
1022 * signature
1023 \code
1024 extern "C" void halide_error(void *user_context, const char *);
1025 \endcode
1026 * This will clobber Halide's version.
1027 */
1028 void set_error_handler(void (*handler)(void *, const char *));
1029
1030 /** Set a custom malloc and free for halide to use. Malloc should
1031 * return 32-byte aligned chunks of memory, and it should be safe
1032 * for Halide to read slightly out of bounds (up to 8 bytes before
1033 * the start or beyond the end). If compiling statically, routines
1034 * with appropriate signatures can be provided directly
1035 \code
1036 extern "C" void *halide_malloc(void *, size_t);
1037 extern "C" void halide_free(void *, void *);
1038 \endcode
1039 * These will clobber Halide's versions. See HalideRuntime.h
1040 * for declarations.
1041 */
1042 void set_custom_allocator(void *(*malloc)(void *, size_t),
1043 void (*free)(void *, void *));
1044
1045 /** Set a custom task handler to be called by the parallel for
1046 * loop. It is useful to set this if you want to do some
1047 * additional bookkeeping at the granularity of parallel
1048 * tasks. The default implementation does this:
1049 \code
1050 extern "C" int halide_do_task(void *user_context,
1051 int (*f)(void *, int, uint8_t *),
1052 int idx, uint8_t *state) {
1053 return f(user_context, idx, state);
1054 }
1055 \endcode
1056 * If you are statically compiling, you can also just define your
1057 * own version of the above function, and it will clobber Halide's
1058 * version.
1059 *
1060 * If you're trying to use a custom parallel runtime, you probably
1061 * don't want to call this. See instead \ref Func::set_custom_do_par_for .
1062 */
1063 void set_custom_do_task(
1064 int (*custom_do_task)(void *, int (*)(void *, int, uint8_t *),  // handler(user_context, f, idx, state), as in the example above
1065 int, uint8_t *));
1066
1067 /** Set a custom parallel for loop launcher. Useful if your app
1068 * already manages a thread pool. The default implementation is
1069 * equivalent to this:
1070 \code
1071 extern "C" int halide_do_par_for(void *user_context,
1072 int (*f)(void *, int, uint8_t *),
1073 int min, int extent, uint8_t *state) {
1074 int exit_status = 0;
1075 parallel for (int idx = min; idx < min+extent; idx++) {
1076 int job_status = halide_do_task(user_context, f, idx, state);
1077 if (job_status) exit_status = job_status;
1078 }
1079 return exit_status;
1080 }
1081 \endcode
1082 *
1083 * However, notwithstanding the above example code, if one task
1084 * fails, we may skip over other tasks, and if two tasks return
1085 * different error codes, we may select one arbitrarily to return.
1086 *
1087 * If you are statically compiling, you can also just define your
1088 * own version of the above function, and it will clobber Halide's
1089 * version.
1090 */
1091 void set_custom_do_par_for(
1092 int (*custom_do_par_for)(void *, int (*)(void *, int, uint8_t *), int,  // launcher(user_context, f, min, extent, state), as in the example above
1093 int, uint8_t *));
1094
1095 /** Set custom routines to call when tracing is enabled. Call this
1096 * on the output Func of your pipeline. This then sets custom
1097 * routines for the entire pipeline, not just calls to this
1098 * Func.
1099 *
1100 * If you are statically compiling, you can also just define your
1101 * own versions of the tracing functions (see HalideRuntime.h),
1102 * and they will clobber Halide's versions. */
1103 void set_custom_trace(int (*trace_fn)(void *, const halide_trace_event_t *));
1104
1105 /** Set the function called to print messages from the runtime.
1106 * If you are compiling statically, you can also just define your
1107 * own function with signature
1108 \code
1109 extern "C" void halide_print(void *user_context, const char *);
1110 \endcode
1111 * This will clobber Halide's version.
1112 */
1113 void set_custom_print(void (*handler)(void *, const char *));
1114
1115 /** Get a struct containing the currently set custom functions
1116 * used by JIT. */
1118
1119 /** Add a custom pass to be used during lowering. It is run after
1120 * all other lowering passes. Can be used to verify properties of
1121 * the lowered Stmt, instrument it with extra code, or otherwise
1122 * modify it. The Func takes ownership of the pass, and will call
1123 * delete on it when the Func goes out of scope. So don't pass a
1124 * stack object, or share pass instances between multiple
1125 * Funcs. */
1126 template<typename T>
1127 void add_custom_lowering_pass(T *pass) {
1128 // Template instantiate a custom deleter for this type, then
1129 // wrap in a lambda. The custom deleter lives in user code, so
1130 // that deletion is on the same heap as construction (I hate Windows).
1131 add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });  // forwards to the (pass, deleter) overload
1132 }
1133
1134 /** Add a custom pass to be used during lowering, with the
1135 * function that will be called to delete it also passed in. Set
1136 * it to nullptr if you wish to retain ownership of the object. */
1137 void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);
1138
1139 /** Remove all previously-set custom lowering passes */
1141
1142 /** Get the custom lowering passes. */
1143 const std::vector<CustomLoweringPass> &custom_lowering_passes();
1144
1145 /** When this function is compiled, include code that dumps its
1146 * values to a file after it is realized, for the purpose of
1147 * debugging.
1148 *
1149 * If filename ends in ".tif" or ".tiff" (case insensitive) the file
1150 * is in TIFF format and can be read by standard tools. Otherwise, the
1151 * file format is as follows:
1152 *
1153 * All data is in the byte-order of the target platform. First, a
1154 * 20 byte-header containing four 32-bit ints, giving the extents
1155 * of the first four dimensions. Dimensions beyond four are
1156 * folded into the fourth. Then, a fifth 32-bit int giving the
1157 * data type of the function. The typecodes are given by: float =
1158 * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
1159 * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
1160 * data follows the header, as a densely packed array of the given
1161 * size and the given type. If given the extension .tmp, this file
1162 * format can be natively read by the program ImageStack. */
1163 void debug_to_file(const std::string &filename);
1164
1165 /** The name of this function, either given during construction,
1166 * or automatically generated. */
1167 const std::string &name() const;
1168
1169 /** Get the pure arguments. */
1170 std::vector<Var> args() const;
1171
1172 /** The right-hand-side value of the pure definition of this
1173 * function. Causes an error if there's no pure definition, or if
1174 * the function is defined to return multiple values. */
1175 Expr value() const;
1176
1177 /** The values returned by this function. An error if the function
1178 * has not been defined. Returns a Tuple with one element for
1179 * functions defined to return a single value. */
1180 Tuple values() const;
1181
1182 /** Does this function have at least a pure definition. */
1183 bool defined() const;
1184
1185 /** Get the left-hand-side of the update definition. An empty
1186 * vector if there's no update definition. If there are
1187 * multiple update definitions for this function, use the
1188 * argument to select which one you want. */
1189 const std::vector<Expr> &update_args(int idx = 0) const;
1190
1191 /** Get the right-hand-side of an update definition. An error if
1192 * there's no update definition. If there are multiple
1193 * update definitions for this function, use the argument to
1194 * select which one you want. */
1195 Expr update_value(int idx = 0) const;
1196
1197 /** Get the right-hand-side of an update definition for
1198 * functions that return multiple values. An error if there's no
1199 * update definition. Returns a Tuple with one element for
1200 * functions that return a single value. */
1201 Tuple update_values(int idx = 0) const;
1202
1203 /** Get the RVars of the reduction domain for an update definition, if there is
1204 * one. */
1205 std::vector<RVar> rvars(int idx = 0) const;
1206
1207 /** Does this function have at least one update definition? */
1209
1210 /** How many update definitions does this function have? */
1212
1213 /** Is this function an external stage? That is, was it defined
1214 * using define_extern? */
1215 bool is_extern() const;
1216
1217 /** Add an extern definition for this Func. This lets you define a
1218 * Func that represents an external pipeline stage. You can, for
1219 * example, use it to wrap a call to an extern library such as
1220 * fftw. */
1221 // @{
1222 void define_extern(const std::string &function_name,
1223 const std::vector<ExternFuncArgument> &params, Type t,
1224 int dimensionality,
1225 NameMangling mangling = NameMangling::Default,
1226 DeviceAPI device_api = DeviceAPI::Host) {
1227 define_extern(function_name, params, t,
1228 Internal::make_argument_list(dimensionality), mangling,
1229 device_api);  // forwards to the overload that takes an explicit argument list
1230 }
1231
1232 void define_extern(const std::string &function_name,
1233 const std::vector<ExternFuncArgument> &params,
1234 const std::vector<Type> &types, int dimensionality,
1235 NameMangling mangling) {
1236 const auto pure_args = Internal::make_argument_list(dimensionality);  // argument list built from the dimension count
1237 define_extern(function_name, params, types, pure_args, mangling);  // forwards to the explicit-argument overload
1238 }
1239
1240 void define_extern(const std::string &function_name,
1241 const std::vector<ExternFuncArgument> &params,
1242 const std::vector<Type> &types, int dimensionality,
1243 NameMangling mangling = NameMangling::Default,
1244 DeviceAPI device_api = DeviceAPI::Host) {
1245 define_extern(function_name, params, types,
1246 Internal::make_argument_list(dimensionality), mangling,
1247 device_api);  // forwards to the overload that takes an explicit argument list
1248 }
1249
1250 void define_extern(const std::string &function_name,
1251 const std::vector<ExternFuncArgument> &params, Type t,
1252 const std::vector<Var> &arguments,
1253 NameMangling mangling = NameMangling::Default,
1254 DeviceAPI device_api = DeviceAPI::Host) {
1255 define_extern(function_name, params, std::vector<Type>{t}, arguments,  // wrap the single type so the multi-type overload can be used
1256 mangling, device_api);
1257 }
1258
1259 void define_extern(const std::string &function_name,
1260 const std::vector<ExternFuncArgument> &params,
1261 const std::vector<Type> &types,
1262 const std::vector<Var> &arguments,
1263 NameMangling mangling = NameMangling::Default,
1264 DeviceAPI device_api = DeviceAPI::Host);  // most general form; the inline overloads above forward here
1265 // @}
1266
1267 /** Get the types of the outputs of this Func. */
1268 const std::vector<Type> &output_types() const;
1269
1270 /** Get the number of outputs of this Func. Corresponds to the
1271 * size of the Tuple this Func was defined to return. */
1272 int outputs() const;
1273
1274 /** Get the name of the extern function called for an extern
1275 * definition. */
1276 const std::string &extern_function_name() const;
1277
1278 /** The dimensionality (number of arguments) of this
1279 * function. Zero if the function is not yet defined. */
1280 int dimensions() const;
1281
1282 /** Construct either the left-hand-side of a definition, or a call
1283 * to a function that happens to only contain vars as
1284 * arguments. If the function has already been defined, and fewer
1285 * arguments are given than the function has dimensions, then
1286 * enough implicit vars are added to the end of the argument list
1287 * to make up the difference (see \ref Var::implicit) */
1288 // @{
1289 FuncRef operator()(std::vector<Var>) const;
1290
1291 template<typename... Args>
1292 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
1293 operator()(Args &&...args) const {
1294 std::vector<Var> vars{std::forward<Args>(args)...};  // gather the pack into a vector of Var
1295 return (*this)(vars);  // hand off to the std::vector<Var> overload
1296 }
1297 // @}
1298
1299 /** Either calls to the function, or the left-hand-side of
1300 * an update definition (see \ref RDom). If the function has
1301 * already been defined, and fewer arguments are given than the
1302 * function has dimensions, then enough implicit vars are added to
1303 * the end of the argument list to make up the difference. (see
1304 * \ref Var::implicit)*/
1305 // @{
1306 FuncRef operator()(std::vector<Expr>) const;
1307
1308 template<typename... Args>
1309 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
1310 operator()(const Expr &x, Args &&...args) const {
1311 std::vector<Expr> exprs{x, std::forward<Args>(args)...};  // first argument followed by the rest of the pack
1312 return this->operator()(exprs);  // hand off to the std::vector<Expr> overload
1313 }
1314 // @}
1315
1316 /** Creates and returns a new identity Func that wraps this Func. During
1317 * compilation, Halide replaces all calls to this Func done by 'f'
1318 * with calls to the wrapper. If this Func is already wrapped for
1319 * use in 'f', will return the existing wrapper.
1320 *
1321 * For example, g.in(f) would rewrite a pipeline like this:
1322 \code
1323 g(x, y) = ...
1324 f(x, y) = ... g(x, y) ...
1325 \endcode
1326 * into a pipeline like this:
1327 \code
1328 g(x, y) = ...
1329 g_wrap(x, y) = g(x, y)
1330 f(x, y) = ... g_wrap(x, y)
1331 \endcode
1332 *
1333 * This has a variety of uses. You can use it to schedule this
1334 * Func differently in the different places it is used:
1335 \code
1336 g(x, y) = ...
1337 f1(x, y) = ... g(x, y) ...
1338 f2(x, y) = ... g(x, y) ...
1339 g.in(f1).compute_at(f1, y).vectorize(x, 8);
1340 g.in(f2).compute_at(f2, x).unroll(x);
1341 \endcode
1342 *
1343 * You can also use it to stage loads from this Func via some
1344 * intermediate buffer (perhaps on the stack as in
1345 * test/performance/block_transpose.cpp, or in shared GPU memory
1346 * as in test/performance/wrap.cpp). In this we compute the
1347 * wrapper at tiles of the consuming Funcs like so:
1348 \code
1349 g.compute_root()...
1350 g.in(f).compute_at(f, tiles)...
1351 \endcode
1352 *
1353 * Func::in() can also be used to compute pieces of a Func into a
1354 * smaller scratch buffer (perhaps on the GPU) and then copy them
1355 * into a larger output buffer one tile at a time. See
1356 * apps/interpolate/interpolate.cpp for an example of this. In
1357 * this case we compute the Func at tiles of its own wrapper:
1358 \code
1359 f.in(g).compute_root().gpu_tile(...)...
1360 f.compute_at(f.in(g), tiles)...
1361 \endcode
1362 *
1363 * A similar use of Func::in() is wrapping Funcs with multiple update
1364 * stages in a pure wrapper. The following code:
1365 \code
1366 f(x, y) = x + y;
1367 f(x, y) += 5;
1368 g(x, y) = f(x, y);
1369 f.compute_root();
1370 \endcode
1371 *
1372 * Is equivalent to:
1373 \code
1374 for y:
1375 for x:
1376 f(x, y) = x + y;
1377 for y:
1378 for x:
1379 f(x, y) += 5
1380 for y:
1381 for x:
1382 g(x, y) = f(x, y)
1383 \endcode
1384 * using Func::in(), we can write:
1385 \code
1386 f(x, y) = x + y;
1387 f(x, y) += 5;
1388 g(x, y) = f(x, y);
1389 f.in(g).compute_root();
1390 \endcode
1391 * which instead produces:
1392 \code
1393 for y:
1394 for x:
1395 f(x, y) = x + y;
1396 f(x, y) += 5
1397 f_wrap(x, y) = f(x, y)
1398 for y:
1399 for x:
1400 g(x, y) = f_wrap(x, y)
1401 \endcode
1402 */
1403 Func in(const Func &f);
1404
1405 /** Create and return an identity wrapper shared by all the Funcs in
1406 * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
1407 * this will throw an error. */
1408 Func in(const std::vector<Func> &fs);
1409
1410 /** Create and return a global identity wrapper, which wraps all calls to
1411 * this Func by any other Func. If a global wrapper already exists,
1412 * returns it. The global identity wrapper is only used by callers for
1413 * which no custom wrapper has been specified.
1414 */
1416
1417 /** Similar to \ref Func::in; however, instead of replacing the call to
1418 * this Func with an identity Func that refers to it, this replaces the
1419 * call with a clone of this Func.
1420 *
1421 * For example, f.clone_in(g) would rewrite a pipeline like this:
1422 \code
1423 f(x, y) = x + y;
1424 g(x, y) = f(x, y) + 2;
1425 h(x, y) = f(x, y) - 3;
1426 \endcode
1427 * into a pipeline like this:
1428 \code
1429 f(x, y) = x + y;
1430 f_clone(x, y) = x + y;
1431 g(x, y) = f_clone(x, y) + 2;
1432 h(x, y) = f(x, y) - 3;
1433 \endcode
1434 *
1435 */
1436 //@{
1437 Func clone_in(const Func &f);
1438 Func clone_in(const std::vector<Func> &fs);
1439 //@}
1440
1441 /** Declare that this function should be implemented by a call to
1442 * halide_buffer_copy with the given target device API. Asserts
1443 * that the Func has a pure definition which is a simple call to a
1444 * single input, and no update definitions. The wrapper Funcs
1445 * returned by in() are suitable candidates. Consumes all pure
1446 * variables, and rewrites the Func to have an extern definition
1447 * that calls halide_buffer_copy. */
1449
1450 /** Declare that this function should be implemented by a call to
1451 * halide_buffer_copy with a NULL target device API. Equivalent to
1452 * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
1453 * pure definition which is a simple call to a single input, and
1454 * no update definitions. The wrapper Funcs returned by in() are
1455 * suitable candidates. Consumes all pure variables, and rewrites
1456 * the Func to have an extern definition that calls
1457 * halide_buffer_copy.
1458 *
1459 * Note that if the source Func is already valid in host memory,
1460 * this compiles to code that does the minimum number of calls to
1461 * memcpy.
1462 */
1464
1465 /** Split a dimension into inner and outer subdimensions with the
1466 * given names, where the inner dimension iterates from 0 to
1467 * factor-1. The inner and outer subdimensions can then be dealt
1468 * with using the other scheduling calls. It's ok to reuse the old
1469 * variable name as either the inner or outer variable. The final
1470 * argument specifies how the tail should be handled if the split
1471 * factor does not provably divide the extent. */
1472 Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1473
1474 /** Join two dimensions into a single fused dimension. The fused
1475 * dimension covers the product of the extents of the inner and
1476 * outer dimensions given. */
1477 Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
1478
1479 /** Mark a dimension to be traversed serially. This is the default. */
1480 Func &serial(const VarOrRVar &var);
1481
1482 /** Mark a dimension to be traversed in parallel */
1484
1485 /** Split a dimension by the given task_size, and then parallelize the
1486 * outer dimension. This creates parallel tasks that have size
1487 * task_size. After this call, var refers to the outer dimension of
1488 * the split. The inner dimension has a new anonymous name. If you
1489 * wish to mutate it, or schedule with respect to it, do the split
1490 * manually. */
1491 Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
1492
1493 /** Mark a dimension to be computed all-at-once as a single
1494 * vector. The dimension should have constant extent -
1495 * e.g. because it is the inner dimension following a split by a
1496 * constant factor. For most uses of vectorize you want the two
1497 * argument form. The variable to be vectorized should be the
1498 * innermost one. */
1500
1501 /** Mark a dimension to be completely unrolled. The dimension
1502 * should have constant extent - e.g. because it is the inner
1503 * dimension following a split by a constant factor. For most uses
1504 * of unroll you want the two-argument form. */
1505 Func &unroll(const VarOrRVar &var);
1506
1507 /** Split a dimension by the given factor, then vectorize the
1508 * inner dimension. This is how you vectorize a loop of unknown
1509 * size. The variable to be vectorized should be the innermost
1510 * one. After this call, var refers to the outer dimension of the
1511 * split. 'factor' must be an integer. */
1512 Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1513
1514 /** Split a dimension by the given factor, then unroll the inner
1515 * dimension. This is how you unroll a loop of unknown size by
1516 * some constant factor. After this call, var refers to the outer
1517 * dimension of the split. 'factor' must be an integer. */
1518 Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1519
1520 /** Statically declare that the range over which a function should
1521 * be evaluated is given by the second and third arguments. This
1522 * can let Halide perform some optimizations. E.g. if you know
1523 * there are going to be 4 color channels, you can completely
1524 * vectorize the color channel dimension without the overhead of
1525 * splitting it up. If bounds inference decides that it requires
1526 * more of this function than the bounds you have stated, a
1527 * runtime error will occur when you try to run your pipeline. */
1528 Func &bound(const Var &var, Expr min, Expr extent);
1529
1530 /** Statically declare the range over which the function will be
1531 * evaluated in the general case. This provides a basis for the auto
1532 * scheduler to make trade-offs and scheduling decisions. The auto
1533 * generated schedules might break when the sizes of the dimensions are
1534 * very different from the estimates specified. These estimates are used
1535 * only by the auto scheduler if the function is a pipeline output. */
1536 Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);
1537
1538 /** Set (min, extent) estimates for all dimensions in the Func
1539 * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
1540 * repeatedly, but slightly terser. The size of the estimates vector
1541 * must match the dimensionality of the Func. */
1542 Func &set_estimates(const Region &estimates);
1543
1544 /** Expand the region computed so that the min coordinate is
1545 * congruent to 'remainder' modulo 'modulus', and the extent is a
1546 * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
1547 * the min and extent realized to be even, and calling
1548 * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
1549 * to be even. The region computed always contains the region that
1550 * would have been computed without this directive, so no
1551 * assertions are injected.
1552 */
1553 Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
1554
1555 /** Expand the region computed so that the extent is a
1556 * multiple of 'modulus'. For example, f.align_extent(x, 2) forces
1557 * the extent realized to be even. The region computed always contains the
1558 * region that would have been computed without this directive, so no
1559 * assertions are injected. (This is essentially equivalent to align_bounds(),
1560 * but always leaving the min untouched.)
1561 */
1562 Func &align_extent(const Var &var, Expr modulus);
1563
1564 /** Bound the extent of a Func's realization, but not its
1565 * min. This means the dimension can be unrolled or vectorized
1566 * even when its min is not fixed (for example because it is
1567 * compute_at tiles of another Func). This can also be useful for
1568 * forcing a function's allocation to be a fixed size, which often
1569 * means it can go on the stack. */
1570 Func &bound_extent(const Var &var, Expr extent);
1571
1572 /** Split two dimensions at once by the given factors, and then
1573 * reorder the resulting dimensions to be xi, yi, xo, yo from
1574 * innermost outwards. This gives a tiled traversal. */
1575 Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1576 const VarOrRVar &xo, const VarOrRVar &yo,
1577 const VarOrRVar &xi, const VarOrRVar &yi,
1578 const Expr &xfactor, const Expr &yfactor,
1579 TailStrategy tail = TailStrategy::Auto);  // tail handling when a factor does not divide the extent (see split)
1580
1581 /** A shorter form of tile, which reuses the old variable names as
1582 * the new outer dimensions */
1583 Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1584 const VarOrRVar &xi, const VarOrRVar &yi,
1585 const Expr &xfactor, const Expr &yfactor,
1586 TailStrategy tail = TailStrategy::Auto);  // tail handling when a factor does not divide the extent (see split)
1587
1588 /** A more general form of tile, which defines tiles of any dimensionality. */
1589 Func &tile(const std::vector<VarOrRVar> &previous,
1590 const std::vector<VarOrRVar> &outers,
1591 const std::vector<VarOrRVar> &inners,
1592 const std::vector<Expr> &factors,
1593 const std::vector<TailStrategy> &tails);
1594
1595 /** The generalized tile, with a single tail strategy to apply to all vars. */
1596 Func &tile(const std::vector<VarOrRVar> &previous,
1597 const std::vector<VarOrRVar> &outers,
1598 const std::vector<VarOrRVar> &inners,
1599 const std::vector<Expr> &factors,
1600 TailStrategy tail = TailStrategy::Auto);  // the single tail strategy applied to every var
1601
1602 /** Generalized tiling, reusing the previous names as the outer names. */
1603 Func &tile(const std::vector<VarOrRVar> &previous,
1604 const std::vector<VarOrRVar> &inners,
1605 const std::vector<Expr> &factors,
1606 TailStrategy tail = TailStrategy::Auto);  // the single tail strategy applied to every var
1607
1608 /** Reorder variables to have the given nesting order, from
1609 * innermost out */
1610 Func &reorder(const std::vector<VarOrRVar> &vars);
1611
1612 template<typename... Args>
1613 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1614 reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
1615 std::vector<VarOrRVar> order{x, y, std::forward<Args>(args)...};  // nesting order as given, innermost first
1616 return reorder(order);  // hand off to the std::vector<VarOrRVar> overload
1617 }
1618
1619 /** Rename a dimension. Equivalent to split with an inner size of one. */
1620 Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
1621
1622 /** Specify that race conditions are permitted for this Func,
1623 * which enables parallelizing over RVars even when Halide cannot
1624 * prove that it is safe to do so. Use this with great caution,
1625 * and only if you can prove to yourself that this is safe, as it
1626 * may result in a non-deterministic routine that returns
1627 * different values at different times or on different machines. */
1629
1630 /** Issue atomic updates for this Func. This allows parallelization
1631 * on associative RVars. The function throws a compile error when
1632 * Halide fails to prove associativity. Use override_associativity_test
1633 * to disable the associativity test if you believe the function is
1634 * associative or the order of reduction variable execution does not
1635 * matter.
1636 * Halide compiles this into hardware atomic operations whenever possible,
1637 * and falls back to a mutex lock per storage element if it is impossible
1638 * to atomically update.
1639 * There are three possible outcomes of the compiled code:
1640 * atomic add, compare-and-swap loop, and mutex lock.
1641 * For example:
1642 *
1643 * hist(x) = 0;
1644 * hist(im(r)) += 1;
1645 * hist.compute_root();
1646 * hist.update().atomic().parallel();
1647 *
1648 * will be compiled to atomic add operations.
1649 *
1650 * hist(x) = 0;
1651 * hist(im(r)) = min(hist(im(r)) + 1, 100);
1652 * hist.compute_root();
1653 * hist.update().atomic().parallel();
1654 *
1655 * will be compiled to compare-and-swap loops.
1656 *
1657 * arg_max() = {0, im(0)};
1658 * Expr old_index = arg_max()[0];
1659 * Expr old_max = arg_max()[1];
1660 * Expr new_index = select(old_max < im(r), r, old_index);
1661 * Expr new_max = max(im(r), old_max);
1662 * arg_max() = {new_index, new_max};
1663 * arg_max.compute_root();
1664 * arg_max.update().atomic().parallel();
1665 *
1666 * will be compiled to updates guarded by a mutex lock,
1667 * since it is impossible to atomically update two different locations.
1668 *
1669 * Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
1670 * Compiling to other backends results in a compile error.
1671 * If an operation is compiled into a mutex lock, and is vectorized or is
1672 * compiled to CUDA or OpenCL, it also results in a compile error,
1673 * since per-element mutex lock on vectorized operation leads to a
1674 * deadlock.
1675 * Vectorization of predicated RVars (through rdom.where()) on CPU
1676 * is also not yet supported (see https://github.com/halide/Halide/issues/4298).
1677 * 8-bit and 16-bit atomics on GPU are also not supported. */
1678 Func &atomic(bool override_associativity_test = false);
1679
1680 /** Specialize a Func. This creates a special-case version of the
1681 * Func where the given condition is true. The most effective
1682 * conditions are those of the form param == value, and boolean
1683 * Params. Consider a simple example:
1684 \code
1685 f(x) = x + select(cond, 0, 1);
1686 f.compute_root();
1687 \endcode
1688 * This is equivalent to:
1689 \code
1690 for (int x = 0; x < width; x++) {
1691 f[x] = x + (cond ? 0 : 1);
1692 }
1693 \endcode
1694 * Adding the scheduling directive:
1695 \code
1696 f.specialize(cond)
1697 \endcode
1698 * makes it equivalent to:
1699 \code
1700 if (cond) {
1701 for (int x = 0; x < width; x++) {
1702 f[x] = x;
1703 }
1704 } else {
1705 for (int x = 0; x < width; x++) {
1706 f[x] = x + 1;
1707 }
1708 }
1709 \endcode
1710 * Note that the inner loops have been simplified. In the first
1711 * path Halide knows that cond is true, and in the second path
1712 * Halide knows that it is false.
1713 *
1714 * The specialized version gets its own schedule, which inherits
1715 * every directive made about the parent Func's schedule so far
1716 * except for its specializations. This method returns a handle to
1717 * the new schedule. If you wish to retrieve the specialized
1718 * sub-schedule again later, you can call this method with the
1719 * same condition. Consider the following example of scheduling
1720 * the specialized version:
1721 *
1722 \code
1723 f(x) = x;
1724 f.compute_root();
1725 f.specialize(width > 1).unroll(x, 2);
1726 \endcode
1727 * Assuming for simplicity that width is even, this is equivalent to:
1728 \code
1729 if (width > 1) {
1730 for (int x = 0; x < width/2; x++) {
1731 f[2*x] = 2*x;
1732 f[2*x + 1] = 2*x + 1;
1733 }
1734 } else {
1735 for (int x = 0; x < width; x++) {
1736 f[x] = x;
1737 }
1738 }
1739 \endcode
1740 * For this case, it may be better to schedule the un-specialized
1741 * case instead:
1742 \code
1743 f(x) = x;
1744 f.compute_root();
1745 f.specialize(width == 1); // Creates a copy of the schedule so far.
1746 f.unroll(x, 2); // Only applies to the unspecialized case.
1747 \endcode
1748 * This is equivalent to:
1749 \code
1750 if (width == 1) {
1751 f[0] = 0;
1752 } else {
1753 for (int x = 0; x < width/2; x++) {
1754 f[2*x] = 2*x;
1755 f[2*x + 1] = 2*x + 1;
1756 }
1757 }
1758 \endcode
1759 * This can be a good way to write a pipeline that splits,
1760 * vectorizes, or tiles, but can still handle small inputs.
1761 *
1762 * If a Func has several specializations, the first matching one
1763 * will be used, so the order in which you define specializations
1764 * is significant. For example:
1765 *
1766 \code
1767 f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1768 f.specialize(cond1);
1769 f.specialize(cond2);
1770 \endcode
1771 * is equivalent to:
1772 \code
1773 if (cond1) {
1774 for (int x = 0; x < width; x++) {
1775 f[x] = x + a - (cond2 ? c : d);
1776 }
1777 } else if (cond2) {
1778 for (int x = 0; x < width; x++) {
1779 f[x] = x + b - c;
1780 }
1781 } else {
1782 for (int x = 0; x < width; x++) {
1783 f[x] = x + b - d;
1784 }
1785 }
1786 \endcode
1787 *
1788 * Specializations may in turn be specialized, which creates a
1789 * nested if statement in the generated code.
1790 *
1791 \code
1792 f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1793 f.specialize(cond1).specialize(cond2);
1794 \endcode
1795 * This is equivalent to:
1796 \code
1797 if (cond1) {
1798 if (cond2) {
1799 for (int x = 0; x < width; x++) {
1800 f[x] = x + a - c;
1801 }
1802 } else {
1803 for (int x = 0; x < width; x++) {
1804 f[x] = x + a - d;
1805 }
1806 }
1807 } else {
1808 for (int x = 0; x < width; x++) {
1809 f[x] = x + b - (cond2 ? c : d);
1810 }
1811 }
1812 \endcode
1813 * To create a 4-way if statement that simplifies away all of the
1814 * ternary operators above, you could say:
1815 \code
1816 f.specialize(cond1).specialize(cond2);
1817 f.specialize(cond2);
1818 \endcode
1819 * or
1820 \code
1821 f.specialize(cond1 && cond2);
1822 f.specialize(cond1);
1823 f.specialize(cond2);
1824 \endcode
1825 *
1826 * Any prior Func which is compute_at some variable of this Func
1827 * gets separately included in all paths of the generated if
1828 * statement. The Var in the compute_at call must exist in all
1829 * paths, but it may have been generated via a different path of
1830 * splits, fuses, and renames. This can be used somewhat
1831 * creatively. Consider the following code:
1832 \code
1833 g(x, y) = 8*x;
1834 f(x, y) = g(x, y) + 1;
1835 f.compute_root().specialize(cond);
1836 Var g_loop;
1837 f.specialize(cond).rename(y, g_loop);
1838 f.rename(x, g_loop);
1839 g.compute_at(f, g_loop);
1840 \endcode
1841 * When cond is true, this is equivalent to g.compute_at(f,y).
1842 * When it is false, this is equivalent to g.compute_at(f,x).
1843 */
1844 Stage specialize(const Expr &condition);
1845
1846 /** Add a specialization to a Func that always terminates execution
1847 * with a call to halide_error(). By itself, this is of limited use,
1848 * but can be useful to terminate chains of specialize() calls where
1849 * no "default" case is expected (thus avoiding unnecessary code generation).
1850 *
1851 * For instance, say we want to optimize a pipeline to process images
1852 * in planar and interleaved format; we might typically do something like:
1853 \code
1854 ImageParam im(UInt(8), 3);
1855 Func f = do_something_with(im);
1856 f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1857 f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1858 \endcode
1859 * This code will vectorize along rows for the planar case, and across pixel
1860 * components for the interleaved case... but there is an implicit "else"
1861 * for the unhandled cases, which generates unoptimized code. If we never
1862 * anticipate passing any other sort of images to this, we can streamline
1863 * our code by adding specialize_fail():
1864 \code
1865 ImageParam im(UInt(8), 3);
1866 Func f = do_something_with(im);
1867 f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1868 f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1869 f.specialize_fail("Unhandled image format");
1870 \endcode
1871 * Conceptually, this produces code like:
1872 \code
1873 if (im.dim(0).stride() == 1) {
1874 do_something_planar();
1875 } else if (im.dim(2).stride() == 1) {
1876 do_something_interleaved();
1877 } else {
1878 halide_error("Unhandled image format");
1879 }
1880 \endcode
1881 *
1882 * Note that calling specialize_fail() terminates the specialization chain
1883 * for a given Func; you cannot create new specializations for the Func
1884 * afterwards (though you can retrieve handles to previous specializations).
1885 */
1886 void specialize_fail(const std::string &message);
1887
1888 /** Tell Halide that the following dimensions correspond to GPU
1889 * thread indices. This is useful if you compute a producer
1890 * function within the block indices of a consumer function, and
1891 * want to control how that function's dimensions map to GPU
1892 * threads. If the selected target is not an appropriate GPU, this
1893 * just marks those dimensions as parallel. */
1894 // @{
1896 Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1897 Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1898 // @}
1899
1900 /** The given dimension corresponds to the lanes in a GPU
1901 * warp. GPU warp lanes are distinguished from GPU threads by the
1902 * fact that all warp lanes run together in lockstep, which
1903 * permits lightweight communication of data from one lane to
1904 * another. */
1906
1907 /** Tell Halide to run this stage using a single gpu thread and
1908 * block. This is not an efficient use of your GPU, but it can be
1909 * useful to avoid copy-back for intermediate update stages that
1910 * touch a very small part of your Func. */
1912
1913 /** Tell Halide that the following dimensions correspond to GPU
1914 * block indices. This is useful for scheduling stages that will
1915 * run serially within each GPU block. If the selected target is
1916 * not ptx, this just marks those dimensions as parallel. */
1917 // @{
1919 Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1920 Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1921 // @}
1922
1923 /** Tell Halide that the following dimensions correspond to GPU
1924 * block indices and thread indices. If the selected target is not
1925 * ptx, these just mark the given dimensions as parallel. The
1926 * dimensions are consumed by this call, so do all other
1927 * unrolling, reordering, etc first. */
1928 // @{
1929 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1930 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
1931 const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1932 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
1933 const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1934 // @}
1935
1936 /** Short-hand for tiling a domain and mapping the tile indices
1937 * to GPU block indices and the coordinates within each tile to
1938 * GPU thread indices. Consumes the variables given, so do all
1939 * other scheduling first. */
1940 // @{
1941 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
1943 DeviceAPI device_api = DeviceAPI::Default_GPU);
1944
1945 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
1947 DeviceAPI device_api = DeviceAPI::Default_GPU);
1948 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1949 const VarOrRVar &bx, const VarOrRVar &by,
1950 const VarOrRVar &tx, const VarOrRVar &ty,
1951 const Expr &x_size, const Expr &y_size,
1953 DeviceAPI device_api = DeviceAPI::Default_GPU);
1954
1955 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1956 const VarOrRVar &tx, const VarOrRVar &ty,
1957 const Expr &x_size, const Expr &y_size,
1959 DeviceAPI device_api = DeviceAPI::Default_GPU);
1960
1961 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1962 const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
1963 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1964 const Expr &x_size, const Expr &y_size, const Expr &z_size,
1966 DeviceAPI device_api = DeviceAPI::Default_GPU);
1967 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1968 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1969 const Expr &x_size, const Expr &y_size, const Expr &z_size,
1971 DeviceAPI device_api = DeviceAPI::Default_GPU);
1972 // @}
1973
1974 /** Schedule for execution on Hexagon. When a loop is marked with
1975 * Hexagon, that loop is executed on a Hexagon DSP. */
1977
1978 /** Prefetch data written to or read from a Func or an ImageParam by a
1979 * subsequent loop iteration, at an optionally specified iteration offset.
1980 * 'var' specifies at which loop level the prefetch calls should be inserted.
1981 * The final argument specifies how prefetch of region outside bounds
1982 * should be handled.
1983 *
1984 * For example, consider this pipeline:
1985 \code
1986 Func f, g;
1987 Var x, y;
1988 f(x, y) = x + y;
1989 g(x, y) = 2 * f(x, y);
1990 \endcode
1991 *
1992 * The following schedule:
1993 \code
1994 f.compute_root();
1995 g.prefetch(f, x, 2, PrefetchBoundStrategy::NonFaulting);
1996 \endcode
1997 *
1998 * will inject a prefetch call at the innermost loop of 'g' and generate
1999 * the following loop nest:
2000 * for y = ...
2001 * for x = ...
2002 * f(x, y) = x + y
2003 * for y = ..
2004 * for x = ...
2005 * prefetch(&f[x + 2, y], 1, 16);
2006 * g(x, y) = 2 * f(x, y)
2007 */
2008 // @{
2009 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
2010 Func &prefetch(const Func &f, const VarOrRVar &var, int offset = 1,
2011 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2012 return prefetch(f, var, var, offset, strategy);
2013 }
2014 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
2015 Func &prefetch(const Internal::Parameter &param, const VarOrRVar &var, int offset = 1,
2016 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2017 return prefetch(param, var, var, offset, strategy);
2018 }
2019 template<typename T>
2020 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
2021 Func &prefetch(const T &image, VarOrRVar var, int offset = 1,
2022 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2023 return prefetch<T>(image, var, var, offset, strategy);
2024 }
2025 // @}
2026
2027 /** prefetch() is a more fine-grained version of prefetch(), which allows
2028 * specification of different vars for the location of the prefetch() instruction
2029 * vs. the location that is being prefetched:
2030 *
2031 * - the first var specified, 'at', indicates the loop in which the prefetch will be placed
2032 * - the second var specified, 'from', determines the var used to find the bounds to prefetch
2033 * (in conjunction with 'offset')
2034 *
2035 * If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at.'
2036 * Note that the value for 'offset' applies only to 'from', not 'at'.
2037 *
2038 * For example, consider this pipeline:
2039 \code
2040 Func f, g, h;
2041 Var x, y, z;
2042 f(x, y) = x + y;
2043 g(x, y) = 2 * f(x, y);
2044 h(x, y) = 3 * f(x, y);
2045 \endcode
2046 *
2047 * The following schedule:
2048 \code
2049 f.compute_root();
2050 g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
2051 h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
2052 \endcode
2053 *
2054 * will inject prefetch calls at the innermost loops of 'g' and 'h' and generate
2055 * the following loop nest:
2056 \code
2057 for y = ...
2058 for x = ...
2059 f(x, y) = x + y
2060 for y = ..
2061 for x = ...
2062 prefetch(&f[x + 2, y], 1, 16);
2063 g(x, y) = 2 * f(x, y)
2064 for y = ..
2065 for x = ...
2066 prefetch(&f[x, y + 2], 1, 16);
2067 h(x, y) = 3 * f(x, y)
2068 \endcode
2069 *
2070 * Note that the 'from' nesting level need not be adjacent to 'at':
2071 \code
2072 Func f, g;
2073 Var x, y, z, w;
2074 f(x, y, z, w) = x + y + z + w;
2075 g(x, y, z, w) = 2 * f(x, y, z, w);
2076 \endcode
2077 *
2078 * The following schedule:
2079 \code
2080 f.compute_root();
2081 g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
2082 \endcode
2083 *
2084 * will produce code that prefetches a tile of data:
2085 \code
2086 for w = ...
2087 for z = ...
2088 for y = ...
2089 for x = ...
2090 f(x, y, z, w) = x + y + z + w
2091 for w = ...
2092 for z = ...
2093 for y = ...
2094 for x0 = ...
2095 prefetch(&f[x0, y, z, w + 2], 1, 16);
2096 for x = ...
2097 g(x, y, z, w) = 2 * f(x, y, z, w)
2098 \endcode
2099 *
2100 * Note that calling prefetch() with the same var for both 'at' and 'from'
2101 * is equivalent to calling prefetch() with that var.
2102 */
2103 // @{
2104 Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2106 Func &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2108 template<typename T>
2109 Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2111 return prefetch(image.parameter(), at, from, std::move(offset), strategy);
2112 }
2113 // @}
2114
2115 /** Specify how the storage for the function is laid out. These
2116 * calls let you specify the nesting order of the dimensions. For
2117 * example, foo.reorder_storage(y, x) tells Halide to use
2118 * column-major storage for any realizations of foo, without
2119 * changing how you refer to foo in the code. You may want to do
2120 * this if you intend to vectorize across y. When representing
2121 * color images, foo.reorder_storage(c, x, y) specifies packed
2122 * storage (red, green, and blue values adjacent in memory), and
2123 * foo.reorder_storage(x, y, c) specifies planar storage (entire
2124 * red, green, and blue images one after the other in memory).
2125 *
2126 * If you leave out some dimensions, those remain in the same
2127 * positions in the nesting order while the specified variables
2128 * are reordered around them. */
2129 // @{
2130 Func &reorder_storage(const std::vector<Var> &dims);
2131
2132 Func &reorder_storage(const Var &x, const Var &y);
2133 template<typename... Args>
2134 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
2135 reorder_storage(const Var &x, const Var &y, Args &&...args) {
2136 std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
2137 return reorder_storage(collected_args);
2138 }
2139 // @}
2140
2141 /** Pad the storage extent of a particular dimension of
2142 * realizations of this function up to be a multiple of the
2143 * specified alignment. This guarantees that the strides for the
2144 * dimensions stored outside of dim will be multiples of the
2145 * specified alignment, where the strides and alignment are
2146 * measured in numbers of elements.
2147 *
2148 * For example, to guarantee that a function foo(x, y, c)
2149 * representing an image has scanlines starting on offsets
2150 * aligned to multiples of 16, use foo.align_storage(x, 16). */
2151 Func &align_storage(const Var &dim, const Expr &alignment);
2152
2153 /** Store realizations of this function in a circular buffer of a
2154 * given extent. This is more efficient when the extent of the
2155 * circular buffer is a power of 2. If the fold factor is too
2156 * small, or the dimension is not accessed monotonically, the
2157 * pipeline will generate an error at runtime.
2158 *
2159 * The fold_forward option indicates that the new values of the
2160 * producer are accessed by the consumer in a monotonically
2161 * increasing order. Folding storage of producers is also
2162 * supported if the new values are accessed in a monotonically
2163 * decreasing order by setting fold_forward to false.
2164 *
2165 * For example, consider the pipeline:
2166 \code
2167 Func f, g;
2168 Var x, y;
2169 g(x, y) = x*y;
2170 f(x, y) = g(x, y) + g(x, y+1);
2171 \endcode
2172 *
2173 * If we schedule f like so:
2174 *
2175 \code
2176 g.compute_at(f, y).store_root().fold_storage(y, 2);
2177 \endcode
2178 *
2179 * Then g will be computed at each row of f and stored in a buffer
2180 * with an extent in y of 2, alternately storing each computed row
2181 * of g in row y=0 or y=1.
2182 */
2183 Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);
2184
2185 /** Compute this function as needed for each unique value of the
2186 * given var for the given calling function f.
2187 *
2188 * For example, consider the simple pipeline:
2189 \code
2190 Func f, g;
2191 Var x, y;
2192 g(x, y) = x*y;
2193 f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2194 \endcode
2195 *
2196 * If we schedule f like so:
2197 *
2198 \code
2199 g.compute_at(f, x);
2200 \endcode
2201 *
2202 * Then the C code equivalent to this pipeline will look like this
2203 *
2204 \code
2205
2206 int f[height][width];
2207 for (int y = 0; y < height; y++) {
2208 for (int x = 0; x < width; x++) {
2209 int g[2][2];
2210 g[0][0] = x*y;
2211 g[0][1] = (x+1)*y;
2212 g[1][0] = x*(y+1);
2213 g[1][1] = (x+1)*(y+1);
2214 f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2215 }
2216 }
2217
2218 \endcode
2219 *
2220 * The allocation and computation of g is within f's loop over x,
2221 * and enough of g is computed to satisfy all that f will need for
2222 * that iteration. This has excellent locality - values of g are
2223 * used as soon as they are computed, but it does redundant
2224 * work. Each value of g ends up getting computed four times. If
2225 * we instead schedule f like so:
2226 *
2227 \code
2228 g.compute_at(f, y);
2229 \endcode
2230 *
2231 * The equivalent C code is:
2232 *
2233 \code
2234 int f[height][width];
2235 for (int y = 0; y < height; y++) {
2236 int g[2][width+1];
2237 for (int x = 0; x < width+1; x++) {
2238 g[0][x] = x*y;
2239 g[1][x] = x*(y+1);
2240 }
2241 for (int x = 0; x < width; x++) {
2242 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2243 }
2244 }
2245 \endcode
2246 *
2247 * The allocation and computation of g is within f's loop over y,
2248 * and enough of g is computed to satisfy all that f will need for
2249 * that iteration. This does less redundant work (each point in g
2250 * ends up being evaluated twice), but the locality is not quite
2251 * as good, and we have to allocate more temporary memory to store
2252 * g.
2253 */
2254 Func &compute_at(const Func &f, const Var &var);
2255
2256 /** Schedule a function to be computed within the iteration over
2257 * some dimension of an update domain. Produces equivalent code
2258 * to the version of compute_at that takes a Var. */
2259 Func &compute_at(const Func &f, const RVar &var);
2260
2261 /** Schedule a function to be computed within the iteration over
2262 * a given LoopLevel. */
2264
2265 /** Schedule the iteration over the initial definition of this function
2266 * to be fused with another stage 's' from outermost loop to a
2267 * given LoopLevel. */
2268 // @{
2269 Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2271 Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2273
2274 /** Compute all of this function once ahead of time. Reusing
2275 * the example in \ref Func::compute_at :
2276 *
2277 \code
2278 Func f, g;
2279 Var x, y;
2280 g(x, y) = x*y;
2281 f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2282
2283 g.compute_root();
2284 \endcode
2285 *
2286 * is equivalent to
2287 *
2288 \code
2289 int f[height][width];
2290 int g[height+1][width+1];
2291 for (int y = 0; y < height+1; y++) {
2292 for (int x = 0; x < width+1; x++) {
2293 g[y][x] = x*y;
2294 }
2295 }
2296 for (int y = 0; y < height; y++) {
2297 for (int x = 0; x < width; x++) {
2298 f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
2299 }
2300 }
2301 \endcode
2302 *
2303 * g is computed once ahead of time, and enough is computed to
2304 * satisfy all uses of it. This does no redundant work (each point
2305 * in g is evaluated once), but has poor locality (values of g are
2306 * probably not still in cache when they are used by f), and
2307 * allocates lots of temporary memory to store g.
2308 */
2310
2311 /** Use the halide_memoization_cache_... interface to store a
2312 * computed version of this function across invocations of the
2313 * Func.
2314 *
2315 * If an eviction_key is provided, it must be constructed with
2316 * Expr of integer or handle type. The key Expr will be promoted
2317 * to a uint64_t and can be used with halide_memoization_cache_evict
2318 * to remove memoized entries using this eviction key from the
2319 * cache. Memoized computations that do not provide an eviction
2320 * key will never be evicted by this mechanism.
2321 */
2322 Func &memoize(const EvictionKey &eviction_key = EvictionKey());
2323
2324 /** Produce this Func asynchronously in a separate
2325 * thread. Consumers will be run by the task system when the
2326 * production is complete. If this Func's store level is different
2327 * to its compute level, consumers will be run concurrently,
2328 * blocking as necessary to prevent reading ahead of what the
2329 * producer has computed. If storage is folded, then the producer
2330 * will additionally not be permitted to run too far ahead of the
2331 * consumer, to avoid clobbering data that has not yet been
2332 * used.
2333 *
2334 * Take special care when combining this with custom thread pool
2335 * implementations, as avoiding deadlock with producer-consumer
2336 * parallelism requires a much more sophisticated parallel runtime
2337 * than with data parallelism alone. It is strongly recommended
2338 * you just use Halide's default thread pool, which guarantees no
2339 * deadlock and a bound on the number of threads launched.
2340 */
2342
2343 /** Allocate storage for this function within f's loop over
2344 * var. Scheduling storage is optional, and can be used to
2345 * separate the loop level at which storage occurs from the loop
2346 * level at which computation occurs to trade off between locality
2347 * and redundant work. This can open the door for two types of
2348 * optimization.
2349 *
2350 * Consider again the pipeline from \ref Func::compute_at :
2351 \code
2352 Func f, g;
2353 Var x, y;
2354 g(x, y) = x*y;
2355 f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2356 \endcode
2357 *
2358 * If we schedule it like so:
2359 *
2360 \code
2361 g.compute_at(f, x).store_at(f, y);
2362 \endcode
2363 *
2364 * Then the computation of g takes place within the loop over x,
2365 * but the storage takes place within the loop over y:
2366 *
2367 \code
2368 int f[height][width];
2369 for (int y = 0; y < height; y++) {
2370 int g[2][width+1];
2371 for (int x = 0; x < width; x++) {
2372 g[0][x] = x*y;
2373 g[0][x+1] = (x+1)*y;
2374 g[1][x] = x*(y+1);
2375 g[1][x+1] = (x+1)*(y+1);
2376 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2377 }
2378 }
2379 \endcode
2380 *
2381 * Provided the for loop over x is serial, Halide then
2382 * automatically performs the following sliding window
2383 * optimization:
2384 *
2385 \code
2386 int f[height][width];
2387 for (int y = 0; y < height; y++) {
2388 int g[2][width+1];
2389 for (int x = 0; x < width; x++) {
2390 if (x == 0) {
2391 g[0][x] = x*y;
2392 g[1][x] = x*(y+1);
2393 }
2394 g[0][x+1] = (x+1)*y;
2395 g[1][x+1] = (x+1)*(y+1);
2396 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2397 }
2398 }
2399 \endcode
2400 *
2401 * Two of the assignments to g only need to be done when x is
2402 * zero. The rest of the time, those sites have already been
2403 * filled in by a previous iteration. This version has the
2404 * locality of compute_at(f, x), but allocates more memory and
2405 * does much less redundant work.
2406 *
2407 * Halide then further optimizes this pipeline like so:
2408 *
2409 \code
2410 int f[height][width];
2411 for (int y = 0; y < height; y++) {
2412 int g[2][2];
2413 for (int x = 0; x < width; x++) {
2414 if (x == 0) {
2415 g[0][0] = x*y;
2416 g[1][0] = x*(y+1);
2417 }
2418 g[0][(x+1)%2] = (x+1)*y;
2419 g[1][(x+1)%2] = (x+1)*(y+1);
2420 f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
2421 }
2422 }
2423 \endcode
2424 *
2425 * Halide has detected that it's possible to use a circular buffer
2426 * to represent g, and has reduced all accesses to g modulo 2 in
2427 * the x dimension. This optimization only triggers if the for
2428 * loop over x is serial, and if Halide can statically determine
2429 * some power of two large enough to cover the range needed. For
2430 * powers of two, the modulo operator compiles to more efficient
2431 * bit-masking. This optimization reduces memory usage, and also
2432 * improves locality by reusing recently-accessed memory instead
2433 * of pulling new memory into cache.
2434 *
2435 */
2436 Func &store_at(const Func &f, const Var &var);
2437
2438 /** Equivalent to the version of store_at that takes a Var, but
2439 * schedules storage within the loop over a dimension of a
2440 * reduction domain */
2441 Func &store_at(const Func &f, const RVar &var);
2442
2443 /** Equivalent to the version of store_at that takes a Var, but
2444 * schedules storage at a given LoopLevel. */
2446
2447 /** Equivalent to \ref Func::store_at, but schedules storage
2448 * outside the outermost loop. */
2450
2451 /** Aggressively inline all uses of this function. This is the
2452 * default schedule, so you're unlikely to need to call this. For
2453 * a Func with an update definition, that means it gets computed
2454 * as close to the innermost loop as possible.
2455 *
2456 * Consider once more the pipeline from \ref Func::compute_at :
2457 *
2458 \code
2459 Func f, g;
2460 Var x, y;
2461 g(x, y) = x*y;
2462 f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2463 \endcode
2464 *
2465 * Leaving g as inline, this compiles to code equivalent to the following C:
2466 *
2467 \code
2468 int f[height][width];
2469 for (int y = 0; y < height; y++) {
2470 for (int x = 0; x < width; x++) {
2471 f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
2472 }
2473 }
2474 \endcode
2475 */
2477
2478 /** Get a handle on an update step for the purposes of scheduling
2479 * it. */
2480 Stage update(int idx = 0);
2481
2482 /** Set the type of memory this Func should be stored in. Controls
2483 * whether allocations go on the stack or the heap on the CPU, and
2484 * in global vs shared vs local on the GPU. See the documentation
2485 * on MemoryType for more detail. */
2486 Func &store_in(MemoryType memory_type);
2487
2488 /** Trace all loads from this Func by emitting calls to
2489 * halide_trace. If the Func is inlined, this has no
2490 * effect. */
2492
2493 /** Trace all stores to the buffer backing this Func by emitting
2494 * calls to halide_trace. If the Func is inlined, this call
2495 * has no effect. */
2497
2498 /** Trace all realizations of this Func by emitting calls to
2499 * halide_trace. */
2501
2502 /** Add a string of arbitrary text that will be passed thru to trace
2503 * inspection code if the Func is realized in trace mode. (Funcs that are
2504 * inlined won't have their tags emitted.) Ignored entirely if
2505 * tracing is not enabled for the Func (or globally).
2506 */
2507 Func &add_trace_tag(const std::string &trace_tag);
2508
2509 /** Get a handle on the internal halide function that this Func
2510 * represents. Useful if you want to do introspection on Halide
2511 * functions */
2512 Internal::Function function() const {
2513 return func;
2514 }
2515
2516 /** You can cast a Func to its pure stage for the purposes of
2517 * scheduling it. */
2518 operator Stage() const;
2519
2520 /** Get a handle on the output buffer for this Func. Only relevant
2521 * if this is the output Func in a pipeline. Useful for making
2522 * static promises about strides, mins, and extents. */
2523 // @{
2525 std::vector<OutputImageParam> output_buffers() const;
2526 // @}
2527
2528 /** Use a Func as an argument to an external stage. */
2529 operator ExternFuncArgument() const;
2530
2531 /** Infer the arguments to the Func, sorted into a canonical order:
2532 * all buffers (sorted alphabetically by name), followed by all non-buffers
2533 * (sorted alphabetically by name).
2534 This lets you write things like:
2535 \code
2536 func.compile_to_assembly("/dev/stdout", func.infer_arguments());
2537 \endcode
2538 */
2539 std::vector<Argument> infer_arguments() const;
2540
2541 /** Get the source location of the pure definition of this
2542 * Func. See Stage::source_location() */
2543 std::string source_location() const;
2544
2545 /** Return the current StageSchedule associated with this initial
2546 * Stage of this Func. For introspection only: to modify schedule,
2547 * use the Func interface. */
2549 return Stage(*this).get_schedule();
2550 }
2551};
2552
2553namespace Internal {
2554
2555template<typename Last>
2556inline void check_types(const Tuple &t, int idx) {
2557 using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2558 user_assert(t[idx].type() == type_of<T>())
2559 << "Can't evaluate expression "
2560 << t[idx] << " of type " << t[idx].type()
2561 << " as a scalar of type " << type_of<T>() << "\n";
2562}
2563
2564template<typename First, typename Second, typename... Rest>
2565inline void check_types(const Tuple &t, int idx) {
2566 check_types<First>(t, idx);
2567 check_types<Second, Rest...>(t, idx + 1);
2568}
2569
2570template<typename Last>
2571inline void assign_results(Realization &r, int idx, Last last) {
2572 using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2573 *last = Buffer<T>(r[idx])();
2574}
2575
2576template<typename First, typename Second, typename... Rest>
2577inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
2578 assign_results<First>(r, idx, first);
2579 assign_results<Second, Rest...>(r, idx + 1, second, rest...);
2580}
2581
2582} // namespace Internal
2583
2584/** JIT-Compile and run enough code to evaluate a Halide
2585 * expression. This can be thought of as a scalar version of
2586 * \ref Func::realize */
2587template<typename T>
2589 user_assert(e.type() == type_of<T>())
2590 << "Can't evaluate expression "
2591 << e << " of type " << e.type()
2592 << " as a scalar of type " << type_of<T>() << "\n";
2593 Func f;
2594 f() = e;
2595 Buffer<T> im = f.realize();
2596 return im();
2597}
2598
2599/** JIT-compile and run enough code to evaluate a Halide Tuple. */
2600template<typename First, typename... Rest>
2601HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
2602 Internal::check_types<First, Rest...>(t, 0);
2603
2604 Func f;
2605 f() = t;
2606 Realization r = f.realize();
2607 Internal::assign_results(r, 0, first, rest...);
2608}
2609
2610namespace Internal {
2611
2612inline void schedule_scalar(Func f) {
2614 if (t.has_gpu_feature()) {
2616 }
2617 if (t.has_feature(Target::HVX)) {
2618 f.hexagon();
2619 }
2620}
2621
2622} // namespace Internal
2623
2624/** JIT-Compile and run enough code to evaluate a Halide
2625 * expression. This can be thought of as a scalar version of
2626 * \ref Func::realize. Can use GPU if jit target from environment
2627 * specifies one.
2628 */
2629template<typename T>
2631 user_assert(e.type() == type_of<T>())
2632 << "Can't evaluate expression "
2633 << e << " of type " << e.type()
2634 << " as a scalar of type " << type_of<T>() << "\n";
2635 Func f;
2636 f() = e;
2638 Buffer<T> im = f.realize();
2639 return im();
2640}
2641
2642/** JIT-compile and run enough code to evaluate a Halide Tuple. Can
2643 * use GPU if jit target from environment specifies one. */
2644// @{
2645template<typename First, typename... Rest>
2646HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
2647 Internal::check_types<First, Rest...>(t, 0);
2648
2649 Func f;
2650 f() = t;
2652 Realization r = f.realize();
2653 Internal::assign_results(r, 0, first, rest...);
2654}
2655// @}
2656
2657} // namespace Halide
2658
2659#endif
Defines a type used for expressing the type signature of a generated halide pipeline.
#define internal_assert(c)
Definition: Errors.h:19
#define user_assert(c)
Definition: Errors.h:15
Base classes for Halide expressions (Halide::Expr) and statements (Halide::Internal::Stmt)
#define HALIDE_ATTRIBUTE_DEPRECATED(x)
Defines the struct representing lifetime and dependencies of a JIT compiled halide pipeline.
Defines Module, an IR container that fully describes a Halide program.
Classes for declaring scalar parameters to halide pipelines.
Defines the front-end class representing an entire Halide imaging pipeline.
Defines the front-end syntax for reduction domains and reduction variables.
Defines the structure that describes a Halide target.
Defines Tuple - the front-end handle on small arrays of expressions.
#define HALIDE_NO_USER_CODE_INLINE
Definition: Util.h:45
Defines the Var - the front-end variable.
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition: Buffer.h:115
Helper class for identifying purpose of an Expr passed to memoize.
Definition: Func.h:683
EvictionKey(const Expr &expr=Expr())
Definition: Func.h:689
A halide function.
Definition: Func.h:698
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices and thread indices.
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
Func & bound_extent(const Var &var, Expr extent)
Bound the extent of a Func's realization, but not its min.
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.
Func & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then unroll the inner dimension.
bool is_extern() const
Is this function an external stage? That is, was it defined using define_extern?
FuncRef operator()(std::vector< Expr >) const
Either calls to the function, or the left-hand-side of an update definition (see RDom).
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func(const std::string &name)
Declare a new undefined function with the given name.
void compile_to_multitarget_object_files(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets, const std::vector< std::string > &suffixes)
Like compile_to_multitarget_static_library(), except that the object files are all output as object f...
Func & memoize(const EvictionKey &eviction_key=EvictionKey())
Use the halide_memoization_cache_... interface to store a computed version of this function across in...
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.
Func & trace_loads()
Trace all loads from this Func by emitting calls to halide_trace.
void specialize_fail(const std::string &message)
Add a specialization to a Func that always terminates execution with a call to halide_error().
Func & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then vectorize the inner dimension.
Func & compute_at(const Func &f, const RVar &var)
Schedule a function to be computed within the iteration over some dimension of an update domain.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
The generalized tile, with a single tail strategy to apply to all vars.
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to text assembly equivalent to the object file generated by compile_...
bool has_update_definition() const
Does this function have at least one update definition?
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
Func & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Func()
Declare a new undefined function with an automatically-generated unique name.
Func & store_in(MemoryType memory_type)
Set the type of memory this Func should be stored in.
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
void realize(Pipeline::RealizationArg outputs, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function into an existing allocated buffer or buffers.
Func & async()
Produce this Func asynchronously in a separate thread.
void set_custom_trace(int(*trace_fn)(void *, const halide_trace_event_t *))
Set custom routines to call when tracing is enabled.
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Expr, Args... >::value, FuncRef >::type operator()(const Expr &x, Args &&...args) const
Definition: Func.h:1310
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & set_estimate(const Var &var, const Expr &min, const Expr &extent)
Statically declare the range over which the function will be evaluated in the general case.
Func & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
The given dimension corresponds to the lanes in a GPU warp.
void compile_to_lowered_stmt(const std::string &filename, const std::vector< Argument > &args, StmtOutputFormat fmt=Text, const Target &target=get_target_from_environment())
Write out an internal representation of lowered code.
void compile_to_c(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Statically compile this function to C source code.
Stage update(int idx=0)
Get a handle on an update step for the purposes of scheduling it.
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
bool defined() const
Does this function have at least a pure definition.
Func & align_storage(const Var &dim, const Expr &alignment)
Pad the storage extent of a particular dimension of realizations of this function up to be a multiple...
Func copy_to_host()
Declare that this function should be implemented by a call to halide_buffer_copy with a NULL target d...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & serial(const VarOrRVar &var)
Mark a dimension to be traversed serially.
void compile_to_header(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Emit a header file with the given filename for this function.
Func & align_bounds(const Var &var, Expr modulus, Expr remainder=0)
Expand the region computed so that the min coordinate is congruent to 'remainder' modulo 'modulus',...
Func & reorder_storage(const Var &x, const Var &y)
const Internal::JITHandlers & jit_handlers()
Get a struct containing the currently set custom functions used by JIT.
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
const std::vector< Type > & output_types() const
Get the types of the outputs of this Func.
Func(const Expr &e)
Declare a new function with an automatically-generated unique name, and define it to return the given...
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...
int dimensions() const
The dimensionality (number of arguments) of this function.
void set_custom_do_par_for(int(*custom_do_par_for)(void *, int(*)(void *, int, uint8_t *), int, int, uint8_t *))
Set a custom parallel for loop launcher.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
std::string source_location() const
Get the source location of the pure definition of this Func.
void infer_input_bounds(const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
For a given size of output, or a given output buffer, determine the bounds required of all unbound Im...
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
A more general form of tile, which defines tiles of any dimensionality.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
const std::vector< Expr > & update_args(int idx=0) const
Get the left-hand-side of the update definition.
int outputs() const
Get the number of outputs of this Func.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, Func & >::type reorder_storage(const Var &x, const Var &y, Args &&...args)
Definition: Func.h:2135
Func & compute_root()
Compute all of this function once ahead of time.
Func & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
void set_custom_allocator(void *(*malloc)(void *, size_t), void(*free)(void *, void *))
Set a custom malloc and free for halide to use.
Func & trace_realizations()
Trace all realizations of this Func by emitting calls to halide_trace.
std::vector< Var > args() const
Get the pure arguments.
Tuple update_values(int idx=0) const
Get the right-hand-side of an update definition for functions that returns multiple values.
Func & allow_race_conditions()
Specify that race conditions are permitted for this Func, which enables parallelizing over RVars even...
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm bitcode, with the given filename (which should probably end ...
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:1614
int num_update_definitions() const
How many update definitions does this function have?
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
A shorter form of tile, which reuses the old variable names as the new outer dimensions.
Stage specialize(const Expr &condition)
Specialize a Func.
void set_custom_do_task(int(*custom_do_task)(void *, int(*)(void *, int, uint8_t *), int, uint8_t *))
Set a custom task handler to be called by the parallel for loop.
Func & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
Func & store_at(LoopLevel loop_level)
Equivalent to the version of store_at that takes a Var, but schedules storage at a given LoopLevel.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Func & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Schedule the iteration over the initial definition of this function to be fused with another stage's...
Expr value() const
The right-hand-side value of the pure definition of this function.
Func & align_extent(const Var &var, Expr modulus)
Expand the region computed so that the extent is a multiple of 'modulus'.
void set_error_handler(void(*handler)(void *, const char *))
Set the error handler function that will be called in the case of runtime errors during halide pipelines.
Func clone_in(const std::vector< Func > &fs)
Module compile_to_module(const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Store an internal representation of lowered code as a self contained Module suitable for further comp...
Func & atomic(bool override_associativity_test=false)
Issue atomic updates for this Func.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1240
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
void set_custom_print(void(*handler)(void *, const char *))
Set the function called to print messages from the runtime.
Func & set_estimates(const Region &estimates)
Set (min, extent) estimates for all dimensions in the Func at once; this is equivalent to calling set...
Func in()
Create and return a global identity wrapper, which wraps all calls to this Func by any other Func.
OutputImageParam output_buffer() const
Get a handle on the output buffer for this Func.
Expr update_value(int idx=0) const
Get the right-hand-side of an update definition.
std::vector< Argument > infer_arguments() const
Infer the arguments to the Func, sorted into a canonical order: all buffers (sorted alphabetically by...
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
void add_custom_lowering_pass(T *pass)
Add a custom pass to be used during lowering.
Definition: Func.h:1127
Func in(const std::vector< Func > &fs)
Create and return an identity wrapper shared by all the Funcs in 'fs'.
void compile_to(const std::map< Output, std::string > &output_files, const std::vector< Argument > &args, const std::string &fn_name, const Target &target=get_target_from_environment())
Compile and generate multiple target files with single call.
Func & fold_storage(const Var &dim, const Expr &extent, bool fold_forward=true)
Store realizations of this function in a circular buffer of a given extent.
Func & compute_at(LoopLevel loop_level)
Schedule a function to be computed within the iteration over a given LoopLevel.
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition: Func.h:1222
void compile_to_file(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to object file and header pair, with the given arguments.
void add_custom_lowering_pass(Internal::IRMutator *pass, std::function< void()> deleter)
Add a custom pass to be used during lowering, with the function that will be called to delete it also...
Func & add_trace_tag(const std::string &trace_tag)
Add a string of arbitrary text that will be passed thru to trace inspection code if the Func is reali...
Func & store_at(const Func &f, const RVar &var)
Equivalent to the version of store_at that takes a Var, but schedules storage within the loop over a ...
void clear_custom_lowering_passes()
Remove all previously-set custom lowering passes.
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm assembly, with the given filename (which should probably end...
const std::string & name() const
The name of this function, either given during construction, or automatically generated.
void compile_to_multitarget_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets)
Compile to static-library file and header pair once for each target; each resulting function will be ...
Func & prefetch(const Func &f, const VarOrRVar &var, int offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Prefetch data written to or read from a Func or an ImageParam by a subsequent loop iteration,...
Definition: Func.h:2010
Func & hexagon(const VarOrRVar &x=Var::outermost())
Schedule for execution on Hexagon.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Generalized tiling, reusing the previous names as the outer names.
Func & store_root()
Equivalent to Func::store_at, but schedules storage outside the outermost loop.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
prefetch() is a more fine-grained version of prefetch(), which allows specification of different vars...
HALIDE_NO_USER_CODE_INLINE Func(Buffer< T > &im)
Construct a new Func to wrap a Buffer.
Definition: Func.h:744
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
std::vector< RVar > rvars(int idx=0) const
Get the RVars of the reduction domain for an update definition, if there is one.
Func clone_in(const Func &f)
Similar to Func::in; however, instead of replacing the call to this Func with an identity Func that r...
const std::vector< CustomLoweringPass > & custom_lowering_passes()
Get the custom lowering passes.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, FuncRef >::type operator()(Args &&...args) const
Definition: Func.h:1293
Func & compute_inline()
Aggressively inline all uses of this function.
Func(Internal::Function f)
Construct a new Func to wrap an existing, already-defined Function object.
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to an object file, with the given filename (which should probably en...
Func & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Rename a dimension.
Tuple values() const
The values returned by this function.
const std::string & extern_function_name() const
Get the name of the extern function called for an extern definition.
Func copy_to_device(DeviceAPI d=DeviceAPI::Default_GPU)
Declare that this function should be implemented by a call to halide_buffer_copy with the given targe...
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & reorder_storage(const std::vector< Var > &dims)
Specify how the storage for the function is laid out.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1250
Func & prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
void debug_to_file(const std::string &filename)
When this function is compiled, include code that dumps its values to a file after it is realized,...
Func & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given task_size, and then parallelize the outer dimension.
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimension.
Func in(const Func &f)
Creates and returns a new identity Func that wraps this Func.
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
std::vector< OutputImageParam > output_buffers() const
Func & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:2109
void compile_to_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to static-library file and header pair, with the given arguments.
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
void infer_input_bounds(Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
FuncRef operator()(std::vector< Var >) const
Construct either the left-hand-side of a definition, or a call to a function that happens to only co...
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this initial Stage of this Func.
Definition: Func.h:2548
Func & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide to run this stage using a single gpu thread and block.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling)
Definition: Func.h:1232
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
A fragment of front-end syntax of the form f(x, y, z), where x, y, z are Vars or Exprs.
Definition: Func.h:489
Stage operator*=(const FuncRef &)
FuncTupleElementRef operator[](int) const
When a FuncRef refers to a function that provides multiple outputs, you can access each output as an ...
Stage operator-=(const FuncRef &)
size_t size() const
How many outputs does the function this refers to produce.
Internal::Function function() const
What function is this calling?
Definition: Func.h:586
Stage operator+=(Expr)
Define a stage that adds the given expression to this Func.
Stage operator-=(Expr)
Define a stage that adds the negative of the given expression to this Func.
Stage operator*=(Expr)
Define a stage that multiplies this Func by the given expression.
Stage operator-=(const Tuple &)
Stage operator/=(Expr)
Define a stage that divides this Func by the given expression.
Stage operator+=(const FuncRef &)
Stage operator=(const Expr &)
Use this as the left-hand-side of a definition or an update definition (see RDom).
Stage operator=(const FuncRef &)
FuncRef(Internal::Function, const std::vector< Var > &, int placeholder_pos=-1, int count=0)
Stage operator+=(const Tuple &)
FuncRef(const Internal::Function &, const std::vector< Expr > &, int placeholder_pos=-1, int count=0)
Stage operator/=(const FuncRef &)
Stage operator*=(const Tuple &)
Stage operator/=(const Tuple &)
Stage operator=(const Tuple &)
Use this as the left-hand-side of a definition or an update definition for a Func with multiple outpu...
A fragment of front-end syntax of the form f(x, y, z)[index], where x, y, z are Vars or Exprs.
Definition: Func.h:608
int index() const
Return index to the function outputs.
Definition: Func.h:672
Stage operator+=(const Expr &e)
Define a stage that adds the given expression to Tuple component 'idx' of this Func.
Stage operator*=(const Expr &e)
Define a stage that multiplies Tuple component 'idx' of this Func by the given expression.
Stage operator/=(const Expr &e)
Define a stage that divides Tuple component 'idx' of this Func by the given expression.
Stage operator=(const Expr &e)
Use this as the left-hand-side of an update definition of Tuple component 'idx' of a Func (see RDom).
Stage operator=(const FuncRef &e)
Stage operator-=(const Expr &e)
Define a stage that adds the negative of the given expression to Tuple component 'idx' of this Func.
FuncTupleElementRef(const FuncRef &ref, const std::vector< Expr > &args, int idx)
An Image parameter to a halide pipeline.
Definition: ImageParam.h:23
A Function definition which can represent either an init or an update definition.
Definition: Definition.h:38
const std::vector< Expr > & args() const
Get the default (no-specialization) arguments (left-hand-side) of the definition.
const StageSchedule & schedule() const
Get the default (no-specialization) stage-specific schedule associated with this definition.
bool defined() const
Definition objects are nullable.
A reference-counted handle to Halide's internal representation of a function.
Definition: Function.h:38
A base class for passes over the IR which modify it (e.g.
Definition: IRMutator.h:26
A reference-counted handle to a parameter to a halide pipeline.
Definition: Parameter.h:29
A schedule for a single stage of a Halide pipeline.
Definition: Schedule.h:643
bool & touched()
This flag is set to true if the dims list has been manipulated by the user (or if a ScheduleHandle wa...
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition: Schedule.h:176
A halide module.
Definition: Module.h:135
A handle on the output buffer of a pipeline.
static const ParamMap & empty_map()
A const ref to an empty ParamMap.
Definition: ParamMap.h:104
A class representing a Halide pipeline.
Definition: Pipeline.h:99
A multi-dimensional domain over which to iterate.
Definition: RDom.h:193
A reduction variable represents a single dimension of a reduction domain (RDom).
Definition: RDom.h:29
const std::string & name() const
The name of this reduction variable.
A Realization is a vector of references to existing Buffer objects.
Definition: Realization.h:21
A single definition of a Func.
Definition: Func.h:70
Stage & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
std::string name() const
Return the name of this stage, e.g.
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:379
Stage & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Func rfactor(const RVar &r, const Var &v)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & vectorize(const VarOrRVar &var)
Stage & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & unroll(const VarOrRVar &var)
Stage & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Schedule the iteration over this stage to be fused with another stage 's' from outermost loop to a gi...
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & prefetch(const Func &f, const VarOrRVar &var, int offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:446
Func rfactor(std::vector< std::pair< RVar, Var > > preserved)
Calling rfactor() on an associative update definition of a Func will split the update into an intermedia...
Stage & allow_race_conditions()
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Stage & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Stage & prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Stage specialize(const Expr &condition)
Stage & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:466
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & reorder(const std::vector< VarOrRVar > &vars)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
Definition: Func.h:94
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
Stage & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Stage & parallel(const VarOrRVar &var)
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this Stage.
Definition: Func.h:108
Stage & serial(const VarOrRVar &var)
Stage & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Stage & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
void specialize_fail(const std::string &message)
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & hexagon(const VarOrRVar &x=Var::outermost())
Stage & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Scheduling calls that control how the domain of this stage is traversed.
Stage & atomic(bool override_associativity_test=false)
std::string source_location() const
Attempt to get the source file and line where this stage was defined by parsing the process's own deb...
Stage & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
std::string dump_argument_list() const
Return a string describing the current var list taking into account all the splits,...
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Create a small array of Exprs for defining and calling functions with multiple outputs.
Definition: Tuple.h:18
A Halide variable, to be used when defining functions.
Definition: Var.h:19
const std::string & name() const
Get the name of a Var.
static Var outermost()
A Var that represents the location outside the outermost loop.
Definition: Var.h:163
void schedule_scalar(Func f)
Definition: Func.h:2612
std::vector< Var > make_argument_list(int dimensionality)
Make a list of unique arguments for definitions with unnamed arguments.
void assign_results(Realization &r, int idx, Last last)
Definition: Func.h:2571
void check_types(const Tuple &t, int idx)
Definition: Func.h:2556
ForType
An enum describing a type of loop traversal.
Definition: Expr.h:395
WEAK halide_do_task_t custom_do_task
WEAK halide_do_par_for_t custom_do_par_for
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2588
PrefetchBoundStrategy
Different ways to handle accesses outside the original extents in a prefetch.
@ GuardWithIf
Guard the prefetch with if-guards that ignore the prefetch if any of the prefetched region ever goes...
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2630
TailStrategy
Different ways to handle a tail case in a split when the factor does not provably divide the extent.
Definition: Schedule.h:32
@ Auto
For pure definitions use ShiftInwards.
LoopAlignStrategy
Different ways to handle the case when the start/end of the loops of stages computed with (fused) are...
Definition: Schedule.h:110
@ Auto
By default, LoopAlignStrategy is set to NoAlign.
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:595
NameMangling
An enum to specify calling convention for extern stages.
Definition: Function.h:24
@ Default
Match whatever is specified in the Target.
Target get_jit_target_from_environment()
Return the target that Halide will use for jit-compilation.
DeviceAPI
An enum describing a type of device API.
Definition: DeviceAPI.h:15
@ Host
Used to denote for loops that run on the same device as the containing code.
Target get_target_from_environment()
Return the target that Halide will use.
StmtOutputFormat
Used to determine if the output printed to file should be as a normal string or as an HTML file which...
Definition: Pipeline.h:63
@ Text
Definition: Pipeline.h:64
Stage ScheduleHandle
Definition: Func.h:480
std::vector< Range > Region
A multi-dimensional box.
Definition: Expr.h:343
Expr max(const FuncRef &a, const FuncRef &b)
Definition: Func.h:598
MemoryType
An enum describing different address spaces to be used with Func::store_in.
Definition: Expr.h:346
void * malloc(size_t)
unsigned __INT8_TYPE__ uint8_t
void free(void *)
A fragment of Halide syntax.
Definition: Expr.h:256
HALIDE_ALWAYS_INLINE Type type() const
Get the type of this expression node.
Definition: Expr.h:320
An argument to an extern-defined Func.
A struct representing a target machine and os to generate code for.
Definition: Target.h:19
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled? I.e.
bool has_feature(Feature f) const
Types in the halide type system.
Definition: Type.h:265
A class that can represent Vars or RVars.
Definition: Func.h:30
bool is_rvar
Definition: Func.h:58
VarOrRVar(const Var &v)
Definition: Func.h:34
VarOrRVar(const RVar &r)
Definition: Func.h:37
VarOrRVar(const std::string &n, bool r)
Definition: Func.h:31
VarOrRVar(const ImplicitVar< N > &u)
Definition: Func.h:44
const std::string & name() const
Definition: Func.h:48
VarOrRVar(const RDom &r)
Definition: Func.h:40