How does halide aot debug the running process?

  aot, c++, debugging, halide
  1. Adding the -g compile option after a crash still does not produce usable debug information
    ./bin/op_generator -g halide_u8_add ${GENERATE_FILES} -o $(X86_64_OPGEN_DIR) target=host
# Location of the prebuilt Halide SDK (headers, libHalide, GenGen.cpp).
HALIDE_DIR ?= ../3rdparty/halide
# Root directory where the generated AOT op files are written.
OPGEN_DIR ?= ../src/op_gen
X86_64_OPGEN_DIR = $(OPGEN_DIR)/x86_64
# Fix: AARCH64_OPGEN_DIR was referenced by the arm-64-android rule below but
# only defined in a commented-out line, so "-o" received an empty argument.
AARCH64_OPGEN_DIR = $(OPGEN_DIR)/aarch64
# X86_64_OPGEN_DIR = ./bin
# AARCH64_OPGEN_DIR = ./bin
# Emitted artifacts per generator invocation (-e flag of GenGen).
GENERATE_FILES = -e c_header,static_library,assembly,llvm_assembly,stmt_html,c_source

CXX = clang++
CC = clang

# Build the generator driver that emits the AOT op libraries.
# Fix: the wrapped command lines had no trailing backslashes, so make executed
# three separate (broken) commands; also "-Wl,-rpath DIR" does not forward the
# directory to the linker — it must be "-Wl,-rpath,DIR".
bin/op_generator: add_u8_generator.cpp add_f32_generator.cpp blur_generator.cpp mul_u8_generator.cpp mul_f32_generator.cpp fft_generator.cpp fft.cpp
	$(CXX) $(HALIDE_DIR)/share/tools/GenGen.cpp add_u8_generator.cpp add_f32_generator.cpp blur_generator.cpp mul_u8_generator.cpp mul_f32_generator.cpp fft_generator.cpp fft.cpp \
	    -I$(HALIDE_DIR)/include -L$(HALIDE_DIR)/lib -lHalide -Wl,-rpath,$(HALIDE_DIR)/lib \
	    -DHALIDE_WITH_EXCEPTIONS -fno-rtti -std=c++17 -pthread -ldl -o bin/op_generator
# Run the generator once per target architecture to emit the AOT artifacts
# (c_header, static_library, assembly, llvm_assembly, stmt_html, c_source)
# for the u8 add op. "-g halide_u8_add" selects the registered generator name.
# NOTE(review): AARCH64_OPGEN_DIR is only defined in a commented-out line
# above — confirm it is set, otherwise "-o" receives an empty directory.
# NOTE(review): the target is "halide_u8_add.a" but the library is written
# under $(X86_64_OPGEN_DIR), so this rule re-runs on every invocation.
halide_u8_add.a : bin/op_generator
    ./bin/op_generator -g halide_u8_add ${GENERATE_FILES} -o $(X86_64_OPGEN_DIR) target=host
    ./bin/op_generator -g halide_u8_add ${GENERATE_FILES} -o $(AARCH64_OPGEN_DIR) target=arm-64-android
    # ./bin/op_generator -g halide_u8_add ${GENERATE_FILES} -o $(AARCH64_OPGEN_DIR) target=arm-64-android-opencl
  1. When a computed value is wrong, the AOT-generated code cannot be stepped through with gdb, and adding print statements does not help: printf only produces output during the pipeline-construction (compile) phase, not while the generated code is running, so the runtime process cannot be debugged. This is my program:
#include "Halide.h"

namespace {

// GPU scheduling strategies selectable via the "schedule" GeneratorParam.
// Fix: the pasted original was missing the closing "};" of the enum.
enum class AddGPUSchedule {
    Inline,  // Fully inlining schedule.
    Cache,   // Schedule caching intermedia result of Add_x.
    Slide,   // Schedule enabling sliding window opt within each
    // work-item or cuda thread.
    SlideVectorize,  // The same as above plus vectorization per work-item.
};

// Maps the string value accepted by the "schedule" GeneratorParam on the
// generator command line to the corresponding AddGPUSchedule enumerator.
// Fix: the pasted original was missing the closing "};" of the initializer
// list and the "}" of the function.
std::map<std::string, AddGPUSchedule> addGPUScheduleEnumMap() {
    return {
        {"inline", AddGPUSchedule::Inline},
        {"cache", AddGPUSchedule::Cache},
        {"slide", AddGPUSchedule::Slide},
        {"slide_vector", AddGPUSchedule::SlideVectorize},
    };
}

// Generator for an AOT element-wise uint8 add op:
//   add_y(x, y) = clamp(input_a(x, y) + input_b(x, y), 0, 255)
// with CPU, GPU and Hexagon (HVX) schedules chosen at generate time.
//
// NOTE(review): the pasted original was truncated — missing access specifier,
// missing "break" statements in the switch (unintended fall-through), method
// chains with no receiver, an unmatched "#if 1", and line endings cut mid
// expression. This reconstruction restores the evident intended structure;
// confirm the receivers (add_y / add_x) against the real source file.
class HalideAdd : public Halide::Generator<HalideAdd> {
public:
    // GPU schedule variant; string value is resolved via addGPUScheduleEnumMap().
    GeneratorParam<AddGPUSchedule> schedule{
        "schedule", AddGPUSchedule::SlideVectorize, addGPUScheduleEnumMap()};
    GeneratorParam<int> tile_x{"tile_x", 32};  // X tile.
    GeneratorParam<int> tile_y{"tile_y", 8};   // Y tile.

    Input<Buffer<uint8_t>> input_a{"inputa", 2};
    Input<Buffer<uint8_t>> input_b{"inputb", 2};
    Output<Buffer<uint8_t>> add_y{"add_y", 2};

    void generate() {
        Func add_x("add_x");
        Var x("x"), y("y"), xi("xi"), yi("yi");

        // Widen to 16 bits so the sum cannot wrap, then clamp back to u8.
        add_x(x, y) = cast<uint16_t>(cast<uint16_t>(input_a(x, y)) +
                                     cast<uint16_t>(input_b(x, y)));
        add_y(x, y) = cast<uint8_t>(min(max(add_x(x, y), 0), 255));

        // How to schedule it.
        if (get_target().has_gpu_feature()) {
            // GPU schedule.
            switch (schedule) {
            case AddGPUSchedule::Inline:
                // - Fully inlining.
                add_y.gpu_tile(x, y, xi, yi, tile_x, tile_y);
                break;
            case AddGPUSchedule::Cache:
                // - Cache add_x calculation.
                add_y.gpu_tile(x, y, xi, yi, tile_x, tile_y);
                add_x.compute_at(add_y, x).gpu_threads(x, y);
                break;
            case AddGPUSchedule::Slide: {
                // - Instead of caching add_x explicitly, allow each work-item
                //   in OpenCL or thread in CUDA to calculate more rows of
                //   add_y so that temporary add_x values are re-used
                //   implicitly. This achieves a schedule similar to a
                //   sliding window.
                Var y_inner("y_inner");
                add_y.split(y, y, y_inner, tile_y)
                     .reorder(y_inner, x)
                     .gpu_tile(x, y, xi, yi, tile_x, 1);
                break;
            }
            case AddGPUSchedule::SlideVectorize: {
                // The same as Slide plus vectorization per work-item.
                int factor = sizeof(int) / sizeof(short);  // Vectorization factor.
                Var y_inner("y_inner");
                add_y.vectorize(x, factor)
                     .split(y, y, y_inner, tile_y)
                     .reorder(y_inner, x)
                     .gpu_tile(x, y, xi, yi, tile_x, 1);
                break;
            }
            }
        } else if (get_target().has_feature(Target::HVX)) {
            // Hexagon schedule.
            const int vector_size = 128;
            // presumably add_y.hexagon() headed this chain — TODO confirm
            // against the original, the paste lost the receiver line.
            add_y.hexagon()
                 .prefetch(input_a, y, 2)
                 .split(y, y, yi, 128)
                 .vectorize(x, vector_size * 2);
            add_x.store_at(add_y, y)
                 .compute_at(add_y, yi)
                 .vectorize(x, vector_size);
        } else {
            // CPU schedule.
            add_y.split(y, y, yi, 8).parallel(y).vectorize(x, 8);
            // Original line was cut at "vectorize(x," — factor 8 assumed to
            // match the add_y vectorization above; TODO confirm.
            add_x.store_at(add_y, y).compute_at(add_y, yi).vectorize(x, 8);
        }
    }
};

}  // namespace

// Registers HalideAdd under the CLI name "halide_u8_add", i.e. the name the
// Makefile passes as "./bin/op_generator -g halide_u8_add ...".
HALIDE_REGISTER_GENERATOR(HalideAdd, halide_u8_add)

  1. How to analyze Halide’s stmt output (the stmt_html artifact)
    [enter image description here][1]

  2. How to debug using the emitted assembly information
    How does halide aot debug the running process?
    5. I can find the following debugging methods for JIT; do the same methods exist under AOT?

    1. Func.trace_stores() tracks the results of the computation when the function is running
    2. Func.parallel(y) multi-threaded parallel computation in some domain direction
    3. print() prints the value of the expression of interest
    4. print_when() prints the value if the specified condition is true, and can also be used to mask the output if the condition is false
    5. output complex expressions with c++ output stream to check if the expression construction is as expected

Source: Windows Questions C++