Illegal memory access when deallocating OptiX geometry acceleration structure

  ambient, c++, cuda, graphics, optix

I am trying to implement baked ambient occlusion using Optix 7.2 on my GTX 1070 with CUDA 11.2.1-2 and GCC 10.2.0. So far I have the following code to load a vtkPolydata triangular mesh onto the GPU, and setup the Optix pipeline.

This is the cpp file of my GpuAmbientOcclusion class.

    GpuAmbientOcclusion::GpuAmbientOcclusion()
    {

    }
 
    /**
     * OptiX context log callback function. Used by the OptiX context to log output.
     *
     * @param level Log level
     * @param tag Log tag
     * @param message Log message
     */
    static void optixContextLogCallback(unsigned int level, const char* tag, const char* message, void* /*cbdata */)
    {
        std::cerr << "[" << std::setw(2) << level << "][" << std::setw(12) << tag << "]: "
                  << message << "n";
    }

    struct OptixMesh
    {
        float3* vertices;
        uint3* indices;
        float3* normals;

        unsigned short* rayHits;

        unsigned int numVertices;
        unsigned int numTriangles;
    };

    OptixMesh optixMeshFromVtkPolydata(const vtkSmartPointer<vtkPolyData>& polydata,
                                       const vtkSmartPointer<vtkFloatArray>& normals, AoState& aoState)
    {
        const auto numVertices = static_cast<unsigned int>(polydata->GetNumberOfPoints());
        const auto numTriangles = static_cast<unsigned int>(polydata->GetNumberOfCells());

        // Allocate memory for OptiX mesh
        OptixMesh mesh;

        mesh.numVertices = numVertices;
        mesh.numTriangles = numTriangles;

        // Access VTK buffer arrays for fast copying
        // Need to copy vtk vertices to new array casted as floats
        const double* vtkVertices = vtkDoubleArray::SafeDownCast(polydata->GetPoints()->GetData())->GetPointer(0);
        const float* vtkNormals = normals->GetPointer(0);

        auto* cpuVertices = (float3*) malloc(sizeof(float3) * numVertices);
        auto* cpuNormals = (float3*) malloc(sizeof(float3) * numVertices);
        for (unsigned int i = 0, j = 0; i < numVertices; ++i, j += 3)
        {
            cpuVertices[i] = make_float3(static_cast<float>(vtkVertices[j]), static_cast<float>(vtkVertices[j + 1]),
                                         static_cast<float>(vtkVertices[j + 2]));
            cpuNormals[i] = make_float3(vtkNormals[j], vtkNormals[j + 1], vtkNormals[j + 2]);
        }

        // Need to copy vtk indices to new array casted as unsigned ints
        const long long* vtkIndices = vtkTypeInt64Array::SafeDownCast(
                polydata->GetPolys()->GetConnectivityArray())->GetPointer(0);

        auto* cpuIndices = (uint3*) malloc(sizeof(uint3) * numTriangles);
        for (unsigned int i = 0, j = 0; i < numTriangles; ++i, j += 3)
        {
            cpuIndices[i] = make_uint3(static_cast<unsigned int>(vtkIndices[j]),
                                       static_cast<unsigned int>(vtkIndices[j + 1]),
                                       static_cast<unsigned int>(vtkIndices[j + 2]));
        }

        // Allocate and copy mesh to GPU
        float3* d_vertices;
        uint3* d_indices;
        float3* d_normals;
        unsigned short* d_rayHits;

        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_vertices), sizeof(float3) * numVertices));
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_indices), sizeof(uint3) * numTriangles));
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_normals), sizeof(float3) * numVertices));
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_rayHits), sizeof(unsigned short) * numVertices));

        CUDA_CHECK(cudaMemcpyAsync(d_vertices, cpuVertices, sizeof(float3) * numVertices, cudaMemcpyHostToDevice,
                                   aoState.stream));
        CUDA_CHECK(cudaMemcpyAsync(d_indices, cpuIndices, sizeof(uint3) * numTriangles, cudaMemcpyHostToDevice,
                                   aoState.stream));
        CUDA_CHECK(cudaMemcpyAsync(d_normals, cpuNormals, sizeof(float3) * numVertices, cudaMemcpyHostToDevice,
                                   aoState.stream));
        CUDA_CHECK(cudaMemsetAsync(d_rayHits, 0, sizeof(unsigned short) * numVertices, aoState.stream));

        mesh.vertices = d_vertices;
        mesh.indices = d_indices;
        mesh.normals = d_normals;
        mesh.rayHits = d_rayHits;

        free(cpuVertices);
        free(cpuNormals);
        free(cpuIndices);

        return mesh;
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // OptiX Code
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////

    struct Params
    {
        OptixMesh mesh;
        OptixTraversableHandle gasHandle;
        unsigned short aoSamples;
    };

    struct AoState
    {
        OptixDeviceContext context = nullptr;

        OptixPipelineCompileOptions pipelineCompileOptions = {};
        OptixPipelineLinkOptions pipelineLinkOptions = {};

        OptixModule module = nullptr;
        OptixPipeline pipeline = nullptr;

        OptixProgramGroup raygenProgramGroup = nullptr;
        OptixProgramGroup hitgroupProgramGroup = nullptr;
        OptixProgramGroup missProgramGroup = nullptr;

        Params params = {};
        Params* d_params = nullptr;
        OptixShaderBindingTable sbt = {};

        CUdeviceptr d_gasOutputBuffer = 0;

        CUstream stream{};
    };

    template<typename T>
    struct SbtRecord
    {
        __align__(OPTIX_SBT_RECORD_ALIGNMENT)
        char header[OPTIX_SBT_RECORD_HEADER_SIZE];

        T data;
    };

    /**
     * Initialize CUDA and create OptiX context
     */
    OptixDeviceContext initializeOptixContext()
    {
        // Initialize CUDA
        CUDA_CHECK(cudaFree(nullptr));

        // Initialize the OptiX API, loading all API entry points
        OPTIX_CHECK(optixInit());

        // Specify context options
        OptixDeviceContextOptions options = {};
        options.logCallbackFunction = &optixContextLogCallback;
        options.logCallbackLevel = 4;

        // Associate a CUDA context (and therefore a specific GPU) with this
        // device context
        CUcontext cuContext = nullptr;  // zero means take the current context
        OptixDeviceContext context = nullptr;
        OPTIX_CHECK(optixDeviceContextCreate(cuContext, &options, &context));

        OPTIX_CHECK(optixDeviceContextSetCacheLocation(context, "./OptiXCache"));

        return context;
    }


    void buildGeometryAccelerationStructure(AoState& aoState)
    {
        // Options for building the geometry acceleration structure
        OptixAccelBuildOptions accelerationOptions = {};
        accelerationOptions.buildFlags = OPTIX_BUILD_FLAG_NONE;
        accelerationOptions.operation = OPTIX_BUILD_OPERATION_BUILD;

        // Build geometry acceleration structure
        OptixBuildInput buildInput = {};
        const uint32_t buildInputTriangleFlags[1] = {OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT};
        buildInput.triangleArray.flags = buildInputTriangleFlags;
        buildInput.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;

        buildInput.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; // TODO check structure in OptixMesh
        buildInput.triangleArray.vertexStrideInBytes = sizeof(float3);
        buildInput.triangleArray.numVertices = aoState.params.mesh.numVertices;
        buildInput.triangleArray.vertexBuffers = reinterpret_cast<CUdeviceptr*>(&aoState.params.mesh.vertices);

        buildInput.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; // TODO check structure in OptixMesh
        buildInput.triangleArray.indexStrideInBytes = sizeof(uint3);
        buildInput.triangleArray.indexBuffer = reinterpret_cast<CUdeviceptr>(&aoState.params.mesh.indices);
        buildInput.triangleArray.numIndexTriplets = aoState.params.mesh.numTriangles;

        buildInput.triangleArray.numSbtRecords = 1;

        // Compute geometry acceleration structure buffer sizes
        OptixAccelBufferSizes gasBufferSizes;
        OPTIX_CHECK(optixAccelComputeMemoryUsage(aoState.context, &accelerationOptions, &buildInput, 1,
                                                 &gasBufferSizes));

        // Allocate memory for geometry acceleration structure
        CUdeviceptr d_gasTempBuffer;
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_gasTempBuffer), gasBufferSizes.tempSizeInBytes));
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&aoState.d_gasOutputBuffer), gasBufferSizes.outputSizeInBytes));

        CUDA_SYNC_CHECK();

        // Build the geometry acceleration structure
        OPTIX_CHECK(optixAccelBuild(aoState.context, aoState.stream, &accelerationOptions, &buildInput, 1,
                                    d_gasTempBuffer, gasBufferSizes.tempSizeInBytes, aoState.d_gasOutputBuffer,
                                    gasBufferSizes.outputSizeInBytes, &aoState.params.gasHandle, nullptr, 0));
        CUDA_SYNC_CHECK();

        // Deallocate temp space
        CUDA_CHECK(cudaFree(reinterpret_cast<void*>(d_gasTempBuffer)));
    }

    void createAoOptixModule(AoState& aoState)
    {
        char log[1024];
        size_t sizeof_log = sizeof(log);

        OptixModuleCompileOptions moduleCompileOptions = {};
        moduleCompileOptions.maxRegisterCount = OPTIX_COMPILE_DEFAULT_MAX_REGISTER_COUNT;
        moduleCompileOptions.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
        moduleCompileOptions.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; // TODO remove debug

        // Pipeline options must be consistent for all modules used in a single pipeline
        aoState.pipelineCompileOptions.usesMotionBlur = false;
        aoState.pipelineCompileOptions.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS;
        aoState.pipelineCompileOptions.numPayloadValues = 0; // TODO check payload number
        aoState.pipelineCompileOptions.numAttributeValues = 2; // TODO maybe remove
        aoState.pipelineCompileOptions.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; // TODO maybe remove
        aoState.pipelineCompileOptions.exceptionFlags =
                OPTIX_EXCEPTION_FLAG_DEBUG | OPTIX_EXCEPTION_FLAG_TRACE_DEPTH |
                OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW; // TODO debug
        aoState.pipelineCompileOptions.pipelineLaunchParamsVariableName = "params";

        // Read AO ptx file
        std::ifstream aoPtxFile("./ao.ptx");
        std::string aoPtxCode;
        if (aoPtxFile.good())
        {
            std::stringstream fileBuffer;
            fileBuffer << aoPtxFile.rdbuf();
            aoPtxCode = fileBuffer.str();
        }
        else
        {
            std::cerr << "Fatal: Could not read ao.ptx. This is required for computing GPU AO.n";
        }
        aoPtxFile.close();

        OPTIX_CHECK(optixModuleCreateFromPTX(aoState.context, &moduleCompileOptions, &aoState.pipelineCompileOptions,
                                             aoPtxCode.c_str(), aoPtxCode.size(), log, &sizeof_log, &aoState.module));
    }

    void createAoProgramGroups(AoState& aoState)
    {
        char log[2048];
        size_t sizeof_log = sizeof(log);

        OptixProgramGroupOptions programGroupOptions = {};

        // Create raygen program group
        OptixProgramGroupDesc raygenProgramGroupDesc = {};
        raygenProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
        raygenProgramGroupDesc.raygen.module = aoState.module;
        raygenProgramGroupDesc.raygen.entryFunctionName = "__raygen__ao";

        OPTIX_CHECK(optixProgramGroupCreate(aoState.context, &raygenProgramGroupDesc, 1, &programGroupOptions, log,
                                            &sizeof_log, &aoState.raygenProgramGroup));

        // Create miss program group
        OptixProgramGroupDesc missProgramGroupDesc = {};
        missProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
        missProgramGroupDesc.miss.module = aoState.module;
        missProgramGroupDesc.miss.entryFunctionName = "__miss__ao";

        OPTIX_CHECK(optixProgramGroupCreate(aoState.context, &missProgramGroupDesc, 1, &programGroupOptions, log,
                                            &sizeof_log, &aoState.missProgramGroup));

        // Create hit-group (includes closest-hit) program group
        OptixProgramGroupDesc hitgroupProgramGroupDesc = {};
        hitgroupProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
        hitgroupProgramGroupDesc.hitgroup.moduleCH = aoState.module;
        hitgroupProgramGroupDesc.hitgroup.entryFunctionNameCH = "__closesthit__ao";

        OPTIX_CHECK(optixProgramGroupCreate(aoState.context, &hitgroupProgramGroupDesc, 1, &programGroupOptions, log,
                                            &sizeof_log, &aoState.hitgroupProgramGroup));
    }

    void createAoPipeline(AoState& aoState)
    {
        char log[2048];
        size_t sizeof_log = sizeof(log);

        // Complete program group for pipeline
        OptixProgramGroup pipelineProgramGroups[] = {aoState.raygenProgramGroup, aoState.missProgramGroup,
                                                     aoState.hitgroupProgramGroup};
        unsigned int totalProgramGroups = sizeof(pipelineProgramGroups) / sizeof(pipelineProgramGroups[0]);

        // Create OptiX AO pipeline
        aoState.pipelineLinkOptions.maxTraceDepth = 1;
        aoState.pipelineLinkOptions.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;

        OPTIX_CHECK(optixPipelineCreate(aoState.context, &aoState.pipelineCompileOptions,
                                        &aoState.pipelineLinkOptions, pipelineProgramGroups, totalProgramGroups,
                                        log, &sizeof_log, &aoState.pipeline));
    }

    void createAoSbt(AoState& aoState)
    {
        ////////////////////////////////////////////////
        // Raygen SBT Record
        ////////////////////////////////////////////////
        struct RayGenData
        { /* No data */ };

        using RayGenSbtRecord = SbtRecord<RayGenData>;
        const size_t raygenSbtRecordSize = sizeof(RayGenSbtRecord);

        // Allocate raygen SBT record
        CUdeviceptr d_raygenSbtRecord;
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_raygenSbtRecord), raygenSbtRecordSize));

        RayGenSbtRecord raygenSbtRecord;
        OPTIX_CHECK(optixSbtRecordPackHeader(aoState.raygenProgramGroup, &raygenSbtRecord));
        CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast<void*>(d_raygenSbtRecord), &raygenSbtRecord, raygenSbtRecordSize,
                                   cudaMemcpyHostToDevice, aoState.stream));

        ////////////////////////////////////////////////
        // Miss SBT Record
        ////////////////////////////////////////////////
        struct MissGroupData
        { /* No data */ };

        using MissSbtRecord = SbtRecord<MissGroupData>;
        const size_t missSbtRecordSize = sizeof(MissSbtRecord);

        CUdeviceptr d_missSbtRecord;
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_missSbtRecord), missSbtRecordSize));

        MissSbtRecord missSbtRecord;
        OPTIX_CHECK(optixSbtRecordPackHeader(aoState.missProgramGroup, &missSbtRecord));
        CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast<void*>(d_missSbtRecord), &missSbtRecord, missSbtRecordSize,
                                   cudaMemcpyHostToDevice, aoState.stream));

        ////////////////////////////////////////////////
        // Hitgroup SBT Record
        ////////////////////////////////////////////////
        struct HitGroupData
        { /* No data */ };

        using HitGroupSbtRecord = SbtRecord<HitGroupData>;
        const size_t hitgroupSbtRecordSize = sizeof(HitGroupSbtRecord);

        CUdeviceptr d_hitgroupSbtRecord;
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_hitgroupSbtRecord), hitgroupSbtRecordSize));

        HitGroupSbtRecord hitgroupSbtRecord;
        OPTIX_CHECK(optixSbtRecordPackHeader(aoState.hitgroupProgramGroup, &hitgroupSbtRecord));
        CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast<void*>(d_hitgroupSbtRecord), &hitgroupSbtRecord,
                                   hitgroupSbtRecordSize, cudaMemcpyHostToDevice, aoState.stream));

        // Create Shader Binding Table
        OptixShaderBindingTable sbt = {};
        sbt.raygenRecord = d_raygenSbtRecord;
        sbt.missRecordBase = d_missSbtRecord;
        sbt.missRecordStrideInBytes = missSbtRecordSize;
        sbt.missRecordCount = 1;
        sbt.hitgroupRecordBase = d_hitgroupSbtRecord;
        sbt.hitgroupRecordStrideInBytes = hitgroupSbtRecordSize;
        sbt.hitgroupRecordCount = 1;
        aoState.sbt = sbt;
    }

    void launchAoPipeline(AoState& aoState)
    {
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&aoState.d_params), sizeof(Params)));
        CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast<void*>(aoState.d_params), &aoState.params, sizeof(Params),
                                   cudaMemcpyHostToDevice, aoState.stream));

        // Launch pipeline
        OPTIX_CHECK(optixLaunch(aoState.pipeline, aoState.stream, reinterpret_cast<CUdeviceptr>(aoState.d_params),
                                sizeof(Params), &aoState.sbt, aoState.params.mesh.numVertices, 1, 1));
        CUDA_SYNC_CHECK();
    }

    void GpuAmbientOcclusion::compute(vtkSmartPointer<vtkPolydata> input, unsigned short aoSamples)
    {
        auto normals = computeNormals();

        AoState aoState = {};
        aoState.params.aoSamples = aoSamples;

        // Create GPU mesh
        CUDA_CHECK(cudaStreamCreate(&aoState.stream));
        aoState.params.mesh = optixMeshFromVtkPolydata(input, normals, aoState);
        std::cout << "COPIED MESHn";

        // Initialize OptiX context
        aoState.context = initializeOptixContext();
        std::cout << "INITIALIZED OPTIX CONTEXTn";

        // Create geometry acceleration structure
        buildGeometryAccelerationStructure(aoState);
        std::cout << "BUILT ACCELERATION STRUCTUREn";

        // Create AO module
        createAoOptixModule(aoState);
        std::cout << "BUILT AO MODULEn";

        // Create AO program groups
        createAoProgramGroups(aoState);
        std::cout << "BUILT AO PROGRAM GROUPSn";

        // Create AO pipeline
        createAoPipeline(aoState);
        std::cout << "BUILT AO PIPELINEn";

        // Create Shader Binding Table
        createAoSbt(aoState);
        std::cout << "BUILT AO SBTn";

        // Launch AO pipeline
        launchAoPipeline(aoState);
        std::cout << "LAUNCHED AO PIPELINEn";

        // Deallocate state
        CUDA_CHECK(cudaFree(reinterpret_cast<void*>(aoState.sbt.raygenRecord)));
        CUDA_CHECK(cudaFree(reinterpret_cast<void*>(aoState.sbt.missRecordBase)));
        CUDA_CHECK(cudaFree(reinterpret_cast<void*>(aoState.sbt.hitgroupRecordBase)));
        CUDA_CHECK(cudaFree(reinterpret_cast<void*>(aoState.d_gasOutputBuffer)));
        CUDA_CHECK(cudaStreamDestroy(aoState.stream));

        OPTIX_CHECK(optixPipelineDestroy(aoState.pipeline));
        OPTIX_CHECK(optixProgramGroupDestroy(aoState.hitgroupProgramGroup));
        OPTIX_CHECK(optixProgramGroupDestroy(aoState.missProgramGroup));
        OPTIX_CHECK(optixProgramGroupDestroy(aoState.raygenProgramGroup));
        OPTIX_CHECK(optixModuleDestroy(aoState.module));
        OPTIX_CHECK(optixDeviceContextDestroy(aoState.context));

        CUDA_CHECK(cudaFree(aoState.params.mesh.vertices));
        CUDA_CHECK(cudaFree(aoState.params.mesh.indices));
        CUDA_CHECK(cudaFree(aoState.params.mesh.normals));
        CUDA_CHECK(cudaFree(aoState.params.mesh.rayHits));
    }

The problem that I am facing is that when I try to buildGeometryAccelerationStructure, I run into Cuda/Optix memory issues at these steps.

        // Build the geometry acceleration structure
        OPTIX_CHECK(optixAccelBuild(aoState.context, aoState.stream, &accelerationOptions, &buildInput, 1,
                                    d_gasTempBuffer, gasBufferSizes.tempSizeInBytes, aoState.d_gasOutputBuffer,
                                    gasBufferSizes.outputSizeInBytes, &aoState.params.gasHandle, nullptr, 0));
        CUDA_SYNC_CHECK();

        // Deallocate temp space
        CUDA_CHECK(cudaFree(reinterpret_cast<void*>(d_gasTempBuffer)));

If I don’t comment out the CUDA_SYNC_CHECK and cudaFree steps, I am getting the error CUDA error on synchronize with error 'an illegal memory access was encountered' on the CUDA_SYNC_CHECK step. If I comment out the CUDA_SYNC_CHECK and cudaFree steps, I get the error OPTIX_ERROR_PIPELINE_LINK_ERROR: Optix call 'optixPipelineCreate(aoState.context, &aoState.pipelineCompileOptions, &aoState.pipelineLinkOptions, pipelineProgramGroups, totalProgramGroups, log, &sizeof_log, &aoState.pipeline) If I also comment out the optixAccelBuild function, the pipeline runs but I can’t really do anything without it since I can not optixTrace anything without the BVH data structure that the optixAccelBuild is creating.

Could you explain to me why do I have this illegal memory accessed issue? Any help would be really appreciated. Thanks in advance!

Source: Windows Questions C++

LEAVE A COMMENT