I am trying to implement baked ambient occlusion using Optix 7.2 on my GTX 1070 with CUDA 11.2.1-2 and GCC 10.2.0. So far I have the following code to load a vtkPolydata triangular mesh onto the GPU, and setup the Optix pipeline.
This is the cpp file of my GpuAmbientOcclusion class.
GpuAmbientOcclusion::GpuAmbientOcclusion()
{
}
/**
* OptiX context log callback function. Used by the OptiX context to log output.
*
* @param level Log level
* @param tag Log tag
* @param message Log message
*/
static void optixContextLogCallback(unsigned int level, const char* tag, const char* message, void* /*cbdata */)
{
std::cerr << "[" << std::setw(2) << level << "][" << std::setw(12) << tag << "]: "
<< message << "n";
}
struct OptixMesh
{
float3* vertices;
uint3* indices;
float3* normals;
unsigned short* rayHits;
unsigned int numVertices;
unsigned int numTriangles;
};
OptixMesh optixMeshFromVtkPolydata(const vtkSmartPointer<vtkPolyData>& polydata,
const vtkSmartPointer<vtkFloatArray>& normals, AoState& aoState)
{
const auto numVertices = static_cast<unsigned int>(polydata->GetNumberOfPoints());
const auto numTriangles = static_cast<unsigned int>(polydata->GetNumberOfCells());
// Allocate memory for OptiX mesh
OptixMesh mesh;
mesh.numVertices = numVertices;
mesh.numTriangles = numTriangles;
// Access VTK buffer arrays for fast copying
// Need to copy vtk vertices to new array casted as floats
const double* vtkVertices = vtkDoubleArray::SafeDownCast(polydata->GetPoints()->GetData())->GetPointer(0);
const float* vtkNormals = normals->GetPointer(0);
auto* cpuVertices = (float3*) malloc(sizeof(float3) * numVertices);
auto* cpuNormals = (float3*) malloc(sizeof(float3) * numVertices);
for (unsigned int i = 0, j = 0; i < numVertices; ++i, j += 3)
{
cpuVertices[i] = make_float3(static_cast<float>(vtkVertices[j]), static_cast<float>(vtkVertices[j + 1]),
static_cast<float>(vtkVertices[j + 2]));
cpuNormals[i] = make_float3(vtkNormals[j], vtkNormals[j + 1], vtkNormals[j + 2]);
}
// Need to copy vtk indices to new array casted as unsigned ints
const long long* vtkIndices = vtkTypeInt64Array::SafeDownCast(
polydata->GetPolys()->GetConnectivityArray())->GetPointer(0);
auto* cpuIndices = (uint3*) malloc(sizeof(uint3) * numTriangles);
for (unsigned int i = 0, j = 0; i < numTriangles; ++i, j += 3)
{
cpuIndices[i] = make_uint3(static_cast<unsigned int>(vtkIndices[j]),
static_cast<unsigned int>(vtkIndices[j + 1]),
static_cast<unsigned int>(vtkIndices[j + 2]));
}
// Allocate and copy mesh to GPU
float3* d_vertices;
uint3* d_indices;
float3* d_normals;
unsigned short* d_rayHits;
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_vertices), sizeof(float3) * numVertices));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_indices), sizeof(uint3) * numTriangles));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_normals), sizeof(float3) * numVertices));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_rayHits), sizeof(unsigned short) * numVertices));
CUDA_CHECK(cudaMemcpyAsync(d_vertices, cpuVertices, sizeof(float3) * numVertices, cudaMemcpyHostToDevice,
aoState.stream));
CUDA_CHECK(cudaMemcpyAsync(d_indices, cpuIndices, sizeof(uint3) * numTriangles, cudaMemcpyHostToDevice,
aoState.stream));
CUDA_CHECK(cudaMemcpyAsync(d_normals, cpuNormals, sizeof(float3) * numVertices, cudaMemcpyHostToDevice,
aoState.stream));
CUDA_CHECK(cudaMemsetAsync(d_rayHits, 0, sizeof(unsigned short) * numVertices, aoState.stream));
mesh.vertices = d_vertices;
mesh.indices = d_indices;
mesh.normals = d_normals;
mesh.rayHits = d_rayHits;
free(cpuVertices);
free(cpuNormals);
free(cpuIndices);
return mesh;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// OptiX Code
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct Params
{
OptixMesh mesh;
OptixTraversableHandle gasHandle;
unsigned short aoSamples;
};
struct AoState
{
OptixDeviceContext context = nullptr;
OptixPipelineCompileOptions pipelineCompileOptions = {};
OptixPipelineLinkOptions pipelineLinkOptions = {};
OptixModule module = nullptr;
OptixPipeline pipeline = nullptr;
OptixProgramGroup raygenProgramGroup = nullptr;
OptixProgramGroup hitgroupProgramGroup = nullptr;
OptixProgramGroup missProgramGroup = nullptr;
Params params = {};
Params* d_params = nullptr;
OptixShaderBindingTable sbt = {};
CUdeviceptr d_gasOutputBuffer = 0;
CUstream stream{};
};
template<typename T>
struct SbtRecord
{
__align__(OPTIX_SBT_RECORD_ALIGNMENT)
char header[OPTIX_SBT_RECORD_HEADER_SIZE];
T data;
};
/**
* Initialize CUDA and create OptiX context
*/
OptixDeviceContext initializeOptixContext()
{
// Initialize CUDA
CUDA_CHECK(cudaFree(nullptr));
// Initialize the OptiX API, loading all API entry points
OPTIX_CHECK(optixInit());
// Specify context options
OptixDeviceContextOptions options = {};
options.logCallbackFunction = &optixContextLogCallback;
options.logCallbackLevel = 4;
// Associate a CUDA context (and therefore a specific GPU) with this
// device context
CUcontext cuContext = nullptr; // zero means take the current context
OptixDeviceContext context = nullptr;
OPTIX_CHECK(optixDeviceContextCreate(cuContext, &options, &context));
OPTIX_CHECK(optixDeviceContextSetCacheLocation(context, "./OptiXCache"));
return context;
}
void buildGeometryAccelerationStructure(AoState& aoState)
{
// Options for building the geometry acceleration structure
OptixAccelBuildOptions accelerationOptions = {};
accelerationOptions.buildFlags = OPTIX_BUILD_FLAG_NONE;
accelerationOptions.operation = OPTIX_BUILD_OPERATION_BUILD;
// Build geometry acceleration structure
OptixBuildInput buildInput = {};
const uint32_t buildInputTriangleFlags[1] = {OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT};
buildInput.triangleArray.flags = buildInputTriangleFlags;
buildInput.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
buildInput.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; // TODO check structure in OptixMesh
buildInput.triangleArray.vertexStrideInBytes = sizeof(float3);
buildInput.triangleArray.numVertices = aoState.params.mesh.numVertices;
buildInput.triangleArray.vertexBuffers = reinterpret_cast<CUdeviceptr*>(&aoState.params.mesh.vertices);
buildInput.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; // TODO check structure in OptixMesh
buildInput.triangleArray.indexStrideInBytes = sizeof(uint3);
buildInput.triangleArray.indexBuffer = reinterpret_cast<CUdeviceptr>(&aoState.params.mesh.indices);
buildInput.triangleArray.numIndexTriplets = aoState.params.mesh.numTriangles;
buildInput.triangleArray.numSbtRecords = 1;
// Compute geometry acceleration structure buffer sizes
OptixAccelBufferSizes gasBufferSizes;
OPTIX_CHECK(optixAccelComputeMemoryUsage(aoState.context, &accelerationOptions, &buildInput, 1,
&gasBufferSizes));
// Allocate memory for geometry acceleration structure
CUdeviceptr d_gasTempBuffer;
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_gasTempBuffer), gasBufferSizes.tempSizeInBytes));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&aoState.d_gasOutputBuffer), gasBufferSizes.outputSizeInBytes));
CUDA_SYNC_CHECK();
// Build the geometry acceleration structure
OPTIX_CHECK(optixAccelBuild(aoState.context, aoState.stream, &accelerationOptions, &buildInput, 1,
d_gasTempBuffer, gasBufferSizes.tempSizeInBytes, aoState.d_gasOutputBuffer,
gasBufferSizes.outputSizeInBytes, &aoState.params.gasHandle, nullptr, 0));
CUDA_SYNC_CHECK();
// Deallocate temp space
CUDA_CHECK(cudaFree(reinterpret_cast<void*>(d_gasTempBuffer)));
}
void createAoOptixModule(AoState& aoState)
{
char log[1024];
size_t sizeof_log = sizeof(log);
OptixModuleCompileOptions moduleCompileOptions = {};
moduleCompileOptions.maxRegisterCount = OPTIX_COMPILE_DEFAULT_MAX_REGISTER_COUNT;
moduleCompileOptions.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
moduleCompileOptions.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; // TODO remove debug
// Pipeline options must be consistent for all modules used in a single pipeline
aoState.pipelineCompileOptions.usesMotionBlur = false;
aoState.pipelineCompileOptions.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS;
aoState.pipelineCompileOptions.numPayloadValues = 0; // TODO check payload number
aoState.pipelineCompileOptions.numAttributeValues = 2; // TODO maybe remove
aoState.pipelineCompileOptions.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; // TODO maybe remove
aoState.pipelineCompileOptions.exceptionFlags =
OPTIX_EXCEPTION_FLAG_DEBUG | OPTIX_EXCEPTION_FLAG_TRACE_DEPTH |
OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW; // TODO debug
aoState.pipelineCompileOptions.pipelineLaunchParamsVariableName = "params";
// Read AO ptx file
std::ifstream aoPtxFile("./ao.ptx");
std::string aoPtxCode;
if (aoPtxFile.good())
{
std::stringstream fileBuffer;
fileBuffer << aoPtxFile.rdbuf();
aoPtxCode = fileBuffer.str();
}
else
{
std::cerr << "Fatal: Could not read ao.ptx. This is required for computing GPU AO.n";
}
aoPtxFile.close();
OPTIX_CHECK(optixModuleCreateFromPTX(aoState.context, &moduleCompileOptions, &aoState.pipelineCompileOptions,
aoPtxCode.c_str(), aoPtxCode.size(), log, &sizeof_log, &aoState.module));
}
void createAoProgramGroups(AoState& aoState)
{
char log[2048];
size_t sizeof_log = sizeof(log);
OptixProgramGroupOptions programGroupOptions = {};
// Create raygen program group
OptixProgramGroupDesc raygenProgramGroupDesc = {};
raygenProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
raygenProgramGroupDesc.raygen.module = aoState.module;
raygenProgramGroupDesc.raygen.entryFunctionName = "__raygen__ao";
OPTIX_CHECK(optixProgramGroupCreate(aoState.context, &raygenProgramGroupDesc, 1, &programGroupOptions, log,
&sizeof_log, &aoState.raygenProgramGroup));
// Create miss program group
OptixProgramGroupDesc missProgramGroupDesc = {};
missProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
missProgramGroupDesc.miss.module = aoState.module;
missProgramGroupDesc.miss.entryFunctionName = "__miss__ao";
OPTIX_CHECK(optixProgramGroupCreate(aoState.context, &missProgramGroupDesc, 1, &programGroupOptions, log,
&sizeof_log, &aoState.missProgramGroup));
// Create hit-group (includes closest-hit) program group
OptixProgramGroupDesc hitgroupProgramGroupDesc = {};
hitgroupProgramGroupDesc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
hitgroupProgramGroupDesc.hitgroup.moduleCH = aoState.module;
hitgroupProgramGroupDesc.hitgroup.entryFunctionNameCH = "__closesthit__ao";
OPTIX_CHECK(optixProgramGroupCreate(aoState.context, &hitgroupProgramGroupDesc, 1, &programGroupOptions, log,
&sizeof_log, &aoState.hitgroupProgramGroup));
}
void createAoPipeline(AoState& aoState)
{
char log[2048];
size_t sizeof_log = sizeof(log);
// Complete program group for pipeline
OptixProgramGroup pipelineProgramGroups[] = {aoState.raygenProgramGroup, aoState.missProgramGroup,
aoState.hitgroupProgramGroup};
unsigned int totalProgramGroups = sizeof(pipelineProgramGroups) / sizeof(pipelineProgramGroups[0]);
// Create OptiX AO pipeline
aoState.pipelineLinkOptions.maxTraceDepth = 1;
aoState.pipelineLinkOptions.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
OPTIX_CHECK(optixPipelineCreate(aoState.context, &aoState.pipelineCompileOptions,
&aoState.pipelineLinkOptions, pipelineProgramGroups, totalProgramGroups,
log, &sizeof_log, &aoState.pipeline));
}
void createAoSbt(AoState& aoState)
{
////////////////////////////////////////////////
// Raygen SBT Record
////////////////////////////////////////////////
struct RayGenData
{ /* No data */ };
using RayGenSbtRecord = SbtRecord<RayGenData>;
const size_t raygenSbtRecordSize = sizeof(RayGenSbtRecord);
// Allocate raygen SBT record
CUdeviceptr d_raygenSbtRecord;
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_raygenSbtRecord), raygenSbtRecordSize));
RayGenSbtRecord raygenSbtRecord;
OPTIX_CHECK(optixSbtRecordPackHeader(aoState.raygenProgramGroup, &raygenSbtRecord));
CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast<void*>(d_raygenSbtRecord), &raygenSbtRecord, raygenSbtRecordSize,
cudaMemcpyHostToDevice, aoState.stream));
////////////////////////////////////////////////
// Miss SBT Record
////////////////////////////////////////////////
struct MissGroupData
{ /* No data */ };
using MissSbtRecord = SbtRecord<MissGroupData>;
const size_t missSbtRecordSize = sizeof(MissSbtRecord);
CUdeviceptr d_missSbtRecord;
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_missSbtRecord), missSbtRecordSize));
MissSbtRecord missSbtRecord;
OPTIX_CHECK(optixSbtRecordPackHeader(aoState.missProgramGroup, &missSbtRecord));
CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast<void*>(d_missSbtRecord), &missSbtRecord, missSbtRecordSize,
cudaMemcpyHostToDevice, aoState.stream));
////////////////////////////////////////////////
// Hitgroup SBT Record
////////////////////////////////////////////////
struct HitGroupData
{ /* No data */ };
using HitGroupSbtRecord = SbtRecord<HitGroupData>;
const size_t hitgroupSbtRecordSize = sizeof(HitGroupSbtRecord);
CUdeviceptr d_hitgroupSbtRecord;
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_hitgroupSbtRecord), hitgroupSbtRecordSize));
HitGroupSbtRecord hitgroupSbtRecord;
OPTIX_CHECK(optixSbtRecordPackHeader(aoState.hitgroupProgramGroup, &hitgroupSbtRecord));
CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast<void*>(d_hitgroupSbtRecord), &hitgroupSbtRecord,
hitgroupSbtRecordSize, cudaMemcpyHostToDevice, aoState.stream));
// Create Shader Binding Table
OptixShaderBindingTable sbt = {};
sbt.raygenRecord = d_raygenSbtRecord;
sbt.missRecordBase = d_missSbtRecord;
sbt.missRecordStrideInBytes = missSbtRecordSize;
sbt.missRecordCount = 1;
sbt.hitgroupRecordBase = d_hitgroupSbtRecord;
sbt.hitgroupRecordStrideInBytes = hitgroupSbtRecordSize;
sbt.hitgroupRecordCount = 1;
aoState.sbt = sbt;
}
void launchAoPipeline(AoState& aoState)
{
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&aoState.d_params), sizeof(Params)));
CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast<void*>(aoState.d_params), &aoState.params, sizeof(Params),
cudaMemcpyHostToDevice, aoState.stream));
// Launch pipeline
OPTIX_CHECK(optixLaunch(aoState.pipeline, aoState.stream, reinterpret_cast<CUdeviceptr>(aoState.d_params),
sizeof(Params), &aoState.sbt, aoState.params.mesh.numVertices, 1, 1));
CUDA_SYNC_CHECK();
}
void GpuAmbientOcclusion::compute(vtkSmartPointer<vtkPolydata> input, unsigned short aoSamples)
{
auto normals = computeNormals();
AoState aoState = {};
aoState.params.aoSamples = aoSamples;
// Create GPU mesh
CUDA_CHECK(cudaStreamCreate(&aoState.stream));
aoState.params.mesh = optixMeshFromVtkPolydata(input, normals, aoState);
std::cout << "COPIED MESHn";
// Initialize OptiX context
aoState.context = initializeOptixContext();
std::cout << "INITIALIZED OPTIX CONTEXTn";
// Create geometry acceleration structure
buildGeometryAccelerationStructure(aoState);
std::cout << "BUILT ACCELERATION STRUCTUREn";
// Create AO module
createAoOptixModule(aoState);
std::cout << "BUILT AO MODULEn";
// Create AO program groups
createAoProgramGroups(aoState);
std::cout << "BUILT AO PROGRAM GROUPSn";
// Create AO pipeline
createAoPipeline(aoState);
std::cout << "BUILT AO PIPELINEn";
// Create Shader Binding Table
createAoSbt(aoState);
std::cout << "BUILT AO SBTn";
// Launch AO pipeline
launchAoPipeline(aoState);
std::cout << "LAUNCHED AO PIPELINEn";
// Deallocate state
CUDA_CHECK(cudaFree(reinterpret_cast<void*>(aoState.sbt.raygenRecord)));
CUDA_CHECK(cudaFree(reinterpret_cast<void*>(aoState.sbt.missRecordBase)));
CUDA_CHECK(cudaFree(reinterpret_cast<void*>(aoState.sbt.hitgroupRecordBase)));
CUDA_CHECK(cudaFree(reinterpret_cast<void*>(aoState.d_gasOutputBuffer)));
CUDA_CHECK(cudaStreamDestroy(aoState.stream));
OPTIX_CHECK(optixPipelineDestroy(aoState.pipeline));
OPTIX_CHECK(optixProgramGroupDestroy(aoState.hitgroupProgramGroup));
OPTIX_CHECK(optixProgramGroupDestroy(aoState.missProgramGroup));
OPTIX_CHECK(optixProgramGroupDestroy(aoState.raygenProgramGroup));
OPTIX_CHECK(optixModuleDestroy(aoState.module));
OPTIX_CHECK(optixDeviceContextDestroy(aoState.context));
CUDA_CHECK(cudaFree(aoState.params.mesh.vertices));
CUDA_CHECK(cudaFree(aoState.params.mesh.indices));
CUDA_CHECK(cudaFree(aoState.params.mesh.normals));
CUDA_CHECK(cudaFree(aoState.params.mesh.rayHits));
}
The problem that I am facing is that when I try to buildGeometryAccelerationStructure, I run into Cuda/Optix memory issues at these steps.
// Build the geometry acceleration structure
OPTIX_CHECK(optixAccelBuild(aoState.context, aoState.stream, &accelerationOptions, &buildInput, 1,
d_gasTempBuffer, gasBufferSizes.tempSizeInBytes, aoState.d_gasOutputBuffer,
gasBufferSizes.outputSizeInBytes, &aoState.params.gasHandle, nullptr, 0));
CUDA_SYNC_CHECK();
// Deallocate temp space
CUDA_CHECK(cudaFree(reinterpret_cast<void*>(d_gasTempBuffer)));
If I don’t comment out the CUDA_SYNC_CHECK and cudaFree steps, I am getting the error CUDA error on synchronize with error 'an illegal memory access was encountered'
on the CUDA_SYNC_CHECK step. If I comment out the CUDA_SYNC_CHECK and cudaFree steps, I get the error OPTIX_ERROR_PIPELINE_LINK_ERROR: Optix call 'optixPipelineCreate(aoState.context, &aoState.pipelineCompileOptions, &aoState.pipelineLinkOptions, pipelineProgramGroups, totalProgramGroups, log, &sizeof_log, &aoState.pipeline)
If I also comment out the optixAccelBuild function, the pipeline runs but I can’t really do anything without it since I can not optixTrace anything without the BVH data structure that the optixAccelBuild is creating.
Could you explain to me why do I have this illegal memory accessed
issue? Any help would be really appreciated. Thanks in advance!
Source: Windows Questions C++