clGetProgramBuildInfo does not return log info under Ubuntu (Quadro P4000)

  build, c++, opencl, ubuntu

The test code below runs fine under Windows on an NVIDIA GeForce GTX 960M, but under Ubuntu with a Quadro P4000 video card it reports a build error. I tried to get details about the error, but the build log comes back empty. I get the same error whether I use the external kernel file saxpy.cl or the inline kernel string.

I can confirm that under Windows the build log works as expected. So what might be the root of the problem in the Ubuntu environment? A bad CUDA installation, maybe?

clinfo under ubuntu can indeed find the NVidia Quadro card as you can see at the end of this question.

The error I get is

[kernel loaded. Read 208 bytes]
[Number of platforms = 1]
[Device ID selected = 0]
[Using GPU device = Quadro P4000]


************* CL_BUILD_PROGRAM_FAILURE ********************
[Error Description]


*************************************************************
ERROR: clCreateKernel ( -45 )
ERROR: clSetKernelArg 0 ( -48 )
ERROR: clSetKernelArg 1 ( -48 )
ERROR: clSetKernelArg 2 ( -48 )
ERROR: clSetKernelArg 3 ( -48 )
ERROR: clEnqueueNDRangeKernel ( -48 )
0.000000 + 16.000000 = 0.000000
1.000000 + 15.000000 = 0.000000
2.000000 + 14.000000 = 0.000000
3.000000 + 13.000000 = 0.000000
4.000000 + 12.000000 = 0.000000
5.000000 + 11.000000 = 0.000000
6.000000 + 10.000000 = 0.000000
7.000000 + 9.000000 = 0.000000
8.000000 + 8.000000 = 0.000000
9.000000 + 7.000000 = 0.000000
10.000000 + 6.000000 = 0.000000
11.000000 + 5.000000 = 0.000000
12.000000 + 4.000000 = 0.000000
13.000000 + 3.000000 = 0.000000
14.000000 + 2.000000 = 0.000000
15.000000 + 1.000000 = 0.000000

The full source of the test program is below:

#define CL_TARGET_OPENCL_VERSION 120
//#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

#define MAX_SOURCE_SIZE (0x100000)
#define _CRT_SECURE_NO_WARNINGS

#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define VECTOR_SIZE 16

// Inline copy of the kernel in saxpy.cl, run once per work-item; it can be
// handed to clCreateProgramWithSource instead of the file contents.
// Every source line must end in "\n": the pieces are concatenated into one
// string, and without real newlines the `//` comment inside the kernel would
// swallow all the code that follows it.
const char *saxpy_kernel =
"__kernel                                   \n"
"void saxpy_kernel(float alpha,     \n"
"                  __global float *A,       \n"
"                  __global float *B,       \n"
"                  __global float *C)       \n"
"{                                          \n"
"    //Get the index of the work-item       \n"
"    int index = get_global_id(0);          \n"
"    C[index] = A[index] + B[index]; \n"
"}                                          \n";

/**
 * Report a failed OpenCL call on stderr.
 *
 * @param err   Status code returned by an OpenCL API call.
 * @param name  Label printed in the message (typically the API call name).
 * @return true when err is CL_SUCCESS; false otherwise, after printing
 *         "ERROR: <name> ( <err> )" followed by a newline.
 */
inline bool checkError(cl_int err, const char * name)
{
    if (err == CL_SUCCESS) {
        return true;
    }
    /* The message must end in a real newline so consecutive errors do not
       run together on one line. */
    fprintf(stderr, "ERROR: %s ( %d )\n", name, (int)err);
    return false;
}

int main(void) {


    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("saxpy.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
    fprintf(stderr, "[kernel loaded. Read %d bytes]n", (unsigned int)source_size);

    int i;
    // Allocate space for vectors A, B and C
    float alpha = 1.0;
    float *A = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    float *B = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    float *C = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    for (i = 0; i < VECTOR_SIZE; i++)
    {
        A[i] = i;
        B[i] = VECTOR_SIZE - i;
        C[i] = 0;
    }
    cl_int clStatus;
    // Get platform and device information
    //cl_platform_id * platforms = NULL;
    //cl_uint     num_platforms;
    //Set up the Platform
    //cl_int clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
    //platforms = (cl_platform_id *)
    //  malloc(sizeof(cl_platform_id)*num_platforms);
    //clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
    //checkError(clStatus, "clGetPlatformIDs");

    ////Get the devices list and choose the device you want to run on
    //cl_device_id     *device_list = NULL;
    //cl_uint           num_devices;

    //clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    //device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
    //clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
    //checkError(clStatus, "clGetDeviceIDs");


    bool deviceFound = true;
    cl_uint                 num_platforms;           ///< Number of platforms found in host
    cl_platform_id*         platforms;            ///< Information of platforms found
    cl_device_id*           device_list;              ///< Information of Devices found
    cl_uint                 num_devices;             ///< Number of devices found that support OpenCL
    size_t                  paramValueSize;         ///< Info Device
    char *info = NULL;

    int deviceId = 0;
    // Select an OpenCL platform to run on.  
    clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
    fprintf(stderr, "[Number of platforms = %d]n", (unsigned int)num_platforms);
    deviceFound = checkError((clStatus != CL_SUCCESS) ? clStatus : (num_platforms <= 0 ? -1 : CL_SUCCESS), "No Platforms Found");

    platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);

    clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
    checkError((clStatus != CL_SUCCESS) ? clStatus : (num_platforms <= 0 ? -1 : CL_SUCCESS), "No Platforms Found");
    device_list = NULL;
    if (deviceFound)
    {
        // Iterate through the list of platforms until we find one that supports
        // a GPU device, otherwise fail with an error.


        cl_uint i;
        for (i = 0; i < num_platforms; i++)
        {
            //Find only GPU Devices
            clStatus = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
            if (clStatus != CL_SUCCESS && clStatus != CL_DEVICE_NOT_FOUND)
            {
                deviceFound = checkError(clStatus, "No devices Found");

            }

            //Construct a vector with devices found
            else if (num_devices > 0)
            {
                device_list = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
                clStatus = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, num_devices, &device_list[0], NULL);
                deviceFound = checkError(clStatus, "No devices found");
                deviceId = i;
                fprintf(stderr, "[Device ID selected = %d]n", deviceId);

                break;
            }
        }

        //if found device, display information
        if (deviceFound)
        {
            if (deviceId < 0 || deviceId >(num_devices - 1))
            {
                deviceId = 0;
                printf("ID device not found, use default GPU device n");
            }

            //Obtain device vendor name to display info   
            clStatus = clGetDeviceInfo(device_list[deviceId], CL_DEVICE_NAME, 0, NULL, &paramValueSize);
            checkError(clStatus, "Failed to find OpenCL device info");


            info = (char *)malloc(sizeof(char) * paramValueSize);  // String to display vendor name info
            clStatus = clGetDeviceInfo(device_list[deviceId], CL_DEVICE_NAME, paramValueSize, info, NULL);
            checkError(clStatus, "Failed to find OpenCL device info");



        }
    }
    if (deviceFound)
        fprintf(stderr, "[Using GPU device = %s]nn", info);
    free(info);
    


    // Create one OpenCL context for each device in the platform
    cl_context context;
    context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);
    checkError(clStatus, "clCreateContext");

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_list[deviceId], 0, &clStatus);
    checkError(clStatus, "clCreateCommandQueue");


    //cl_queue_properties qprop[] = { CL_QUEUE_PROPERTIES,  0 , 0 };
    //cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_list[0], qprop, &clStatus);

    // Create memory buffers on the device for each vector
    cl_mem A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
    checkError(clStatus, "clCreateBuffer A");
    cl_mem B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
    checkError(clStatus, "clCreateBuffer B");
    cl_mem C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
    checkError(clStatus, "clCreateBuffer C");

    // Copy the Buffer A and B to the device
    clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL);
    clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL);

    // Create a program from the kernel source
    //cl_program program = clCreateProgramWithSource(context, 1, (const char **)&saxpy_kernel, NULL, &clStatus);
    //checkError(clStatus, "clCreateProgramWithSource");    
    // Build the program
    //clStatus = clBuildProgram(program, 1, device_list, NULL, NULL, NULL);
    //checkError(clStatus, "clBuildProgram");
    // Create the OpenCL kernel
    //cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);
    //checkError(clStatus, "clCreateKernel");

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,  (const char **)&source_str, (const size_t *)&source_size, &clStatus);
    checkError(clStatus, "clCreateProgramWithSource");   
    // Build the program
    clStatus = clBuildProgram(program, num_devices, device_list, NULL, NULL, NULL);
    
    // Query if compilation was successful  
    
    if (clStatus == CL_BUILD_PROGRAM_FAILURE) {

        // Determine the size of the log
        size_t log_size;
        clGetProgramBuildInfo(program, device_list[deviceId], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

        // Allocate memory for the log
        char *log = (char *)malloc(log_size);

        // Get the log
        clGetProgramBuildInfo(program, device_list[deviceId], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

        // Print the log
        fprintf(stderr, "n************* CL_BUILD_PROGRAM_FAILURE ********************n");
        fprintf(stderr, "[Error Description]nn%sn", log);
        fprintf(stderr, "*************************************************************n");
        free(log);
        
    }

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);
    checkError(clStatus, "clCreateKernel");

    // Set the arguments of the kernel
    clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha);
    checkError(clStatus, "clSetKernelArg 0");
    clStatus = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&A_clmem);
    checkError(clStatus, "clSetKernelArg 1");
    clStatus = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&B_clmem);
    checkError(clStatus, "clSetKernelArg 2");
    clStatus = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&C_clmem);
    checkError(clStatus, "clSetKernelArg 3");

    // Execute the OpenCL kernel on the list
    size_t global_size = VECTOR_SIZE; // Process the entire lists
    size_t local_size = 16;           // Process one item at a time
    clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
    checkError(clStatus, "clEnqueueNDRangeKernel");

    // Read the cl memory C_clmem on device to the host variable C
    clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL);
    checkError(clStatus, "clEnqueueReadBuffer C");

    // Clean up and wait for all the comands to complete.
    clStatus = clFlush(command_queue);
    clStatus = clFinish(command_queue);

    // Display the result to the screen
    for (i = 0; i < VECTOR_SIZE; i++)
        printf("%f + %f = %fn", A[i], B[i], C[i]);
 

    // Finally release all OpenCL allocated objects and host buffers.
    clStatus = clReleaseKernel(kernel);
    clStatus = clReleaseProgram(program);
    clStatus = clReleaseMemObject(A_clmem);
    clStatus = clReleaseMemObject(B_clmem);
    clStatus = clReleaseMemObject(C_clmem);

    clStatus = clReleaseCommandQueue(command_queue);
    clStatus = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    free(platforms);
    free(device_list);
    return 0;
}

The saxpy.cl file:

// Element-wise vector add: each work-item handles the element at its own
// global index and stores A[i] + B[i] into C[i].
// `alpha` is part of the signature (the host sets it as argument 0) but is
// not used in the computation.
__kernel void saxpy_kernel(float alpha,
                           __global float *A,
                           __global float *B,
                           __global float *C)
{
    const int gid = get_global_id(0);
    C[gid] = A[gid] + B[gid];
}
                           

You can also see the clinfo report below

Number of platforms                               1
  Platform Name                                   NVIDIA CUDA
  Platform Vendor                                 NVIDIA Corporation
  Platform Version                                OpenCL 1.2 CUDA 10.1.120
  Platform Profile                                FULL_PROFILE
  Platform Extensions                             cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts cl_khr_gl_event cl_nv_create_buffer
  Platform Extensions function suffix             NV

  Platform Name                                   NVIDIA CUDA
Number of devices                                 1
  Device Name                                     Quadro P4000
  Device Vendor                                   NVIDIA Corporation
  Device Vendor ID                                0x10de
  Device Version                                  OpenCL 1.2 CUDA
  Driver Version                                  430.64
  Device OpenCL C Version                         OpenCL C 1.2
  Device Type                                     GPU
  Device Topology (NV)                            <printDeviceInfo:22: get CL_DEVICE_PCI_DOMAIN_ID_NV : error -30>
  Device Profile                                  FULL_PROFILE
  Device Available                                Yes
  Compiler Available                              Yes
  Linker Available                                Yes
  Max compute units                               14
  Max clock frequency                             1480MHz
  Compute Capability (NV)                         6.1
  Device Partition                                (core)
    Max number of sub-devices                     1
    Supported partition types                     None
    Supported affinity domains                    (n/a)
  Max work item dimensions                        3
  Max work item sizes                             1024x1024x64
  Max work group size                             1024

Source: Windows Questions C++

LEAVE A COMMENT