// create the OpenCL context on a GPU device
cl_context context = clCreateContextFromType(0,
                       CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
 
// get the list of GPU devices associated with context
clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
 
cl_device_id[] devices = malloc(cb);
clGetContextInfo(context,CL_CONTEXT_DEVICES,cb,devices,NULL);
 
// create a command-queue
cmd_queue = clCreateCommandQueue(context,devices[0],0,NULL);
 
// Allocate the three buffer memory objects used by the kernel. Every buffer
// holds n cl_float elements, so the byte size is computed once.
const size_t vec_bytes = sizeof(cl_float) * n;

// Two read-only inputs whose contents are copied from host memory when the
// buffer is created.
// NOTE(review): the second host pointer is spelled `srcb` while the first is
// `srcA`; confirm this matches the declaration outside this excerpt.
memobjs[0] = clCreateBuffer(context,
                            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            vec_bytes, srcA, NULL);
memobjs[1] = clCreateBuffer(context,
                            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            vec_bytes, srcb, NULL);

// One write-only output; no host pointer is given, so nothing is copied in.
memobjs[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                            vec_bytes, NULL, NULL);
 
// Create the program object from a single source string. A NULL lengths
// array means the string is treated as null-terminated.
program = clCreateProgramWithSource(context, 1,
                                &program_source, NULL, NULL);
// Build the program for every device associated with it (device list is
// NULL), with no build options and no completion callback.
// NOTE(review): the build status stored in err is not inspected here.
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

// Look up the kernel entry point by name. The name has to be an ordinary
// double-quoted C string literal; typographic quotes do not compile.
kernel = clCreateKernel(program, "vec_add", NULL);
 
// Bind the three buffer objects to kernel arguments 0..2.
// clSetKernelArg takes (kernel, arg_index, arg_size, arg_value): the size of
// the argument comes before the pointer to its value. For a buffer argument
// the value is a pointer to the cl_mem handle and the size is sizeof(cl_mem).
// The status codes are OR-ed together, so err ends up non-zero if any of the
// three calls failed (the combined value is not itself a meaningful code).
err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memobjs[0]);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memobjs[1]);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memobjs[2]);
// set work-item dimensions
global_work_size[0] = n;
 
// execute kernel
err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
                    global_work_size, NULL,0,NULL,NULL);
 
// read output array
err = clEnqueueReadBuffer(cmd_queue, memobjs[2], 
                          CL_TRUE, 0,
                          n*sizeof(cl_float), dst,
                          0, NULL, NULL);