#include "kmeans.h" #include #include #include #include #include #include #ifdef WIN #include #else #include #include double gettime() { struct timeval t; gettimeofday(&t, NULL); return t.tv_sec + t.tv_usec * 1e-6; } #endif #ifdef NV #include #else #include #endif #ifndef FLT_MAX #define FLT_MAX 3.40282347e+38 #endif #ifdef RD_WG_SIZE_0_0 #define BLOCK_SIZE RD_WG_SIZE_0_0 #elif defined(RD_WG_SIZE_0) #define BLOCK_SIZE RD_WG_SIZE_0 #elif defined(RD_WG_SIZE) #define BLOCK_SIZE RD_WG_SIZE #else #define BLOCK_SIZE 256 #endif #ifdef RD_WG_SIZE_1_0 #define BLOCK_SIZE2 RD_WG_SIZE_1_0 #elif defined(RD_WG_SIZE_1) #define BLOCK_SIZE2 RD_WG_SIZE_1 #elif defined(RD_WG_SIZE) #define BLOCK_SIZE2 RD_WG_SIZE #else #define BLOCK_SIZE2 256 #endif // local variables static cl_context context; static cl_command_queue cmd_queue; static cl_device_type device_type; static cl_device_id *device_list; static cl_int num_devices; static int initialize(int use_gpu) { cl_int result; size_t size; /*// create OpenCL context cl_platform_id platform_id; if (clGetPlatformIDs(1, &platform_id, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(1,*,0) failed\n"); return -1; } cl_context_properties ctxprop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0}; device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU; context = clCreateContextFromType(ctxprop, device_type, NULL, NULL, NULL); if (!context) { printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU"); return -1; } // get the list of GPUs result = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size); num_devices = (int)(size / sizeof(cl_device_id)); if (result != CL_SUCCESS || num_devices < 1) { printf("ERROR: clGetContextInfo() failed\n"); return -1; } device_list = new cl_device_id[num_devices]; if (!device_list) { printf("ERROR: new cl_device_id[] failed\n"); return -1; } result = clGetContextInfo(context, CL_CONTEXT_DEVICES, size, device_list, NULL); if (result != CL_SUCCESS) { printf("ERROR: clGetContextInfo() failed\n"); return -1; }*/ cl_platform_id platform_id; num_devices = 1; device_list = new cl_device_id[num_devices]; result = clGetPlatformIDs(1, &platform_id, NULL); result = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, device_list, NULL); context = clCreateContext(NULL, 1, device_list, NULL, NULL, &result); // create command queue for the first device cmd_queue = clCreateCommandQueue(context, device_list[0], 0, NULL); if (!cmd_queue) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; } return 0; } static int shutdown() { // release resources if (cmd_queue) clReleaseCommandQueue(cmd_queue); if (context) clReleaseContext(context); if (device_list) delete device_list; // reset all variables cmd_queue = 0; context = 0; device_list = 0; num_devices = 0; device_type = 0; return 0; } cl_mem d_feature; cl_mem d_feature_swap; cl_mem d_cluster; cl_mem d_membership; cl_kernel kernel; cl_kernel kernel_s; cl_kernel kernel2; int *membership_OCL; int *membership_d; float *feature_d; float *clusters_d; float *center_d; static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) { if (nullptr == filename || nullptr == data || 0 == size) return -1; FILE* fp = fopen(filename, "r"); if (NULL == fp) { fprintf(stderr, "Failed to load kernel."); return -1; } fseek(fp , 0 , SEEK_END); long fsize = ftell(fp); rewind(fp); *data = (uint8_t*)malloc(fsize); *size = fread(*data, 1, fsize, fp); fclose(fp); return 0; } int allocate(int n_points, int n_features, int n_clusters, float **feature) { /*int sourcesize = 1024 * 1024; char *source = (char *)calloc(sourcesize, sizeof(char)); if (!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; } // read the kernel core source char *tempchar = "./kmeans.cl"; FILE *fp = fopen(tempchar, "rb"); if (!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; } fread(source + strlen(source), sourcesize, 1, fp); fclose(fp);*/ // OpenCL initialization int use_gpu = 1; if (initialize(use_gpu)) return -1; // compile kernel cl_int err = 0; //const char *slist[2] = {source, 0}; //cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err); cl_program prog = clCreateProgramWithBuiltInKernels(context, 1, device_list, "kmeans_kernel_c;kmeans_swap", &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; } err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL); { // show warnings/errors // static char log[65536]; memset(log, 0, sizeof(log)); // cl_device_id device_id = 0; // err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), //&device_id, NULL); // clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, // sizeof(log)-1, log, NULL); // if(err || strstr(log,"warning:") || strstr(log, "error:")) // printf("<<<<\n%s\n>>>>\n", log); } if (err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; } char *kernel_kmeans_c = "kmeans_kernel_c"; char *kernel_swap = "kmeans_swap"; kernel_s = clCreateKernel(prog, kernel_kmeans_c, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; } kernel2 = clCreateKernel(prog, kernel_swap, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; } clReleaseProgram(prog); d_feature = clCreateBuffer(context, CL_MEM_READ_WRITE, n_points * n_features * sizeof(float), NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_feature (size:%d) => %d\n", n_points * n_features, err); return -1; } d_feature_swap = clCreateBuffer(context, CL_MEM_READ_WRITE, n_points * n_features * sizeof(float), NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_feature_swap (size:%d) => %d\n", n_points * n_features, err); return -1; } d_cluster = clCreateBuffer(context, CL_MEM_READ_WRITE, n_clusters * n_features * sizeof(float), NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_cluster (size:%d) => %d\n", n_clusters * n_features, err); return -1; } d_membership = clCreateBuffer(context, CL_MEM_READ_WRITE, n_points * sizeof(int), NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_membership (size:%d) => %d\n", n_points, err); return -1; } // write buffers err = clEnqueueWriteBuffer(cmd_queue, d_feature, 1, 0, n_points * n_features * sizeof(float), feature[0], 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer d_feature (size:%d) => %d\n", n_points * n_features, err); return -1; } clSetKernelArg(kernel2, 0, sizeof(void *), (void *)&d_feature); clSetKernelArg(kernel2, 1, sizeof(void *), (void *)&d_feature_swap); clSetKernelArg(kernel2, 2, sizeof(cl_int), (void *)&n_points); clSetKernelArg(kernel2, 3, sizeof(cl_int), (void *)&n_features); size_t global_work[3] = {n_points, 1, 1}; /// Ke Wang adjustable local group size 2013/08/07 10:37:33 size_t local_work_size = BLOCK_SIZE; // work group size is defined by // RD_WG_SIZE_0 or RD_WG_SIZE_0_0 // 2014/06/10 17:00:51 if (global_work[0] % local_work_size != 0) global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size; err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 1, NULL, global_work, &local_work_size, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } membership_OCL = (int *)malloc(n_points * sizeof(int)); } void deallocateMemory() { clReleaseMemObject(d_feature); clReleaseMemObject(d_feature_swap); clReleaseMemObject(d_cluster); clReleaseMemObject(d_membership); free(membership_OCL); } int main(int argc, char **argv) { printf("WG size of kernel_swap = %d, WG size of kernel_kmeans = %d \n", BLOCK_SIZE, BLOCK_SIZE2); setup(argc, argv); shutdown(); } int kmeansOCL(float **feature, /* in: [npoints][nfeatures] */ int n_features, int n_points, int n_clusters, int *membership, float **clusters, int *new_centers_len, float **new_centers) { int delta = 0; int i, j, k; cl_int err = 0; size_t global_work[3] = {n_points, 1, 1}; /// Ke Wang adjustable local group size 2013/08/07 10:37:33 size_t local_work_size = BLOCK_SIZE2; // work group size is defined by // RD_WG_SIZE_1 or RD_WG_SIZE_1_0 // 2014/06/10 17:00:41 if (global_work[0] % local_work_size != 0) global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size; err = clEnqueueWriteBuffer(cmd_queue, d_cluster, 1, 0, n_clusters * n_features * sizeof(float), clusters[0], 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer d_cluster (size:%d) => %d\n", n_points, err); return -1; } int size = 0; int offset = 0; clSetKernelArg(kernel_s, 0, sizeof(void *), (void *)&d_feature_swap); clSetKernelArg(kernel_s, 1, sizeof(void *), (void *)&d_cluster); clSetKernelArg(kernel_s, 2, sizeof(void *), (void *)&d_membership); clSetKernelArg(kernel_s, 3, sizeof(cl_int), (void *)&n_points); clSetKernelArg(kernel_s, 4, sizeof(cl_int), (void *)&n_clusters); clSetKernelArg(kernel_s, 5, sizeof(cl_int), (void *)&n_features); clSetKernelArg(kernel_s, 6, sizeof(cl_int), (void *)&offset); clSetKernelArg(kernel_s, 7, sizeof(cl_int), (void *)&size); err = clEnqueueNDRangeKernel(cmd_queue, kernel_s, 1, NULL, global_work, &local_work_size, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } clFinish(cmd_queue); err = clEnqueueReadBuffer(cmd_queue, d_membership, 1, 0, n_points * sizeof(int), membership_OCL, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: Memcopy Out\n"); return -1; } delta = 0; for (i = 0; i < n_points; i++) { int cluster_id = membership_OCL[i]; new_centers_len[cluster_id]++; if (membership_OCL[i] != membership[i]) { delta++; membership[i] = membership_OCL[i]; } for (j = 0; j < n_features; j++) { new_centers[cluster_id][j] += feature[i][j]; } } return delta; }