#ifndef __GAUSSIAN_ELIMINATION__ #define __GAUSSIAN_ELIMINATION__ #include "gaussianElim.h" cl_context context = NULL; int main(int argc, char *argv[]) { printf("enter demo main\n"); float *a = NULL, *b = NULL, *finalVec = NULL; float *m = NULL; int size; FILE *fp; // args char filename[100]; int quiet = 0, timing = 0, platform = -1, device = -1; // parse command line if (parseCommandline(argc, argv, filename, &quiet, &timing, &platform, &device)) { printUsage(); return 0; } context = cl_init_context(platform, device, quiet); fp = fopen(filename, "r"); fscanf(fp, "%d", &size); a = (float *)malloc(size * size * sizeof(float)); printf("OK\n"); InitMat(fp, size, a, size, size); // printf("The input matrix a is:\n"); // PrintMat(a, size, size, size); b = (float *)malloc(size * sizeof(float)); InitAry(fp, b, size); // printf("The input array b is:\n"); // PrintAry(b, size); // create the solution matrix m = (float *)malloc(size * size * sizeof(float)); // create a new vector to hold the final answer finalVec = (float *)malloc(size * sizeof(float)); InitPerRun(size, m); // begin timing // run kernels ForwardSub(context, a, b, m, size, timing); // end timing if (!quiet) { printf("The result of matrix m is: \n"); PrintMat(m, size, size, size); printf("The result of matrix a is: \n"); PrintMat(a, size, size, size); printf("The result of array b is: \n"); PrintAry(b, size); BackSub(a, b, finalVec, size); printf("The final solution is: \n"); PrintAry(finalVec, size); } fclose(fp); free(m); free(a); free(b); free(finalVec); // OpenClGaussianElimination(context,timing); cl_cleanup(); printf("Passed!\n"); return 0; } /*------------------------------------------------------ ** ForwardSub() -- Forward substitution of Gaussian ** elimination. **------------------------------------------------------ */ void ForwardSub(cl_context context, float *a, float *b, float *m, int size, int timing) { // 1. set up kernels cl_kernel fan1_kernel, fan2_kernel; cl_int status = 0; cl_program gaussianElim_program; cl_event writeEvent, kernelEvent, readEvent; float writeTime = 0, readTime = 0, kernelTime = 0; float writeMB = 0, readMB = 0; gaussianElim_program = cl_compileProgram((char *)"gaussianElim_kernels.cl", NULL); fan1_kernel = clCreateKernel(gaussianElim_program, "Fan1", &status); status = cl_errChk(status, (char *)"Error Creating Fan1 kernel", true); if (status) exit(1); fan2_kernel = clCreateKernel(gaussianElim_program, "Fan2", &status); status = cl_errChk(status, (char *)"Error Creating Fan2 kernel", true); if (status) exit(1); // 2. set up memory on device and send ipts data to device cl_mem a_dev, b_dev, m_dev; cl_int error = 0; a_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * size * size, NULL, &error); b_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * size, NULL, &error); m_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * size * size, NULL, &error); cl_command_queue command_queue = cl_getCommandQueue(); error = clEnqueueWriteBuffer(command_queue, a_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size * size, a, 0, NULL, &writeEvent); if (timing) writeTime += eventTime(writeEvent, command_queue); clReleaseEvent(writeEvent); error = clEnqueueWriteBuffer(command_queue, b_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size, b, 0, NULL, &writeEvent); if (timing) writeTime += eventTime(writeEvent, command_queue); clReleaseEvent(writeEvent); error = clEnqueueWriteBuffer(command_queue, m_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size * size, m, 0, NULL, &writeEvent); if (timing) writeTime += eventTime(writeEvent, command_queue); clReleaseEvent(writeEvent); writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6); // 3. Determine block sizes size_t globalWorksizeFan1[1]; size_t globalWorksizeFan2[2]; globalWorksizeFan1[0] = size; globalWorksizeFan2[0] = size; globalWorksizeFan2[1] = size; int t; // 4. Setup and Run kernels for (t = 0; t < (size - 1); t++) { // kernel args cl_int argchk; argchk = clSetKernelArg(fan1_kernel, 0, sizeof(cl_mem), (void *)&m_dev); argchk |= clSetKernelArg(fan1_kernel, 1, sizeof(cl_mem), (void *)&a_dev); argchk |= clSetKernelArg(fan1_kernel, 2, sizeof(cl_mem), (void *)&b_dev); argchk |= clSetKernelArg(fan1_kernel, 3, sizeof(int), (void *)&size); argchk |= clSetKernelArg(fan1_kernel, 4, sizeof(int), (void *)&t); cl_errChk(argchk, "ERROR in Setting Fan1 kernel args", true); // launch kernel error = clEnqueueNDRangeKernel(command_queue, fan1_kernel, 1, 0, globalWorksizeFan1, NULL, 0, NULL, &kernelEvent); cl_errChk(error, "ERROR in Executing Fan1 Kernel", true); if (timing) { // printf("here1a\n"); kernelTime += eventTime(kernelEvent, command_queue); // printf("here1b\n"); } clReleaseEvent(kernelEvent); // Fan1<<>>(m_cuda,a_cuda,Size,t); // cudaThreadSynchronize(); // kernel args argchk = clSetKernelArg(fan2_kernel, 0, sizeof(cl_mem), (void *)&m_dev); argchk |= clSetKernelArg(fan2_kernel, 1, sizeof(cl_mem), (void *)&a_dev); argchk |= clSetKernelArg(fan2_kernel, 2, sizeof(cl_mem), (void *)&b_dev); argchk |= clSetKernelArg(fan2_kernel, 3, sizeof(int), (void *)&size); argchk |= clSetKernelArg(fan2_kernel, 4, sizeof(int), (void *)&t); cl_errChk(argchk, "ERROR in Setting Fan2 kernel args", true); // launch kernel error = clEnqueueNDRangeKernel(command_queue, fan2_kernel, 2, 0, globalWorksizeFan2, NULL, 0, NULL, &kernelEvent); cl_errChk(error, "ERROR in Executing Fan1 Kernel", true); if (timing) { // printf("here2a\n"); kernelTime += eventTime(kernelEvent, command_queue); // printf("here2b\n"); } clReleaseEvent(kernelEvent); // Fan2<<>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t); // cudaThreadSynchronize(); } // 5. transfer data off of device error = clEnqueueReadBuffer(command_queue, a_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size * size, a, 0, NULL, &readEvent); cl_errChk(error, "ERROR with clEnqueueReadBuffer", true); if (timing) readTime += eventTime(readEvent, command_queue); clReleaseEvent(readEvent); error = clEnqueueReadBuffer(command_queue, b_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size, b, 0, NULL, &readEvent); cl_errChk(error, "ERROR with clEnqueueReadBuffer", true); if (timing) readTime += eventTime(readEvent, command_queue); clReleaseEvent(readEvent); error = clEnqueueReadBuffer(command_queue, m_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size * size, m, 0, NULL, &readEvent); cl_errChk(error, "ERROR with clEnqueueReadBuffer", true); if (timing) readTime += eventTime(readEvent, command_queue); clReleaseEvent(readEvent); readMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6); if (timing) { printf("Matrix Size\tWrite(s) [size]\t\tKernel(s)\tRead(s) " "[size]\t\tTotal(s)\n"); printf("%dx%d \t", size, size); printf("%f [%.2fMB]\t", writeTime, writeMB); printf("%f\t", kernelTime); printf("%f [%.2fMB]\t", readTime, readMB); printf("%f\n\n", writeTime + kernelTime + readTime); } cl_freeMem(a_dev); cl_freeMem(b_dev); cl_freeMem(m_dev); cl_freeKernel(fan1_kernel); cl_freeKernel(fan2_kernel); cl_freeProgram(gaussianElim_program); } float eventTime(cl_event event, cl_command_queue command_queue) { cl_int error = 0; cl_ulong eventStart, eventEnd; clFinish(command_queue); error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &eventStart, NULL); cl_errChk(error, "ERROR in Event Profiling.", true); error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &eventEnd, NULL); cl_errChk(error, "ERROR in Event Profiling.", true); return (float)((eventEnd - eventStart) / 1e9); } int parseCommandline(int argc, char *argv[], char *filename, int *q, int *t, int *p, int *d) { int i; // if (argc < 2) return 1; // error strncpy(filename, "matrix4.txt", 100); char flag; for (i = 1; i < argc; i++) { if (argv[i][0] == '-') { // flag flag = argv[i][1]; switch (flag) { case 'h': // help return 1; break; case 'q': // quiet *q = 1; break; case 't': // timing *t = 1; break; case 'p': // platform i++; *p = atoi(argv[i]); break; case 'd': // device i++; *d = atoi(argv[i]); break; } } } if ((*d >= 0 && *p < 0) || (*p >= 0 && *d < 0)) // both p and d must be specified if either are specified return 1; return 0; } void printUsage() { printf("Gaussian Elimination Usage\n"); printf("\n"); printf("gaussianElimination [filename] [-hqt] [-p [int] -d [int]]\n"); printf("\n"); printf("example:\n"); printf("$ ./gaussianElimination matrix4.txt\n"); printf("\n"); printf("filename the filename that holds the matrix data\n"); printf("\n"); printf("-h Display the help file\n"); printf("-q Quiet mode. Suppress all text output.\n"); printf("-t Print timing information.\n"); printf("\n"); printf("-p [int] Choose the platform (must choose both platform and " "device)\n"); printf("-d [int] Choose the device (must choose both platform and " "device)\n"); printf("\n"); printf("\n"); printf("Notes: 1. The filename is required as the first parameter.\n"); printf(" 2. If you declare either the device or the platform,\n"); printf(" you must declare both.\n\n"); } /*------------------------------------------------------ ** InitPerRun() -- Initialize the contents of the ** multipier matrix **m **------------------------------------------------------ */ void InitPerRun(int size, float *m) { int i; for (i = 0; i < size * size; i++) *(m + i) = 0.0; } void BackSub(float *a, float *b, float *finalVec, int size) { // solve "bottom up" int i, j; for (i = 0; i < size; i++) { finalVec[size - i - 1] = b[size - i - 1]; for (j = 0; j < i; j++) { finalVec[size - i - 1] -= *(a + size * (size - i - 1) + (size - j - 1)) * finalVec[size - j - 1]; } finalVec[size - i - 1] = finalVec[size - i - 1] / *(a + size * (size - i - 1) + (size - i - 1)); } } void InitMat(FILE *fp, int size, float *ary, int nrow, int ncol) { int i, j; for (i = 0; i < nrow; i++) { for (j = 0; j < ncol; j++) { fscanf(fp, "%f", ary + size * i + j); } } } /*------------------------------------------------------ ** InitAry() -- Initialize the array (vector) by reading ** data from the data file **------------------------------------------------------ */ void InitAry(FILE *fp, float *ary, int ary_size) { int i; for (i = 0; i < ary_size; i++) { fscanf(fp, "%f", &ary[i]); } } /*------------------------------------------------------ ** PrintMat() -- Print the contents of the matrix **------------------------------------------------------ */ void PrintMat(float *ary, int size, int nrow, int ncol) { int i, j; for (i = 0; i < nrow; i++) { for (j = 0; j < ncol; j++) { printf("%8.2f ", *(ary + size * i + j)); } printf("\n"); } printf("\n"); } /*------------------------------------------------------ ** PrintAry() -- Print the contents of the array (vector) **------------------------------------------------------ */ void PrintAry(float *ary, int ary_size) { int i; for (i = 0; i < ary_size; i++) { printf("%.2f ", ary[i]); } printf("\n\n"); } #endif