diff --git a/benchmarks/opencl/reduce0/main.cc b/benchmarks/opencl/reduce0/main.cc new file mode 100644 index 00000000..bc2db24b --- /dev/null +++ b/benchmarks/opencl/reduce0/main.cc @@ -0,0 +1,638 @@ +/* +* Copyright 1993-2010 NVIDIA Corporation. All rights reserved. +* +* Please refer to the NVIDIA end user license agreement (EULA) associated +* with this source code for terms and conditions that govern your use of +* this software. Any use, reproduction, disclosure, or distribution of +* this software and related documentation outside the terms of the EULA +* is strictly prohibited. +* +*/ + +/* + Parallel reduction + + This sample shows how to perform a reduction operation on an array of values + to produce a single value. + + Reductions are a very common computation in parallel algorithms. Any time + an array of values needs to be reduced to a single value using a binary + associative operator, a reduction can be used. Example applications include + statistics computaions such as mean and standard deviation, and image + processing applications such as finding the total luminance of an + image. + + This code performs sum reductions, but any associative operator such as + min() or max() could also be used. + + It assumes the input size is a power of 2. + + COMMAND LINE ARGUMENTS + + "--shmoo": Test performance for 1 to 32M elements with each of the 7 different kernels + "--n=": Specify the number of elements to reduce (default 1048576) + "--threads=": Specify the number of threads per block (default 128) + "--kernel=": Specify which kernel to run (0-6, default 6) + "--maxblocks=": Specify the maximum number of thread blocks to launch (kernel 6 only, default 64) + "--cpufinal": Read back the per-block results and do final sum of block sums on CPU (default false) + "--cputhresh=": The threshold of number of blocks sums below which to perform a CPU final reduction (default 1) + +*/ + +// Common system and utility includes +#include +#include + +// additional includes +#include +#include + +// Forward declarations and sample-specific defines +// ********************************************************************* +enum ReduceType +{ + REDUCE_INT, + REDUCE_FLOAT, + REDUCE_DOUBLE +}; + +template +bool runTest( int argc, const char** argv, ReduceType datatype); + +#define MAX_BLOCK_DIM_SIZE 65535 + +extern "C" +bool isPow2(unsigned int x) +{ + return ((x&(x-1))==0); +} + +cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2); + +// Main function +// ********************************************************************* +int main( int argc, const char** argv) +{ + shrQAStart(argc, (char **)argv); + + // start logs + shrSetLogFileName ("oclReduction.txt"); + shrLog("%s Starting...\n\n", argv[0]); + + char *typeChoice; + shrGetCmdLineArgumentstr(argc, argv, "type", &typeChoice); + + // determine type of array from command line args + if (0 == typeChoice) + { + typeChoice = (char*)malloc(7 * sizeof(char)); + #ifdef WIN32 + strcpy_s(typeChoice, 7 * sizeof(char) + 1, "int"); + #else + strcpy(typeChoice, "int"); + #endif + } + ReduceType datatype = REDUCE_INT; + + #ifdef WIN32 + if (!_strcmpi(typeChoice, "float")) + datatype = REDUCE_FLOAT; + else if (!_strcmpi(typeChoice, "double")) + datatype = REDUCE_DOUBLE; + else + datatype = REDUCE_INT; + #else + if (!strcmp(typeChoice, "float")) + datatype = REDUCE_FLOAT; + else if (!strcmp(typeChoice, "double")) + datatype = REDUCE_DOUBLE; + else + datatype = REDUCE_INT; + #endif + + shrLog("Reducing array of type %s.\n", typeChoice); + + //Get the NVIDIA platform + ciErrNum = oclGetPlatformID(&cpPlatform); + //oclCheckError(ciErrNum, CL_SUCCESS); + + //Get the devices + ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices); + //oclCheckError(ciErrNum, CL_SUCCESS); + cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); + ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL); + //oclCheckError(ciErrNum, CL_SUCCESS); + + //Create the context + cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum); + //oclCheckError(ciErrNum, CL_SUCCESS); + + // get and log the device info + if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) { + int device_nr = 0; + shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &device_nr); + if( device_nr < uiNumDevices ) { + device = oclGetDev(cxGPUContext, device_nr); + } else { + shrLog("Invalid Device %d Requested.\n", device_nr); + shrExitEX(argc, argv, EXIT_FAILURE); + } + } else { + device = oclGetMaxFlopsDev(cxGPUContext); + } + oclPrintDevName(LOGBOTH, device); + shrLog("\n"); + + // create a command-queue + cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); + //oclCheckError(ciErrNum, CL_SUCCESS); + + source_path = shrFindFilePath("oclReduction_kernel.cl", argv[0]); + + bool bSuccess = false; + switch (datatype) + { + default: + case REDUCE_INT: + bSuccess = runTest( argc, argv, datatype); + break; + case REDUCE_FLOAT: + bSuccess = runTest( argc, argv, datatype); + break; + } + + // finish + shrQAFinishExit(argc, (const char **)argv, bSuccess ? QA_PASSED : QA_FAILED); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Compute sum reduction on CPU +//! We use Kahan summation for an accurate sum of large arrays. +//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm +//! +//! @param data pointer to input data +//! @param size number of input data elements +//////////////////////////////////////////////////////////////////////////////// +template +T reduceCPU(T *data, int size) +{ + T sum = data[0]; + T c = (T)0.0; + for (int i = 1; i < size; i++) + { + T y = data[i] - c; + T t = sum + y; + c = (t - sum) - y; + sum = t; + } + return sum; +} + +unsigned int nextPow2( unsigned int x ) { + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; +} + +//////////////////////////////////////////////////////////////////////////////// +// Compute the number of threads and blocks to use for the given reduction kernel +// For the kernels >= 3, we set threads / block to the minimum of maxThreads and +// n/2. For kernels < 3, we set to the minimum of maxThreads and n. For kernel +// 6, we observe the maximum specified number of blocks, because each thread in +// that kernel can process a variable number of elements. +//////////////////////////////////////////////////////////////////////////////// +void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads) +{ + if (whichKernel < 3) + { + threads = (n < maxThreads) ? nextPow2(n) : maxThreads; + blocks = (n + threads - 1) / threads; + } + else + { + threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads; + blocks = (n + (threads * 2 - 1)) / (threads * 2); + } + + + if (whichKernel == 6) + blocks = MIN(maxBlocks, blocks); +} + +//////////////////////////////////////////////////////////////////////////////// +// This function performs a reduction of the input data multiple times and +// measures the average reduction time. +//////////////////////////////////////////////////////////////////////////////// +template +T profileReduce(ReduceType datatype, + cl_int n, + int numThreads, + int numBlocks, + int maxThreads, + int maxBlocks, + int whichKernel, + int testIterations, + bool cpuFinalReduction, + int cpuFinalThreshold, + double* dTotalTime, + T* h_odata, + cl_mem d_idata, + cl_mem d_odata) +{ + + + T gpu_result = 0; + bool needReadBack = true; + cl_kernel finalReductionKernel[10]; + int finalReductionIterations=0; + + //shrLog("Profile Kernel %d\n", whichKernel); + + cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n) ); + clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata); + clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata); + clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n); + clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL); + + if( !cpuFinalReduction ) { + int s=numBlocks; + int threads = 0, blocks = 0; + int kernel = (whichKernel == 6) ? 5 : whichKernel; + + while(s > cpuFinalThreshold) + { + getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads); + + finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s) ); + clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata); + clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata); + clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n); + clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL); + + if (kernel < 3) + s = (s + threads - 1) / threads; + else + s = (s + (threads*2-1)) / (threads*2); + + finalReductionIterations++; + } + } + + size_t globalWorkSize[1]; + size_t localWorkSize[1]; + + for (int i = 0; i < testIterations; ++i) + { + gpu_result = 0; + + clFinish(cqCommandQueue); + if(i>0) shrDeltaT(1); + + // execute the kernel + globalWorkSize[0] = numBlocks * numThreads; + localWorkSize[0] = numThreads; + + ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue,reductionKernel, 1, 0, globalWorkSize, localWorkSize, + 0, NULL, NULL); + + // check if kernel execution generated an error + //oclCheckError(ciErrNum, CL_SUCCESS); + + if (cpuFinalReduction) + { + // sum partial sums from each block on CPU + // copy result from device to host + clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, numBlocks * sizeof(T), + h_odata, 0, NULL, NULL); + + for(int i=0; i cpuFinalThreshold) + { + int threads = 0, blocks = 0; + getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads); + + globalWorkSize[0] = threads * blocks; + localWorkSize[0] = threads; + + ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0, + globalWorkSize, localWorkSize, 0, NULL, NULL); + //oclCheckError(ciErrNum, CL_SUCCESS); + + if (kernel < 3) + s = (s + threads - 1) / threads; + else + s = (s + (threads*2-1)) / (threads*2); + + it++; + } + + if (s > 1) + { + // copy result from device to host + clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, s * sizeof(T), + h_odata, 0, NULL, NULL); + + for(int i=0; i < s; i++) + { + gpu_result += h_odata[i]; + } + + needReadBack = false; + } + } + + clFinish(cqCommandQueue); + if(i>0) *dTotalTime += shrDeltaT(1); + } + + if (needReadBack) + { + // copy final sum from device to host + clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, sizeof(T), + &gpu_result, 0, NULL, NULL); + } + + // Release the kernels + clReleaseKernel(reductionKernel); + if( !cpuFinalReduction ) { + for(int it=0; it +void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype) +{ + // create random input data on CPU + unsigned int bytes = maxN * sizeof(T); + + T* h_idata = (T*)malloc(bytes); + + for(int i = 0; i < maxN; i++) { + // Keep the numbers small so we don't get truncation error in the sum + if (datatype == REDUCE_INT) + h_idata[i] = (T)(rand() & 0xFF); + else + h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; + } + + int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE); + + // allocate mem for the result on host side + T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T)); + + // allocate device memory and data + cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL); + cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, maxNumBlocks * sizeof(T), NULL, NULL); + + int testIterations = 100; + double dTotalTime = 0.0; + + // print headers + shrLog("Time in seconds for various numbers of elements for each kernel\n"); + shrLog("\n\n"); + shrLog("Kernel"); + for (int i = minN; i <= maxN; i *= 2) + { + shrLog(", %d", i); + } + + for (int kernel = 0; kernel < 7; kernel++) + { + shrLog("\n"); + shrLog("%d", kernel); + for (int i = minN; i <= maxN; i *= 2) + { + int numBlocks = 0; + int numThreads = 0; + getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads); + + double reduceTime; + if( numBlocks <= MAX_BLOCK_DIM_SIZE ) { + profileReduce(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, + testIterations, false, 1, &dTotalTime, h_odata, d_idata, d_odata); + reduceTime = dTotalTime/(double)testIterations; + } else { + reduceTime = -1.0; + } + shrLog(", %.4f m", reduceTime); + } + } + + // cleanup + free(h_idata); + free(h_odata); + clReleaseMemObject(d_idata); + clReleaseMemObject(d_odata); +} + +//////////////////////////////////////////////////////////////////////////////// +// The main function whihc runs the reduction test. +//////////////////////////////////////////////////////////////////////////////// +template +bool +runTest( int argc, const char** argv, ReduceType datatype) +{ + int size = 1<<24; // number of elements to reduce + int maxThreads; + + cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1); + clReleaseKernel(reductionKernel); + + if (smallBlock) + maxThreads = 64; // number of threads per block + else + maxThreads = 128; + + int whichKernel = 6; + int maxBlocks = 64; + bool cpuFinalReduction = false; + int cpuFinalThreshold = 1; + + shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size); + shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads); + shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel); + shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks); + + shrLog(" %d elements\n", size); + shrLog(" %d threads (max)\n", maxThreads); + + cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE); + shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold); + + bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE); + +#ifdef GPU_PROFILING + if (runShmoo) + { + shmoo(1, 33554432, maxThreads, maxBlocks, datatype); + return true; + } + else +#endif + { + // create random input data on CPU + unsigned int bytes = size * sizeof(T); + T* h_idata = (T*)malloc(bytes); + + for(int i=0; i(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks, + whichKernel, testIterations, cpuFinalReduction, + cpuFinalThreshold, &dTotalTime, + h_odata, d_idata, d_odata); + +#ifdef GPU_PROFILING + double reduceTime = dTotalTime/(double)testIterations; + shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", + 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads); +#endif + + // compute reference solution + shrLog("\nComparing against Host/C++ computation...\n"); + T cpu_result = reduceCPU(h_idata, size); + if (datatype == REDUCE_INT) + { + shrLog(" GPU result = %d\n", gpu_result); + shrLog(" CPU result = %d\n\n", cpu_result); + shrLog("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED"); + } + else + { + shrLog(" GPU result = %.9f\n", gpu_result); + shrLog(" CPU result = %.9f\n\n", cpu_result); + + double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12; + double diff = abs((double)gpu_result - (double)cpu_result); + shrLog("%s\n\n", (diff < threshold) ? "PASSED" : "FAILED"); + } + + // cleanup + free(h_idata); + free(h_odata); + clReleaseMemObject(d_idata); + clReleaseMemObject(d_odata); + + return (gpu_result == cpu_result); + } +} + +// Helper function to create and build program and kernel +// ********************************************************************* +cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2) +{ + // compile cl program + size_t program_length; + char *source; + + std::ostringstream preamble; + + // create the program + // with type specification depending on datatype argument + switch (datatype) + { + default: + case REDUCE_INT: + preamble << "#define T int" << std::endl; + break; + case REDUCE_FLOAT: + preamble << "#define T float" << std::endl; + break; + } + + // set blockSize at compile time + preamble << "#define blockSize " << blockSize << std::endl; + + // set isPow2 at compile time + preamble << "#define nIsPow2 " << isPowOf2 << std::endl; + + // Load the source code and prepend the preamble + source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length); + //oclCheckError(source != NULL, shrTRUE); + + program = + clCreateProgramWithBuiltInKernels(context, 1, &device_id, "reduce0", NULL); + //cl_program rv_program = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, + // &program_length, &ciErrNum); + //oclCheckError(ciErrNum, CL_SUCCESS); + free(source); + + // build the program + ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); + if (ciErrNum != CL_SUCCESS) + { + // write out standard error, Build Log and PTX, then cleanup and exit + shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); + oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext)); + oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclReduction.ptx"); + //oclCheckError(ciErrNum, CL_SUCCESS); + } + + // create Kernel + std::ostringstream kernelName; + kernelName << "reduce" << whichKernel; + cl_kernel ckKernel = clCreateKernel(rv_program, kernelName.str().c_str(), &ciErrNum); + //oclCheckError(ciErrNum, CL_SUCCESS); + + size_t wgSize; + ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL); + if (wgSize == 64) + smallBlock = true; + else smallBlock = false; + + // NOTE: the program will get deleted when the kernel is also released + clReleaseProgram(rv_program); + + return ckKernel; +} diff --git a/benchmarks/opencl/reduce0/oclReduction.h b/benchmarks/opencl/reduce0/oclReduction.h new file mode 100644 index 00000000..865bb24d --- /dev/null +++ b/benchmarks/opencl/reduce0/oclReduction.h @@ -0,0 +1,34 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + + #ifndef __REDUCTION_H__ +#define __REDUCTION_H__ + +template +void reduce_sm10(int size, int threads, int blocks, + int whichKernel, T *d_idata, T *d_odata); + +template +void reduce_sm13(int size, int threads, int blocks, + int whichKernel, T *d_idata, T *d_odata); + +// CL objects +cl_platform_id cpPlatform; +cl_uint uiNumDevices; +cl_device_id* cdDevices; +cl_context cxGPUContext; +cl_command_queue cqCommandQueue; +cl_device_id device; +cl_int ciErrNum; +const char* source_path; +bool smallBlock = true; + +#endif diff --git a/benchmarks/opencl/reduce0/oclReduction_kernel.cl b/benchmarks/opencl/reduce0/oclReduction_kernel.cl new file mode 100644 index 00000000..a32c01c4 --- /dev/null +++ b/benchmarks/opencl/reduce0/oclReduction_kernel.cl @@ -0,0 +1,273 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* + Parallel reduction kernels +*/ + +// The following defines are set during runtime compilation, see reduction.cpp +// #define T float +// #define blockSize 128 +// #define nIsPow2 1 + +#ifndef _REDUCE_KERNEL_H_ +#define _REDUCE_KERNEL_H_ + +/* + Parallel sum reduction using shared memory + - takes log(n) steps for n input elements + - uses n threads + - only works for power-of-2 arrays +*/ + +/* This reduction interleaves which threads are active by using the modulo + operator. This operator is very expensive on GPUs, and the interleaved + inactivity means that no whole warps are active, which is also very + inefficient */ +__kernel void reduce0(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata) +{ + // load shared mem + unsigned int tid = get_local_id(0); + unsigned int i = get_global_id(0); + + sdata[tid] = (i < n) ? g_idata[i] : 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + for(unsigned int s=1; s < get_local_size(0); s *= 2) { + // modulo arithmetic is slow! + if ((tid % (2*s)) == 0) { + sdata[tid] += sdata[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // write result for this block to global mem + if (tid == 0) g_odata[get_group_id(0)] = sdata[0]; +} + + +/* This version uses contiguous threads, but its interleaved + addressing results in many shared memory bank conflicts. */ +__kernel void reduce1(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata) +{ + // load shared mem + unsigned int tid = get_local_id(0); + unsigned int i = get_global_id(0); + + sdata[tid] = (i < n) ? g_idata[i] : 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + for(unsigned int s=1; s < get_local_size(0); s *= 2) + { + int index = 2 * s * tid; + + if (index < get_local_size(0)) + { + sdata[index] += sdata[index + s]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + // write result for this block to global mem + if (tid == 0) g_odata[get_group_id(0)] = sdata[0]; +} + +/* + This version uses sequential addressing -- no divergence or bank conflicts. +*/ +__kernel void reduce2(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata) +{ + // load shared mem + unsigned int tid = get_local_id(0); + unsigned int i = get_global_id(0); + + sdata[tid] = (i < n) ? g_idata[i] : 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + for(unsigned int s=get_local_size(0)/2; s>0; s>>=1) + { + if (tid < s) + { + sdata[tid] += sdata[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // write result for this block to global mem + if (tid == 0) g_odata[get_group_id(0)] = sdata[0]; +} + +/* + This version uses n/2 threads -- + it performs the first level of reduction when reading from global memory +*/ +__kernel void reduce3(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata) +{ + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = get_local_id(0); + unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0); + + sdata[tid] = (i < n) ? g_idata[i] : 0; + if (i + get_local_size(0) < n) + sdata[tid] += g_idata[i+get_local_size(0)]; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + for(unsigned int s=get_local_size(0)/2; s>0; s>>=1) + { + if (tid < s) + { + sdata[tid] += sdata[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // write result for this block to global mem + if (tid == 0) g_odata[get_group_id(0)] = sdata[0]; +} + +/* + This version unrolls the last warp to avoid synchronization where it + isn't needed +*/ +__kernel void reduce4(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata) +{ + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = get_local_id(0); + unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0); + + sdata[tid] = (i < n) ? g_idata[i] : 0; + if (i + get_local_size(0) < n) + sdata[tid] += g_idata[i+get_local_size(0)]; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + #pragma unroll 1 + for(unsigned int s=get_local_size(0)/2; s>32; s>>=1) + { + if (tid < s) + { + sdata[tid] += sdata[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (tid < 32) + { + if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; } + if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; } + if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; } + if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; } + if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; } + if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; } + } + + // write result for this block to global mem + if (tid == 0) g_odata[get_group_id(0)] = sdata[0]; +} + +/* + This version is completely unrolled. It uses a template parameter to achieve + optimal code for any (power of 2) number of threads. This requires a switch + statement in the host code to handle all the different thread block sizes at + compile time. +*/ +__kernel void reduce5(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata) +{ + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = get_local_id(0); + unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0); + + sdata[tid] = (i < n) ? g_idata[i] : 0; + if (i + blockSize < n) + sdata[tid] += g_idata[i+blockSize]; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); } + if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); } + if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); } + + if (tid < 32) + { + if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; } + if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; } + if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; } + if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; } + if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; } + if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; } + } + + // write result for this block to global mem + if (tid == 0) g_odata[get_group_id(0)] = sdata[0]; +} + +/* + This version adds multiple elements per thread sequentially. This reduces the overall + cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n). + (Brent's Theorem optimization) +*/ +__kernel void reduce6(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata) +{ + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = get_local_id(0); + unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0); + unsigned int gridSize = blockSize*2*get_num_groups(0); + sdata[tid] = 0; + + // we reduce multiple elements per thread. The number is determined by the + // number of active thread blocks (via gridDim). More blocks will result + // in a larger gridSize and therefore fewer elements per thread + while (i < n) + { + sdata[tid] += g_idata[i]; + // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays + if (nIsPow2 || i + blockSize < n) + sdata[tid] += g_idata[i+blockSize]; + i += gridSize; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); } + if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); } + if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); } + + if (tid < 32) + { + if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; } + if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; } + if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; } + if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; } + if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; } + if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; } + } + + // write result for this block to global mem + if (tid == 0) g_odata[get_group_id(0)] = sdata[0]; +} + +#endif // #ifndef _REDUCE_KERNEL_H_ diff --git a/benchmarks/opencl/reduce0/oclUtils.h b/benchmarks/opencl/reduce0/oclUtils.h new file mode 100644 index 00000000..2b109e18 --- /dev/null +++ b/benchmarks/opencl/reduce0/oclUtils.h @@ -0,0 +1,198 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +#ifndef OCL_UTILS_H +#define OCL_UTILS_H + +// ********************************************************************* +// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK +// ********************************************************************* + +// Common headers: Cross-API utililties and OpenCL header +#include + +// All OpenCL headers +#if defined (__APPLE__) || defined(MACOSX) + #include +#else + #include +#endif + +// Includes +#include +#include +#include + +// For systems with CL_EXT that are not updated with these extensions, we copied these +// extensions from +#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV + /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ + #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 + #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 + #define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 + #define CL_DEVICE_WARP_SIZE_NV 0x4003 + #define CL_DEVICE_GPU_OVERLAP_NV 0x4004 + #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 + #define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 +#endif + +// reminders for build output window and log +#ifdef _WIN32 + #pragma message ("Note: including shrUtils.h") + #pragma message ("Note: including opencl.h") +#endif + +// SDK Revision # +#define OCL_SDKREVISION "7027912" + +// Error and Exit Handling Macros... +// ********************************************************************* +// Full error handling macro with Cleanup() callback (if supplied)... +// (Companion Inline Function lower on page) +#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) + +// Short version without Cleanup() callback pointer +// Both Input (a) and Reference (b) are specified as args +#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) + +////////////////////////////////////////////////////////////////////////////// +//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0 +//! +//! @return the id +//! @param clSelectedPlatformID OpenCL platform ID +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID); + +////////////////////////////////////////////////////////////////////////////// +//! Print info about the device +//! +//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE +//! @param device OpenCL id of the device +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device); + +////////////////////////////////////////////////////////////////////////////// +//! Get and return device capability +//! +//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA +//! @param device OpenCL id of the device +////////////////////////////////////////////////////////////////////////////// +extern "C" int oclGetDevCap(cl_device_id device); + +////////////////////////////////////////////////////////////////////////////// +//! Print the device name +//! +//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE +//! @param device OpenCL id of the device +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclPrintDevName(int iLogMode, cl_device_id device); + +////////////////////////////////////////////////////////////////////////////// +//! Gets the id of the first device from the context +//! +//! @return the id +//! @param cxGPUContext OpenCL context +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext); + +////////////////////////////////////////////////////////////////////////////// +//! Gets the id of the nth device from the context +//! +//! @return the id or -1 when out of range +//! @param cxGPUContext OpenCL context +//! @param device_idx index of the device of interest +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx); + +////////////////////////////////////////////////////////////////////////////// +//! Gets the id of device with maximal FLOPS from the context +//! +//! @return the id +//! @param cxGPUContext OpenCL context +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext); + +////////////////////////////////////////////////////////////////////////////// +//! Loads a Program file and prepends the cPreamble to the code. +//! +//! @return the source string if succeeded, 0 otherwise +//! @param cFilename program filename +//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header +//! @param szFinalLength returned length of the code string +////////////////////////////////////////////////////////////////////////////// +extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength); + +////////////////////////////////////////////////////////////////////////////// +//! Get the binary (PTX) of the program associated with the device +//! +//! @param cpProgram OpenCL program +//! @param cdDevice device of interest +//! @param binary returned code +//! @param length length of returned code +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length); + +////////////////////////////////////////////////////////////////////////////// +//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device +//! +//! @param cpProgram OpenCL program +//! @param cdDevice device of interest +//! @param const char* cPtxFileName optional PTX file name +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName); + +////////////////////////////////////////////////////////////////////////////// +//! Get and log the Build Log from the OpenCL compiler for the requested program & device +//! +//! @param cpProgram OpenCL program +//! @param cdDevice device of interest +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice); + +// Helper function for De-allocating cl objects +// ********************************************************************* +extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs); + +// Helper function to get OpenCL error string from constant +// ********************************************************************* +extern "C" const char* oclErrorString(cl_int error); + +// Helper function to get OpenCL image format string (channel order and type) from constant +// ********************************************************************* +extern "C" const char* oclImageFormatString(cl_uint uiImageFormat); + +// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied) +// ********************************************************************* +inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine) +{ + // An error condition is defined by the sample/test value not equal to the reference + if (iReference != iSample) + { + // If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value + iSample = (iSample == 0) ? -9999 : iSample; + + // Log the error info + shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile); + + // Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code. + if (pCleanup != NULL) + { + pCleanup(iSample); + } + else + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n"); + exit(iSample); + } + } +} + +#endif \ No newline at end of file diff --git a/benchmarks/opencl/reduce0/shrQATest.h b/benchmarks/opencl/reduce0/shrQATest.h new file mode 100644 index 00000000..245cf8dc --- /dev/null +++ b/benchmarks/opencl/reduce0/shrQATest.h @@ -0,0 +1,238 @@ +/* +* Copyright 1993-2010 NVIDIA Corporation. All rights reserved. +* +* Please refer to the NVIDIA end user license agreement (EULA) associated +* with this source code for terms and conditions that govern your use of +* this software. Any use, reproduction, disclosure, or distribution of +* this software and related documentation outside the terms of the EULA +* is strictly prohibited. +* +*/ + +#ifndef SHR_QATEST_H +#define SHR_QATEST_H + +// ********************************************************************* +// Generic utilities for NVIDIA GPU Computing SDK +// ********************************************************************* + +// OS dependent includes +#ifdef _WIN32 + #pragma message ("Note: including windows.h") + #pragma message ("Note: including math.h") + #pragma message ("Note: including assert.h") + #pragma message ("Note: including time.h") + +// Headers needed for Windows + #include + #include +#else + // Headers needed for Linux + #include + #include + #include + #include + #include + #include + #include + #include + #include +#endif + +#ifndef STRCASECMP +#ifdef _WIN32 +#define STRCASECMP _stricmp +#else +#define STRCASECMP strcasecmp +#endif +#endif + +#ifndef STRNCASECMP +#ifdef _WIN32 +#define STRNCASECMP _strnicmp +#else +#define STRNCASECMP strncasecmp +#endif +#endif + + +// Standardized QA Start/Finish for CUDA SDK tests +#define shrQAStart(a, b) __shrQAStart(a, b) +#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c) +#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d) + +inline int findExeNameStart(const char *exec_name) +{ + int exename_start = (int)strlen(exec_name); + + while( (exename_start > 0) && + (exec_name[exename_start] != '\\') && + (exec_name[exename_start] != '/') ) + { + exename_start--; + } + if (exec_name[exename_start] == '\\' || + exec_name[exename_start] == '/') + { + return exename_start+1; + } else { + return exename_start; + } +} + +inline int __shrQAStart(int argc, char **argv) +{ + bool bQATest = false; + // First clear the output buffer + fflush(stdout); + fflush(stdout); + + for (int i=1; i < argc; i++) { + int string_start = 0; + while (argv[i][string_start] == '-') + string_start++; + char *string_argv = &argv[i][string_start]; + + if (!STRCASECMP(string_argv, "qatest")) { + bQATest = true; + } + } + + // We don't want to print the entire path, so we search for the first + int exename_start = findExeNameStart(argv[0]); + if (bQATest) { + fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start])); + for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]); + fprintf(stdout, "\n"); + } else { + fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start])); + } + fflush(stdout); + printf("\n"); fflush(stdout); + return exename_start; +} + +enum eQAstatus { + QA_FAILED = 0, + QA_PASSED = 1, + QA_WAIVED = 2 +}; + +inline void __ExitInTime(int seconds) +{ + fprintf(stdout, "> exiting in %d seconds: ", seconds); + fflush(stdout); + time_t t; + int count; + for (t=time(0)+seconds, count=seconds; time(0) < t; count--) { + fprintf(stdout, "%d...", count); +#ifdef WIN32 + Sleep(1000); +#else + sleep(1); +#endif + } + fprintf(stdout,"done!\n\n"); + fflush(stdout); +} + + +inline void __shrQAFinish(int argc, const char **argv, int iStatus) +{ + // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() ) + bool bQATest = false, bNoPrompt = true, bQuitInTime = true; + const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL }; + + for (int i=1; i < argc; i++) { + int string_start = 0; + while (argv[i][string_start] == '-') + string_start++; + + const char *string_argv = &argv[i][string_start]; + if (!STRCASECMP(string_argv, "qatest")) { + bQATest = true; + } + // For SDK individual samples that don't specify -noprompt or -prompt, + // a 3 second delay will happen before exiting, giving a user time to view results + if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) { + bNoPrompt = true; + bQuitInTime = false; + } + if (!STRCASECMP(string_argv, "prompt")) { + bNoPrompt = false; + bQuitInTime = false; + } + } + + int exename_start = findExeNameStart(argv[0]); + if (bQATest) { + fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start])); + for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]); + fprintf(stdout, "\n"); + } else { + fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]); + } + fflush(stdout); + printf("\n"); fflush(stdout); + if (bQuitInTime) { + __ExitInTime(3); + } else { + if (!bNoPrompt) { + fprintf(stdout, "\nPress to exit...\n"); + fflush(stdout); + getchar(); + } + } +} + +inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus) +{ + bool bQuitInTime = true; + const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL }; + + for (int i=1; i < argc; i++) { + int string_start = 0; + while (argv[i][string_start] == '-') + string_start++; + + const char *string_argv = &argv[i][string_start]; + // For SDK individual samples that don't specify -noprompt or -prompt, + // a 3 second delay will happen before exiting, giving a user time to view results + if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) { + bQuitInTime = false; + } + if (!STRCASECMP(string_argv, "prompt")) { + bQuitInTime = false; + } + } + + int exename_start = findExeNameStart(argv[0]); + if (bQATest) { + fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start])); + for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]); + fprintf(stdout, "\n"); + } else { + fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]); + } + fflush(stdout); + + if (bQuitInTime) { + __ExitInTime(3); + } +} + +inline void shrQAFinishExit(int argc, const char **argv, int iStatus) +{ + __shrQAFinish(argc, argv, iStatus); + + exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE); +} + +inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus) +{ + __shrQAFinish2(bQAtest, argc, argv, iStatus); + + exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE); +} + +#endif \ No newline at end of file diff --git a/benchmarks/opencl/reduce0/shrUtils.h b/benchmarks/opencl/reduce0/shrUtils.h new file mode 100644 index 00000000..0f2795d4 --- /dev/null +++ b/benchmarks/opencl/reduce0/shrUtils.h @@ -0,0 +1,642 @@ +/* +* Copyright 1993-2010 NVIDIA Corporation. All rights reserved. +* +* Please refer to the NVIDIA end user license agreement (EULA) associated +* with this source code for terms and conditions that govern your use of +* this software. Any use, reproduction, disclosure, or distribution of +* this software and related documentation outside the terms of the EULA +* is strictly prohibited. +* +*/ + +#ifndef SHR_UTILS_H +#define SHR_UTILS_H + +// ********************************************************************* +// Generic utilities for NVIDIA GPU Computing SDK +// ********************************************************************* + +// reminders for output window and build log +#ifdef _WIN32 + #pragma message ("Note: including windows.h") + #pragma message ("Note: including math.h") + #pragma message ("Note: including assert.h") +#endif + +// OS dependent includes +#ifdef _WIN32 + // Headers needed for Windows + #include +#else + // Headers needed for Linux + #include + #include + #include + #include + #include + #include + #include +#endif + +// Other headers needed for both Windows and Linux +#include +#include +#include +#include +#include + +// Un-comment the following #define to enable profiling code in SDK apps +//#define GPU_PROFILING + +// Beginning of GPU Architecture definitions +inline int ConvertSMVer2Cores(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = + { { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class + { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class + { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class + { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class + { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class + { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class + { 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class + { -1, -1 } + }; + + int index = 0; + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) { + return nGpuArchCoresPerSM[index].Cores; + } + index++; + } + printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor); + return -1; +} +// end of GPU Architecture definitions + + +// Defines and enum for use with logging functions +// ********************************************************************* +#define DEFAULTLOGFILE "SdkConsoleLog.txt" +#define MASTERLOGFILE "SdkMasterLog.csv" +enum LOGMODES +{ + LOGCONSOLE = 1, // bit to signal "log to console" + LOGFILE = 2, // bit to signal "log to file" + LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both" + APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open + MASTER = 8, // bit to signal master .csv log output + ERRORMSG = 16, // bit to signal "pre-pend Error" + CLOSELOG = 32 // bit to close log file, if open, after any requested file write +}; +#define HDASHLINE "-----------------------------------------------------------\n" + +// Standardized boolean +enum shrBOOL +{ + shrFALSE = 0, + shrTRUE = 1 +}; + +// Standardized MAX, MIN and CLAMP +#define MAX(a, b) ((a > b) ? a : b) +#define MIN(a, b) ((a < b) ? a : b) +#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip of input a +#define TOPCLAMP(a, b) (a < b ? a:b) // single top side clip of input a + +// Error and Exit Handling Macros... +// ********************************************************************* +// Full error handling macro with Cleanup() callback (if supplied)... +// (Companion Inline Function lower on page) +#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) + +// Short version without Cleanup() callback pointer +// Both Input (a) and Reference (b) are specified as args +#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) + +// Standardized Exit Macro for leaving main()... extended version +// (Companion Inline Function lower on page) +#define shrExitEX(a, b, c) __shrExitEX(a, b, c) + +// Standardized Exit Macro for leaving main()... short version +// (Companion Inline Function lower on page) +#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS) + +// Simple argument checker macro +#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE + +// Define for user-customized error handling +#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__ + +// Function to deallocate memory allocated within shrUtils +// ********************************************************************* +extern "C" void shrFree(void* ptr); + +// ********************************************************************* +// Helper function to log standardized information to Console, to File or to both +//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); +//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); +//! +//! Automatically opens file and stores handle if needed and not done yet +//! Closes file and nulls handle on request +//! +//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG. +//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode +//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close" +//! First 3 options may be | 'd with MASTER to enable independent write to master data log file +//! First 3 options may be | 'd with ERRORMSG to start line with standard error message +//! @param 2 dValue: +//! Positive val = double value for time in secs to be formatted to 6 decimals. +//! Negative val is an error code and this give error preformatting. +//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf. +//! ALL printf flags, width, precision and type specifiers are supported with this exception: +//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported +//! Single byte char type specifiers (%s and %c) ARE supported +//! @param 4... variable args: like printf or fprintf. Must match format specifer type above. +//! @return 0 if OK, negative value on error or if error occurs or was passed in. +// ********************************************************************* +extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...); + +// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, +// ********************************************************************* +extern "C" int shrLog(const char* cFormatString, ...); + +// ********************************************************************* +// Delta timer function for up to 3 independent timers using host high performance counters +// Maintains state for 3 independent counters +//! Example: double dElapsedTime = shrDeltaTime(0); +//! +//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2) +//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error +// ********************************************************************* +extern "C" double shrDeltaT(int iCounterID); + +// Optional LogFileNameOverride function +// ********************************************************************* +extern "C" void shrSetLogFileName (const char* cOverRideName); + +// Helper function to init data arrays +// ********************************************************************* +extern "C" void shrFillArray(float* pfData, int iSize); + +// Helper function to print data arrays +// ********************************************************************* +extern "C" void shrPrintArray(float* pfData, int iSize); + +//////////////////////////////////////////////////////////////////////////// +//! Find the path for a filename +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! @param executablePath optional absolute path of the executable +//////////////////////////////////////////////////////////////////////////// +extern "C" char* shrFindFilePath(const char* filename, const char* executablePath); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing single precision floating point data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing double precision floating point data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing integer data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing unsigned integer data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, + unsigned int* len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing char / byte data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing unsigned char / byte data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, + unsigned int* len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing single precision floating point +//! data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len, + const float epsilon, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing double precision floating point +//! data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len, + const double epsilon, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing integer data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing unsigned integer data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, + unsigned int len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing char / byte data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing unsigned char / byte data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data, + unsigned int len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Load PPM image file (with unsigned char as data element type), padding +//! 4th component +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param OutData handle to the data read +//! @param w width of the image +//! @param h height of the image +//! +//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, + unsigned int *w, unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +//! Save PPM image file (with unsigned char as data element type, padded to +//! 4 bytes) +//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, + unsigned int w, unsigned int h); + +//////////////////////////////////////////////////////////////////////////////// +//! Save PGM image file (with unsigned char as data element type) +//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, + unsigned int w, unsigned int h); + +//////////////////////////////////////////////////////////////////////////// +//! Load PGM image file (with unsigned char as data element type) +//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data, + unsigned int *w,unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +// Command line arguments: General notes +// * All command line arguments begin with '--' followed by the token; +// token and value are seperated by '='; example --samples=50 +// * Arrays have the form --model=[one.obj,two.obj,three.obj] +// (without whitespaces) +//////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////// +//! Check if command line argument \a flag-name is given +//! @return shrTRUE if command line argument \a flag_name has been given, +//! otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param flag_name name of command line flag +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, + const char* flag_name); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type int +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, + const char* arg_name, int* val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type unsigned int +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, + const char* arg_name, unsigned int* val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type float +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, + const char* arg_name, float* val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type string +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, + const char* arg_name, char** val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument list those element are strings +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val command line argument list +//! @param len length of the list / number of elements +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, + const char* arg_name, char** val, + unsigned int* len); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparef( const float* reference, const float* data, + const unsigned int len); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two integer arrays +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparei( const int* reference, const int* data, + const unsigned int len ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two unsigned integer arrays, with epsilon and threshold +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param threshold tolerance % # of comparison errors (0.15f = 15%) +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data, + const unsigned int len, const float epsilon, const float threshold ); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two unsigned char arrays +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data, + const unsigned int len ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two integers with a tolernance for # of byte errors +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//! @param threshold tolerance % # of comparison errors (0.15f = 15%) +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data, + const unsigned int len, const float epsilon, const float threshold ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two integer arrays witha n epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data, + const unsigned int len, const float epsilon ); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays with an epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparefe( const float* reference, const float* data, + const unsigned int len, const float epsilon ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays with an epsilon tolerance for equality and a +//! threshold for # pixel errors +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparefet( const float* reference, const float* data, + const unsigned int len, const float epsilon, const float threshold ); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays using L2-norm with an epsilon tolerance for +//! equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data, + const unsigned int len, const float epsilon ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two PPM image files with an epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param src_file filename for the image to be compared +//! @param data filename for the reference data / gold image +//! @param epsilon epsilon to use for the comparison +//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass) +//! $param verboseErrors output details of image mismatch to std::err +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two PGM image files with an epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param src_file filename for the image to be compared +//! @param data filename for the reference data / gold image +//! @param epsilon epsilon to use for the comparison +//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass) +//! $param verboseErrors output details of image mismatch to std::err +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold); + +extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size); + +extern "C" size_t shrRoundUp(int group_size, int global_size); + +// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied) +// ********************************************************************* +inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine) +{ + if (iReference != iSample) + { + shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile); + if (pCleanup != NULL) + { + pCleanup(EXIT_FAILURE); + } + else + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n"); + exit(EXIT_FAILURE); + } + } +} + +// Standardized Exit +// ********************************************************************* +inline void __shrExitEX(int argc, const char** argv, int iExitCode) +{ +#ifdef WIN32 + if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) +#else + if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) +#endif + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress to Quit...\n"); + getchar(); + } + else + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]); + } + fflush(stderr); + exit(iExitCode); +} + +#endif \ No newline at end of file