Add files via upload

2019-11-25 01:38:35 -05:00
parent aab920b1a2
commit 925b387990
6 changed files with 2023 additions and 0 deletions
--- a/benchmarks/opencl/reduce0/oclReduction_kernel.cl
+++ b/benchmarks/opencl/reduce0/oclReduction_kernel.cl
@@ -0,0 +1,273 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/*
+    Parallel reduction kernels
+*/
+
+// The following defines are set during runtime compilation, see reduction.cpp
+// #define T float
+// #define blockSize 128
+// #define nIsPow2 1
+
+#ifndef _REDUCE_KERNEL_H_
+#define _REDUCE_KERNEL_H_
+
+/*
+    Parallel sum reduction using shared memory
+    - takes log(n) steps for n input elements
+    - uses n threads
+    - only works for power-of-2 arrays
+*/
+
+/* This reduction interleaves which threads are active by using the modulo
+   operator.  This operator is very expensive on GPUs, and the interleaved 
+   inactivity means that no whole warps are active, which is also very 
+   inefficient */
+__kernel void reduce0(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    // load shared mem
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_global_id(0);
+    
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    for(unsigned int s=1; s < get_local_size(0); s *= 2) {
+        // modulo arithmetic is slow!
+        if ((tid % (2*s)) == 0) {
+            sdata[tid] += sdata[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // write result for this block to global mem
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+
+/* This version uses contiguous threads, but its interleaved 
+   addressing results in many shared memory bank conflicts. */
+__kernel void reduce1(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    // load shared mem
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_global_id(0);
+    
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    for(unsigned int s=1; s < get_local_size(0); s *= 2) 
+    {
+        int index = 2 * s * tid;
+
+        if (index < get_local_size(0)) 
+        {
+            sdata[index] += sdata[index + s];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // write result for this block to global mem
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version uses sequential addressing -- no divergence or bank conflicts.
+*/
+__kernel void reduce2(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    // load shared mem
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_global_id(0);
+    
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    for(unsigned int s=get_local_size(0)/2; s>0; s>>=1) 
+    {
+        if (tid < s) 
+        {
+            sdata[tid] += sdata[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // write result for this block to global mem
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version uses n/2 threads --
+    it performs the first level of reduction when reading from global memory
+*/
+__kernel void reduce3(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    if (i + get_local_size(0) < n) 
+        sdata[tid] += g_idata[i+get_local_size(0)];  
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    for(unsigned int s=get_local_size(0)/2; s>0; s>>=1) 
+    {
+        if (tid < s) 
+        {
+            sdata[tid] += sdata[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // write result for this block to global mem 
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version unrolls the last warp to avoid synchronization where it 
+    isn't needed
+*/
+__kernel void reduce4(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
+{
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    if (i + get_local_size(0) < n) 
+        sdata[tid] += g_idata[i+get_local_size(0)];  
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    #pragma unroll 1
+    for(unsigned int s=get_local_size(0)/2; s>32; s>>=1) 
+    {
+        if (tid < s) 
+        {
+            sdata[tid] += sdata[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (tid < 32)
+    {
+        if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; }
+        if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; }
+        if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; }
+        if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; }
+        if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; }
+        if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; }
+    }
+
+    // write result for this block to global mem 
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version is completely unrolled.  It uses a template parameter to achieve 
+    optimal code for any (power of 2) number of threads.  This requires a switch 
+    statement in the host code to handle all the different thread block sizes at 
+    compile time.
+*/
+__kernel void reduce5(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
+{
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    if (i + blockSize < n) 
+        sdata[tid] += g_idata[i+blockSize];  
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    
+    if (tid < 32)
+    {
+        if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; }
+        if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; }
+        if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; }
+        if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; }
+        if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; }
+        if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; }
+    }
+    
+    // write result for this block to global mem 
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version adds multiple elements per thread sequentially.  This reduces the overall
+    cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
+    (Brent's Theorem optimization)
+*/
+__kernel void reduce6(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
+{
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+    unsigned int gridSize = blockSize*2*get_num_groups(0);
+    sdata[tid] = 0;
+
+    // we reduce multiple elements per thread.  The number is determined by the 
+    // number of active thread blocks (via gridDim).  More blocks will result
+    // in a larger gridSize and therefore fewer elements per thread
+    while (i < n)
+    {         
+        sdata[tid] += g_idata[i];
+        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
+        if (nIsPow2 || i + blockSize < n) 
+            sdata[tid] += g_idata[i+blockSize];  
+        i += gridSize;
+    } 
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    
+    if (tid < 32)
+    {
+        if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; }
+        if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; }
+        if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; }
+        if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; }
+        if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; }
+        if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; }
+    }
+    
+    // write result for this block to global mem 
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+#endif // #ifndef _REDUCE_KERNEL_H_