Add files via upload
This commit is contained in:
committed by
GitHub Enterprise
parent
aab920b1a2
commit
925b387990
273
benchmarks/opencl/reduce0/oclReduction_kernel.cl
Normal file
273
benchmarks/opencl/reduce0/oclReduction_kernel.cl
Normal file
@@ -0,0 +1,273 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
Parallel reduction kernels
|
||||
*/
|
||||
|
||||
// The following defines are set during runtime compilation, see reduction.cpp
|
||||
// #define T float
|
||||
// #define blockSize 128
|
||||
// #define nIsPow2 1
|
||||
|
||||
#ifndef _REDUCE_KERNEL_H_
|
||||
#define _REDUCE_KERNEL_H_
|
||||
|
||||
/*
|
||||
Parallel sum reduction using shared memory
|
||||
- takes log(n) steps for n input elements
|
||||
- uses n threads
|
||||
- only works for power-of-2 arrays
|
||||
*/
|
||||
|
||||
/* This reduction interleaves which threads are active by using the modulo
|
||||
operator. This operator is very expensive on GPUs, and the interleaved
|
||||
inactivity means that no whole warps are active, which is also very
|
||||
inefficient */
|
||||
__kernel void reduce0(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
|
||||
{
|
||||
// load shared mem
|
||||
unsigned int tid = get_local_id(0);
|
||||
unsigned int i = get_global_id(0);
|
||||
|
||||
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// do reduction in shared mem
|
||||
for(unsigned int s=1; s < get_local_size(0); s *= 2) {
|
||||
// modulo arithmetic is slow!
|
||||
if ((tid % (2*s)) == 0) {
|
||||
sdata[tid] += sdata[tid + s];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// write result for this block to global mem
|
||||
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
||||
}
|
||||
|
||||
|
||||
/* This version uses contiguous threads, but its interleaved
|
||||
addressing results in many shared memory bank conflicts. */
|
||||
__kernel void reduce1(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
|
||||
{
|
||||
// load shared mem
|
||||
unsigned int tid = get_local_id(0);
|
||||
unsigned int i = get_global_id(0);
|
||||
|
||||
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// do reduction in shared mem
|
||||
for(unsigned int s=1; s < get_local_size(0); s *= 2)
|
||||
{
|
||||
int index = 2 * s * tid;
|
||||
|
||||
if (index < get_local_size(0))
|
||||
{
|
||||
sdata[index] += sdata[index + s];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// write result for this block to global mem
|
||||
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
||||
}
|
||||
|
||||
/*
|
||||
This version uses sequential addressing -- no divergence or bank conflicts.
|
||||
*/
|
||||
__kernel void reduce2(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
|
||||
{
|
||||
// load shared mem
|
||||
unsigned int tid = get_local_id(0);
|
||||
unsigned int i = get_global_id(0);
|
||||
|
||||
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// do reduction in shared mem
|
||||
for(unsigned int s=get_local_size(0)/2; s>0; s>>=1)
|
||||
{
|
||||
if (tid < s)
|
||||
{
|
||||
sdata[tid] += sdata[tid + s];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// write result for this block to global mem
|
||||
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
||||
}
|
||||
|
||||
/*
|
||||
This version uses n/2 threads --
|
||||
it performs the first level of reduction when reading from global memory
|
||||
*/
|
||||
__kernel void reduce3(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
|
||||
{
|
||||
// perform first level of reduction,
|
||||
// reading from global memory, writing to shared memory
|
||||
unsigned int tid = get_local_id(0);
|
||||
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
|
||||
|
||||
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
||||
if (i + get_local_size(0) < n)
|
||||
sdata[tid] += g_idata[i+get_local_size(0)];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// do reduction in shared mem
|
||||
for(unsigned int s=get_local_size(0)/2; s>0; s>>=1)
|
||||
{
|
||||
if (tid < s)
|
||||
{
|
||||
sdata[tid] += sdata[tid + s];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// write result for this block to global mem
|
||||
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
||||
}
|
||||
|
||||
/*
|
||||
This version unrolls the last warp to avoid synchronization where it
|
||||
isn't needed
|
||||
*/
|
||||
__kernel void reduce4(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
|
||||
{
|
||||
// perform first level of reduction,
|
||||
// reading from global memory, writing to shared memory
|
||||
unsigned int tid = get_local_id(0);
|
||||
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
|
||||
|
||||
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
||||
if (i + get_local_size(0) < n)
|
||||
sdata[tid] += g_idata[i+get_local_size(0)];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// do reduction in shared mem
|
||||
#pragma unroll 1
|
||||
for(unsigned int s=get_local_size(0)/2; s>32; s>>=1)
|
||||
{
|
||||
if (tid < s)
|
||||
{
|
||||
sdata[tid] += sdata[tid + s];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
|
||||
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
|
||||
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; }
|
||||
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; }
|
||||
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; }
|
||||
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; }
|
||||
}
|
||||
|
||||
// write result for this block to global mem
|
||||
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
||||
}
|
||||
|
||||
/*
|
||||
This version is completely unrolled. It uses a template parameter to achieve
|
||||
optimal code for any (power of 2) number of threads. This requires a switch
|
||||
statement in the host code to handle all the different thread block sizes at
|
||||
compile time.
|
||||
*/
|
||||
__kernel void reduce5(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
|
||||
{
|
||||
// perform first level of reduction,
|
||||
// reading from global memory, writing to shared memory
|
||||
unsigned int tid = get_local_id(0);
|
||||
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
|
||||
|
||||
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
||||
if (i + blockSize < n)
|
||||
sdata[tid] += g_idata[i+blockSize];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// do reduction in shared mem
|
||||
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
|
||||
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
|
||||
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; }
|
||||
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; }
|
||||
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; }
|
||||
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; }
|
||||
}
|
||||
|
||||
// write result for this block to global mem
|
||||
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
||||
}
|
||||
|
||||
/*
|
||||
This version adds multiple elements per thread sequentially. This reduces the overall
|
||||
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
|
||||
(Brent's Theorem optimization)
|
||||
*/
|
||||
__kernel void reduce6(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
|
||||
{
|
||||
// perform first level of reduction,
|
||||
// reading from global memory, writing to shared memory
|
||||
unsigned int tid = get_local_id(0);
|
||||
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
|
||||
unsigned int gridSize = blockSize*2*get_num_groups(0);
|
||||
sdata[tid] = 0;
|
||||
|
||||
// we reduce multiple elements per thread. The number is determined by the
|
||||
// number of active thread blocks (via gridDim). More blocks will result
|
||||
// in a larger gridSize and therefore fewer elements per thread
|
||||
while (i < n)
|
||||
{
|
||||
sdata[tid] += g_idata[i];
|
||||
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
|
||||
if (nIsPow2 || i + blockSize < n)
|
||||
sdata[tid] += g_idata[i+blockSize];
|
||||
i += gridSize;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// do reduction in shared mem
|
||||
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
|
||||
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
|
||||
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; }
|
||||
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; }
|
||||
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; }
|
||||
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; }
|
||||
}
|
||||
|
||||
// write result for this block to global mem
|
||||
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
||||
}
|
||||
|
||||
#endif // #ifndef _REDUCE_KERNEL_H_
|
||||
Reference in New Issue
Block a user