Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes
This commit is contained in:
108
tests/opencl/transpose/kernel.cl
Normal file
108
tests/opencl/transpose/kernel.cl
Normal file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* Matrix transpose with OpenCL
|
||||
* Device code.
|
||||
*/
|
||||
|
||||
#define BLOCK_DIM 16
|
||||
|
||||
// This kernel is optimized to ensure all global reads and writes are coalesced,
|
||||
// and to avoid bank conflicts in shared memory. This kernel is up to 11x faster
|
||||
// than the naive kernel below. Note that the shared memory array is sized to
|
||||
// (BLOCK_DIM+1)*BLOCK_DIM. This pads each row of the 2D block in shared memory
|
||||
// so that bank conflicts do not occur when threads address the array column-wise.
|
||||
__kernel void transpose(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
|
||||
{
|
||||
// read the matrix tile into shared memory
|
||||
unsigned int xIndex = get_global_id(0);
|
||||
unsigned int yIndex = get_global_id(1);
|
||||
|
||||
if((xIndex + offset < width) && (yIndex < height))
|
||||
{
|
||||
unsigned int index_in = yIndex * width + xIndex + offset;
|
||||
block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// write the transposed matrix tile to global memory
|
||||
xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0);
|
||||
yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1);
|
||||
if((xIndex < height) && (yIndex + offset < width))
|
||||
{
|
||||
unsigned int index_out = yIndex * height + xIndex;
|
||||
odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// This naive transpose kernel suffers from completely non-coalesced writes.
|
||||
// It can be up to 10x slower than the kernel above for large matrices.
|
||||
__kernel void transpose_naive(__global float *odata, __global float* idata, int offset, int width, int height)
|
||||
{
|
||||
unsigned int xIndex = get_global_id(0);
|
||||
unsigned int yIndex = get_global_id(1);
|
||||
|
||||
if (xIndex + offset < width && yIndex < height)
|
||||
{
|
||||
unsigned int index_in = xIndex + offset + width * yIndex;
|
||||
unsigned int index_out = yIndex + height * xIndex;
|
||||
odata[index_out] = idata[index_in];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void simple_copy(__global float *odata, __global float* idata, int offset, int width, int height)
|
||||
{
|
||||
unsigned int xIndex = get_global_id(0);
|
||||
unsigned int yIndex = get_global_id(1);
|
||||
|
||||
if (xIndex + offset < width && yIndex < height)
|
||||
{
|
||||
unsigned int index_in = xIndex + offset + width * yIndex;
|
||||
odata[index_in] = idata[index_in];
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void shared_copy(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
|
||||
{
|
||||
// read the matrix tile into shared memory
|
||||
unsigned int xIndex = get_global_id(0);
|
||||
unsigned int yIndex = get_global_id(1);
|
||||
|
||||
unsigned int index_in = yIndex * width + xIndex + offset;
|
||||
if((xIndex + offset< width) && (yIndex < height))
|
||||
{
|
||||
block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if((xIndex < height) && (yIndex+ offset < width))
|
||||
{
|
||||
odata[index_in] = block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void uncoalesced_copy(__global float *odata, __global float* idata, int offset, int width, int height)
|
||||
{
|
||||
unsigned int xIndex = get_global_id(0);
|
||||
unsigned int yIndex = get_global_id(1);
|
||||
|
||||
if (xIndex + offset < width && yIndex < height)
|
||||
{
|
||||
unsigned int index_in = yIndex + height * (xIndex+ offset);
|
||||
odata[index_in] = idata[index_in];
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user