Files
vortex/tests/opencl/transpose/kernel.cl
Blaise Tine c1e168fdbe Vortex 2.0 changes:
+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactory

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor udpate

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
2023-11-10 02:47:05 -08:00

109 lines
3.6 KiB
Common Lisp

/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Matrix transpose with OpenCL
* Device code.
*/
#define BLOCK_DIM 16
// This kernel is optimized to ensure all global reads and writes are coalesced,
// and to avoid bank conflicts in shared memory. This kernel is up to 11x faster
// than the naive kernel below. Note that the shared memory array is sized to
// (BLOCK_DIM+1)*BLOCK_DIM. This pads each row of the 2D block in shared memory
// so that bank conflicts do not occur when threads address the array column-wise.
__kernel void transpose(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
{
// read the matrix tile into shared memory
unsigned int xIndex = get_global_id(0);
unsigned int yIndex = get_global_id(1);
if((xIndex + offset < width) && (yIndex < height))
{
unsigned int index_in = yIndex * width + xIndex + offset;
block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
}
barrier(CLK_LOCAL_MEM_FENCE);
// write the transposed matrix tile to global memory
xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0);
yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1);
if((xIndex < height) && (yIndex + offset < width))
{
unsigned int index_out = yIndex * height + xIndex;
odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)];
}
}
// This naive transpose kernel suffers from completely non-coalesced writes.
// It can be up to 10x slower than the kernel above for large matrices.
__kernel void transpose_naive(__global float *odata, __global float* idata, int offset, int width, int height)
{
unsigned int xIndex = get_global_id(0);
unsigned int yIndex = get_global_id(1);
if (xIndex + offset < width && yIndex < height)
{
unsigned int index_in = xIndex + offset + width * yIndex;
unsigned int index_out = yIndex + height * xIndex;
odata[index_out] = idata[index_in];
}
}
__kernel void simple_copy(__global float *odata, __global float* idata, int offset, int width, int height)
{
unsigned int xIndex = get_global_id(0);
unsigned int yIndex = get_global_id(1);
if (xIndex + offset < width && yIndex < height)
{
unsigned int index_in = xIndex + offset + width * yIndex;
odata[index_in] = idata[index_in];
}
}
__kernel void shared_copy(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
{
// read the matrix tile into shared memory
unsigned int xIndex = get_global_id(0);
unsigned int yIndex = get_global_id(1);
unsigned int index_in = yIndex * width + xIndex + offset;
if((xIndex + offset< width) && (yIndex < height))
{
block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
}
barrier(CLK_LOCAL_MEM_FENCE);
if((xIndex < height) && (yIndex+ offset < width))
{
odata[index_in] = block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)];
}
}
__kernel void uncoalesced_copy(__global float *odata, __global float* idata, int offset, int width, int height)
{
unsigned int xIndex = get_global_id(0);
unsigned int yIndex = get_global_id(1);
if (xIndex + offset < width && yIndex < height)
{
unsigned int index_in = yIndex + height * (xIndex+ offset);
odata[index_in] = idata[index_in];
}
}