Vortex 2.0 changes:

+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions
--- a/tests/opencl/transpose/kernel.cl
+++ b/tests/opencl/transpose/kernel.cl
@@ -0,0 +1,108 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/* Matrix transpose with OpenCL
+* Device code.
+*/
+
+#define BLOCK_DIM 16
+
+// This kernel is optimized to ensure all global reads and writes are coalesced,
+// and to avoid bank conflicts in shared memory.  This kernel is up to 11x faster
+// than the naive kernel below.  Note that the shared memory array is sized to 
+// (BLOCK_DIM+1)*BLOCK_DIM.  This pads each row of the 2D block in shared memory 
+// so that bank conflicts do not occur when threads address the array column-wise.
+__kernel void transpose(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
+{
+	// read the matrix tile into shared memory
+	unsigned int xIndex = get_global_id(0);
+	unsigned int yIndex = get_global_id(1);
+
+	if((xIndex + offset < width) && (yIndex < height))
+	{
+		unsigned int index_in = yIndex * width + xIndex + offset;
+		block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// write the transposed matrix tile to global memory
+	xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0);
+	yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1);
+	if((xIndex < height) && (yIndex + offset < width))
+    {
+		unsigned int index_out = yIndex * height + xIndex;
+		odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)];
+	}
+}
+
+
+
+// This naive transpose kernel suffers from completely non-coalesced writes.
+// It can be up to 10x slower than the kernel above for large matrices.
+__kernel void transpose_naive(__global float *odata, __global float* idata, int offset, int width, int height)
+{
+    unsigned int xIndex = get_global_id(0);
+    unsigned int yIndex = get_global_id(1);
+    
+    if (xIndex + offset < width && yIndex < height)
+    {
+        unsigned int index_in  = xIndex + offset + width * yIndex;
+        unsigned int index_out = yIndex + height * xIndex;
+        odata[index_out] = idata[index_in]; 
+    }
+}
+
+
+__kernel void simple_copy(__global float *odata, __global float* idata, int offset, int width, int height)
+{
+    unsigned int xIndex = get_global_id(0);
+    unsigned int yIndex = get_global_id(1);
+    
+    if (xIndex + offset < width && yIndex < height)
+    {
+        unsigned int index_in  = xIndex + offset + width * yIndex;
+        odata[index_in] = idata[index_in]; 
+    }
+}
+
+__kernel void shared_copy(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
+{
+	// read the matrix tile into shared memory
+	unsigned int xIndex = get_global_id(0);
+	unsigned int yIndex = get_global_id(1);
+
+    unsigned int index_in = yIndex * width + xIndex + offset;
+	if((xIndex + offset< width) && (yIndex < height))
+	{
+		block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if((xIndex < height) && (yIndex+ offset < width))
+    {
+		odata[index_in] = block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)];
+	}
+}
+
+
+__kernel void uncoalesced_copy(__global float *odata, __global float* idata, int offset, int width, int height)
+{
+    unsigned int xIndex = get_global_id(0);
+    unsigned int yIndex = get_global_id(1);
+    
+    if (xIndex + offset < width && yIndex < height)
+    {
+        unsigned int index_in  = yIndex + height * (xIndex+ offset);
+        odata[index_in] = idata[index_in]; 
+    }
+}