merge fixes
This commit is contained in:
41
benchmarks/opencl/VectorHypot/VectorHypot.cl
Normal file
41
benchmarks/opencl/VectorHypot/VectorHypot.cl
Normal file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// OpenCL Kernel Function Naive Implementation for hyptenuse
|
||||
__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
|
||||
{
|
||||
// get index into global data array
|
||||
size_t szGlobalOffset = get_global_id(0) + uiOffset;
|
||||
|
||||
// bound check
|
||||
if (szGlobalOffset >= uiNumElements)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Processing 4 elements per work item, so read fgA and fgB source values from GMEM
|
||||
float4 f4A = fg4A[szGlobalOffset];
|
||||
float4 f4B = fg4B[szGlobalOffset];
|
||||
float4 f4H = (float4)0.0f;
|
||||
|
||||
// Get the hypotenuses the vectors of 'legs', but exaggerate the time needed with loop
|
||||
for (int i = 0; i < iInnerLoopCount; i++)
|
||||
{
|
||||
// compute the 4 hypotenuses using built-in function
|
||||
f4H.x = hypot (f4A.x, f4B.x);
|
||||
f4H.y = hypot (f4A.y, f4B.y);
|
||||
f4H.z = hypot (f4A.z, f4B.z);
|
||||
f4H.w = hypot (f4A.w, f4B.w);
|
||||
}
|
||||
|
||||
// Write 4 result values back out to GMEM
|
||||
fg4Hypot[szGlobalOffset] = f4H;
|
||||
}
|
||||
Reference in New Issue
Block a user