two more benchmarks
This commit is contained in:
44
benchmarks/new_opencl/convolution/Makefile
Normal file
44
benchmarks/new_opencl/convolution/Makefile
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops)
|
||||||
|
POCLCC_PATH ?= $(wildcard ~/dev/pocl/drops_vortex_cc)
|
||||||
|
POCLRT_PATH ?= $(wildcard ..)
|
||||||
|
DRIVER_PATH ?= $(wildcard ../../../driver/sw)
|
||||||
|
|
||||||
|
CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
|
||||||
|
|
||||||
|
CXXFLAGS += -I$(POCLRT_PATH)/include
|
||||||
|
|
||||||
|
LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex
|
||||||
|
|
||||||
|
PROJECT = convolution
|
||||||
|
|
||||||
|
SRCS = main.cpp utils.cpp
|
||||||
|
|
||||||
|
all: $(PROJECT)
|
||||||
|
|
||||||
|
kernel.pocl: kernel.cl
|
||||||
|
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCLCC_PATH)/lib:$(DRIVER_PATH)/simx $(POCLCC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
|
||||||
|
|
||||||
|
$(PROJECT): $(SRCS)
|
||||||
|
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||||
|
|
||||||
|
run-fpga: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||||
|
|
||||||
|
run-ase: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||||
|
|
||||||
|
run-simx: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||||
|
|
||||||
|
run-rtlsim: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||||
|
|
||||||
|
.depend: $(SRCS)
|
||||||
|
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf $(PROJECT) *.o *.dump .depend
|
||||||
|
|
||||||
|
ifneq ($(MAKECMDGOALS),clean)
|
||||||
|
-include .depend
|
||||||
|
endif
|
||||||
BIN
benchmarks/new_opencl/convolution/input.bmp
Normal file
BIN
benchmarks/new_opencl/convolution/input.bmp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 44 KiB |
54
benchmarks/new_opencl/convolution/kernel.cl
Executable file
54
benchmarks/new_opencl/convolution/kernel.cl
Executable file
@@ -0,0 +1,54 @@
|
|||||||
|
__kernel
|
||||||
|
void convolution(
|
||||||
|
__read_only image2d_t sourceImage,
|
||||||
|
__write_only image2d_t outputImage,
|
||||||
|
int rows,
|
||||||
|
int cols,
|
||||||
|
__constant float* filter,
|
||||||
|
int filterWidth,
|
||||||
|
sampler_t sampler)
|
||||||
|
{
|
||||||
|
// Store each work-item’s unique row and column
|
||||||
|
int column = get_global_id(0);
|
||||||
|
int row = get_global_id(1);
|
||||||
|
|
||||||
|
// Half the width of the filter is needed for indexing
|
||||||
|
// memory later
|
||||||
|
int halfWidth = (int)(filterWidth/2);
|
||||||
|
|
||||||
|
// All accesses to images return data as four-element vector
|
||||||
|
// (i.e., float4), although only the 'x' component will contain
|
||||||
|
// meaningful data in this code
|
||||||
|
float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||||
|
|
||||||
|
// Iterator for the filter
|
||||||
|
int filterIdx = 0;
|
||||||
|
|
||||||
|
// Each work-item iterates around its local area based on the
|
||||||
|
// size of the filter
|
||||||
|
int2 coords; // Coordinates for accessing the image
|
||||||
|
// Iterate the filter rows
|
||||||
|
for(int i = -halfWidth; i <= halfWidth; i++) {
|
||||||
|
coords.y = row + i;
|
||||||
|
|
||||||
|
// Iterate over the filter columns
|
||||||
|
for(int j = -halfWidth; j <= halfWidth; j++) {
|
||||||
|
coords.x = column + j;
|
||||||
|
|
||||||
|
float4 pixel;
|
||||||
|
// Read a pixel from the image. A single channel image
|
||||||
|
// stores the pixel in the 'x' coordinate of the returned
|
||||||
|
// vector.
|
||||||
|
pixel = read_imagef(sourceImage, sampler, coords);
|
||||||
|
sum.x += pixel.x * filter[filterIdx++];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy the data to the output image if the
|
||||||
|
// work-item is in bounds
|
||||||
|
if(row < rows && column < cols) {
|
||||||
|
coords.x = column;
|
||||||
|
coords.y = row;
|
||||||
|
write_imagef(outputImage, coords, sum);
|
||||||
|
}
|
||||||
|
}
|
||||||
261
benchmarks/new_opencl/convolution/main.cpp
Executable file
261
benchmarks/new_opencl/convolution/main.cpp
Executable file
@@ -0,0 +1,261 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <CL/cl.h>
|
||||||
|
|
||||||
|
#include "utils.h"
|
||||||
|
|
||||||
|
// This function takes a positive integer and rounds it up to
|
||||||
|
// the nearest multiple of another provided integer
|
||||||
|
unsigned int roundUp(unsigned int value, unsigned int multiple) {
|
||||||
|
|
||||||
|
// Determine how far past the nearest multiple the value is
|
||||||
|
unsigned int remainder = value % multiple;
|
||||||
|
|
||||||
|
// Add the difference to make the value a multiple
|
||||||
|
if(remainder != 0) {
|
||||||
|
value += (multiple-remainder);
|
||||||
|
}
|
||||||
|
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function reads in a text file and stores it as a char pointer
|
||||||
|
char* readSource(char* kernelPath) {
|
||||||
|
|
||||||
|
cl_int status;
|
||||||
|
FILE *fp;
|
||||||
|
char *source;
|
||||||
|
long int size;
|
||||||
|
|
||||||
|
printf("Program file is: %s\n", kernelPath);
|
||||||
|
|
||||||
|
fp = fopen(kernelPath, "rb");
|
||||||
|
if(!fp) {
|
||||||
|
printf("Could not open kernel file\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
status = fseek(fp, 0, SEEK_END);
|
||||||
|
if(status != 0) {
|
||||||
|
printf("Error seeking to end of file\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
size = ftell(fp);
|
||||||
|
if(size < 0) {
|
||||||
|
printf("Error getting file position\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
rewind(fp);
|
||||||
|
|
||||||
|
source = (char *)malloc(size + 1);
|
||||||
|
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < size+1; i++) {
|
||||||
|
source[i]='\0';
|
||||||
|
}
|
||||||
|
|
||||||
|
if(source == NULL) {
|
||||||
|
printf("Error allocating space for the kernel source\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
fread(source, 1, size, fp);
|
||||||
|
source[size] = '\0';
|
||||||
|
|
||||||
|
return source;
|
||||||
|
}
|
||||||
|
|
||||||
|
void chk(cl_int status, const char* cmd) {
|
||||||
|
|
||||||
|
if(status != CL_SUCCESS) {
|
||||||
|
printf("%s failed (%d)\n", cmd, status);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
|
||||||
|
int i, j, k, l;
|
||||||
|
|
||||||
|
// Rows and columns in the input image
|
||||||
|
int imageHeight;
|
||||||
|
int imageWidth;
|
||||||
|
|
||||||
|
const char* inputFile = "input.bmp";
|
||||||
|
const char* outputFile = "output.bmp";
|
||||||
|
|
||||||
|
// Homegrown function to read a BMP from file
|
||||||
|
float* inputImage = readImage(inputFile, &imageWidth,
|
||||||
|
&imageHeight);
|
||||||
|
|
||||||
|
// Size of the input and output images on the host
|
||||||
|
int dataSize = imageHeight*imageWidth*sizeof(float);
|
||||||
|
|
||||||
|
// Output image on the host
|
||||||
|
float* outputImage = NULL;
|
||||||
|
outputImage = (float*)malloc(dataSize);
|
||||||
|
float* refImage = NULL;
|
||||||
|
refImage = (float*)malloc(dataSize);
|
||||||
|
|
||||||
|
// 45 degree motion blur
|
||||||
|
float filter[49] =
|
||||||
|
{0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, -1, 0, 1, 0, 0,
|
||||||
|
0, 0, -2, 0, 2, 0, 0,
|
||||||
|
0, 0, -1, 0, 1, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0};
|
||||||
|
|
||||||
|
// The convolution filter is 7x7
|
||||||
|
int filterWidth = 7;
|
||||||
|
int filterSize = filterWidth*filterWidth; // Assume a square kernel
|
||||||
|
|
||||||
|
// Set up the OpenCL environment
|
||||||
|
cl_int status;
|
||||||
|
|
||||||
|
// Discovery platform
|
||||||
|
cl_platform_id platform;
|
||||||
|
status = clGetPlatformIDs(1, &platform, NULL);
|
||||||
|
chk(status, "clGetPlatformIDs");
|
||||||
|
|
||||||
|
// Discover device
|
||||||
|
cl_device_id device;
|
||||||
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
|
||||||
|
chk(status, "clGetDeviceIDs");
|
||||||
|
|
||||||
|
// Create context
|
||||||
|
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
|
||||||
|
(cl_context_properties)(platform), 0};
|
||||||
|
cl_context context;
|
||||||
|
context = clCreateContext(props, 1, &device, NULL, NULL, &status);
|
||||||
|
chk(status, "clCreateContext");
|
||||||
|
|
||||||
|
// Create command queue
|
||||||
|
cl_command_queue queue;
|
||||||
|
queue = clCreateCommandQueue(context, device, 0, &status);
|
||||||
|
chk(status, "clCreateCommandQueue");
|
||||||
|
|
||||||
|
// The image format describes how the data will be stored in memory
|
||||||
|
cl_image_format format;
|
||||||
|
format.image_channel_order = CL_R; // single channel
|
||||||
|
format.image_channel_data_type = CL_FLOAT; // float data type
|
||||||
|
|
||||||
|
// Create space for the source image on the device
|
||||||
|
cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
|
||||||
|
imageHeight, 0, NULL, &status);
|
||||||
|
chk(status, "clCreateImage2D");
|
||||||
|
|
||||||
|
// Create space for the output image on the device
|
||||||
|
cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
|
||||||
|
imageHeight, 0, NULL, &status);
|
||||||
|
chk(status, "clCreateImage2D");
|
||||||
|
|
||||||
|
// Create space for the 7x7 filter on the device
|
||||||
|
cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
|
||||||
|
NULL, &status);
|
||||||
|
chk(status, "clCreateBuffer");
|
||||||
|
|
||||||
|
// Copy the source image to the device
|
||||||
|
size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from
|
||||||
|
size_t region[3] = {imageWidth, imageHeight, 1}; // Elements to per dimension
|
||||||
|
status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
|
||||||
|
0, 0, inputImage, 0, NULL, NULL);
|
||||||
|
chk(status, "clEnqueueWriteImage");
|
||||||
|
|
||||||
|
// Copy the 7x7 filter to the device
|
||||||
|
status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
|
||||||
|
filterSize*sizeof(float), filter, 0, NULL, NULL);
|
||||||
|
chk(status, "clEnqueueWriteBuffer");
|
||||||
|
|
||||||
|
// Create the image sampler
|
||||||
|
cl_sampler sampler = clCreateSampler(context, CL_FALSE,
|
||||||
|
CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
|
||||||
|
chk(status, "clCreateSampler");
|
||||||
|
|
||||||
|
const char* source = readSource("kernel.cl");
|
||||||
|
|
||||||
|
// Create a program object with source and build it
|
||||||
|
cl_program program;
|
||||||
|
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
|
||||||
|
chk(status, "clCreateProgramWithSource");
|
||||||
|
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
|
||||||
|
chk(status, "clBuildProgram");
|
||||||
|
|
||||||
|
// Create the kernel object
|
||||||
|
cl_kernel kernel;
|
||||||
|
kernel = clCreateKernel(program, "convolution", &status);
|
||||||
|
chk(status, "clCreateKernel");
|
||||||
|
|
||||||
|
// Set the kernel arguments
|
||||||
|
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
|
||||||
|
status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
|
||||||
|
status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
|
||||||
|
status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
|
||||||
|
status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
|
||||||
|
status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
|
||||||
|
status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
|
||||||
|
chk(status, "clSetKernelArg");
|
||||||
|
|
||||||
|
// Set the work item dimensions
|
||||||
|
size_t globalSize[2] = {imageWidth, imageHeight};
|
||||||
|
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
|
||||||
|
NULL, NULL);
|
||||||
|
chk(status, "clEnqueueNDRange");
|
||||||
|
|
||||||
|
// Read the image back to the host
|
||||||
|
status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
|
||||||
|
region, 0, 0, outputImage, 0, NULL, NULL);
|
||||||
|
chk(status, "clEnqueueReadImage");
|
||||||
|
|
||||||
|
// Write the output image to file
|
||||||
|
storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);
|
||||||
|
|
||||||
|
// Compute the reference image
|
||||||
|
for(i = 0; i < imageHeight; i++) {
|
||||||
|
for(j = 0; j < imageWidth; j++) {
|
||||||
|
refImage[i*imageWidth+j] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Iterate over the rows of the source image
|
||||||
|
int halfFilterWidth = filterWidth/2;
|
||||||
|
float sum;
|
||||||
|
for(i = 0; i < imageHeight; i++) {
|
||||||
|
// Iterate over the columns of the source image
|
||||||
|
for(j = 0; j < imageWidth; j++) {
|
||||||
|
sum = 0; // Reset sum for new source pixel
|
||||||
|
// Apply the filter to the neighborhood
|
||||||
|
for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
|
||||||
|
for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
|
||||||
|
if(i+k >= 0 && i+k < imageHeight &&
|
||||||
|
j+l >= 0 && j+l < imageWidth) {
|
||||||
|
sum += inputImage[(i+k)*imageWidth + j+l] *
|
||||||
|
filter[(k+halfFilterWidth)*filterWidth +
|
||||||
|
l+halfFilterWidth];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
refImage[i*imageWidth+j] = sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int failed = 0;
|
||||||
|
for(i = 0; i < imageHeight; i++) {
|
||||||
|
for(j = 0; j < imageWidth; j++) {
|
||||||
|
if(abs(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01) {
|
||||||
|
printf("Results are INCORRECT\n");
|
||||||
|
printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
|
||||||
|
outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
|
||||||
|
failed = 1;
|
||||||
|
}
|
||||||
|
if(failed) break;
|
||||||
|
}
|
||||||
|
if(failed) break;
|
||||||
|
}
|
||||||
|
if(!failed) {
|
||||||
|
printf("Results are correct\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
180
benchmarks/new_opencl/convolution/utils.cpp
Normal file
180
benchmarks/new_opencl/convolution/utils.cpp
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include "utils.h"
|
||||||
|
|
||||||
|
void storeImage(float *imageOut,
|
||||||
|
const char *filename,
|
||||||
|
int rows,
|
||||||
|
int cols,
|
||||||
|
const char* refFilename) {
|
||||||
|
|
||||||
|
FILE *ifp, *ofp;
|
||||||
|
unsigned char tmp;
|
||||||
|
int offset;
|
||||||
|
unsigned char *buffer;
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
int bytes;
|
||||||
|
|
||||||
|
int height, width;
|
||||||
|
|
||||||
|
ifp = fopen(refFilename, "rb");
|
||||||
|
if(ifp == NULL) {
|
||||||
|
perror(filename);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
fseek(ifp, 10, SEEK_SET);
|
||||||
|
fread(&offset, 4, 1, ifp);
|
||||||
|
|
||||||
|
fseek(ifp, 18, SEEK_SET);
|
||||||
|
fread(&width, 4, 1, ifp);
|
||||||
|
fread(&height, 4, 1, ifp);
|
||||||
|
|
||||||
|
fseek(ifp, 0, SEEK_SET);
|
||||||
|
|
||||||
|
buffer = (unsigned char *)malloc(offset);
|
||||||
|
if(buffer == NULL) {
|
||||||
|
perror("malloc");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
fread(buffer, 1, offset, ifp);
|
||||||
|
|
||||||
|
printf("Writing output image to %s\n", filename);
|
||||||
|
ofp = fopen(filename, "wb");
|
||||||
|
if(ofp == NULL) {
|
||||||
|
perror("opening output file");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
bytes = fwrite(buffer, 1, offset, ofp);
|
||||||
|
if(bytes != offset) {
|
||||||
|
printf("error writing header!\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE bmp formats store data in reverse raster order (see comment in
|
||||||
|
// readImage function), so we need to flip it upside down here.
|
||||||
|
int mod = width % 4;
|
||||||
|
if(mod != 0) {
|
||||||
|
mod = 4 - mod;
|
||||||
|
}
|
||||||
|
// printf("mod = %d\n", mod);
|
||||||
|
for(i = height-1; i >= 0; i--) {
|
||||||
|
for(j = 0; j < width; j++) {
|
||||||
|
tmp = (unsigned char)imageOut[i*cols+j];
|
||||||
|
fwrite(&tmp, sizeof(char), 1, ofp);
|
||||||
|
}
|
||||||
|
// In bmp format, rows must be a multiple of 4-bytes.
|
||||||
|
// So if we're not at a multiple of 4, add junk padding.
|
||||||
|
for(j = 0; j < mod; j++) {
|
||||||
|
fwrite(&tmp, sizeof(char), 1, ofp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(ofp);
|
||||||
|
fclose(ifp);
|
||||||
|
|
||||||
|
free(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Read bmp image and convert to byte array. Also output the width and height
|
||||||
|
*/
|
||||||
|
float* readImage(const char *filename, int* widthOut, int* heightOut) {
|
||||||
|
|
||||||
|
uchar* imageData;
|
||||||
|
|
||||||
|
int height, width;
|
||||||
|
uchar tmp;
|
||||||
|
int offset;
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
printf("Reading input image from %s\n", filename);
|
||||||
|
FILE *fp = fopen(filename, "rb");
|
||||||
|
if(fp == NULL) {
|
||||||
|
perror(filename);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
fseek(fp, 10, SEEK_SET);
|
||||||
|
fread(&offset, 4, 1, fp);
|
||||||
|
|
||||||
|
fseek(fp, 18, SEEK_SET);
|
||||||
|
fread(&width, 4, 1, fp);
|
||||||
|
fread(&height, 4, 1, fp);
|
||||||
|
|
||||||
|
printf("width = %d\n", width);
|
||||||
|
printf("height = %d\n", height);
|
||||||
|
|
||||||
|
*widthOut = width;
|
||||||
|
*heightOut = height;
|
||||||
|
|
||||||
|
imageData = (uchar*)malloc(width*height);
|
||||||
|
if(imageData == NULL) {
|
||||||
|
perror("malloc");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
fseek(fp, offset, SEEK_SET);
|
||||||
|
fflush(NULL);
|
||||||
|
|
||||||
|
int mod = width % 4;
|
||||||
|
if(mod != 0) {
|
||||||
|
mod = 4 - mod;
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE bitmaps are stored in upside-down raster order. So we begin
|
||||||
|
// reading from the bottom left pixel, then going from left-to-right,
|
||||||
|
// read from the bottom to the top of the image. For image analysis,
|
||||||
|
// we want the image to be right-side up, so we'll modify it here.
|
||||||
|
|
||||||
|
// First we read the image in upside-down
|
||||||
|
|
||||||
|
// Read in the actual image
|
||||||
|
for(i = 0; i < height; i++) {
|
||||||
|
|
||||||
|
// add actual data to the image
|
||||||
|
for(j = 0; j < width; j++) {
|
||||||
|
fread(&tmp, sizeof(char), 1, fp);
|
||||||
|
imageData[i*width + j] = tmp;
|
||||||
|
}
|
||||||
|
// For the bmp format, each row has to be a multiple of 4,
|
||||||
|
// so I need to read in the junk data and throw it away
|
||||||
|
for(j = 0; j < mod; j++) {
|
||||||
|
fread(&tmp, sizeof(char), 1, fp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then we flip it over
|
||||||
|
int flipRow;
|
||||||
|
for(i = 0; i < height/2; i++) {
|
||||||
|
flipRow = height - (i+1);
|
||||||
|
for(j = 0; j < width; j++) {
|
||||||
|
tmp = imageData[i*width+j];
|
||||||
|
imageData[i*width+j] = imageData[flipRow*width+j];
|
||||||
|
imageData[flipRow*width+j] = tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(fp);
|
||||||
|
|
||||||
|
// Input image on the host
|
||||||
|
float* floatImage = NULL;
|
||||||
|
floatImage = (float*)malloc(sizeof(float)*width*height);
|
||||||
|
if(floatImage == NULL) {
|
||||||
|
perror("malloc");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert the BMP image to float (not required)
|
||||||
|
for(i = 0; i < height; i++) {
|
||||||
|
for(j = 0; j < width; j++) {
|
||||||
|
floatImage[i*width+j] = (float)imageData[i*width+j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
free(imageData);
|
||||||
|
return floatImage;
|
||||||
|
}
|
||||||
11
benchmarks/new_opencl/convolution/utils.h
Normal file
11
benchmarks/new_opencl/convolution/utils.h
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
#ifndef __UTILS__
|
||||||
|
#define __UTILS__
|
||||||
|
|
||||||
|
typedef unsigned char uchar;
|
||||||
|
|
||||||
|
float* readImage(const char *filename, int* widthOut, int* heightOut);
|
||||||
|
|
||||||
|
void storeImage(float *imageOut, const char *filename, int rows, int cols,
|
||||||
|
const char* refFilename);
|
||||||
|
|
||||||
|
#endif
|
||||||
1
benchmarks/new_opencl/transpose/.gitignore
vendored
Normal file
1
benchmarks/new_opencl/transpose/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
transpose
|
||||||
44
benchmarks/new_opencl/transpose/Makefile
Normal file
44
benchmarks/new_opencl/transpose/Makefile
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops)
|
||||||
|
POCLCC_PATH ?= $(wildcard ~/dev/pocl/drops_vortex_cc)
|
||||||
|
POCLRT_PATH ?= $(wildcard ..)
|
||||||
|
DRIVER_PATH ?= $(wildcard ../../../driver/sw)
|
||||||
|
|
||||||
|
CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
|
||||||
|
|
||||||
|
CXXFLAGS += -I$(POCLRT_PATH)/include
|
||||||
|
|
||||||
|
LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex
|
||||||
|
|
||||||
|
PROJECT = transpose
|
||||||
|
|
||||||
|
SRCS = main.cc transpose_gold.cpp
|
||||||
|
|
||||||
|
all: $(PROJECT)
|
||||||
|
|
||||||
|
kernel.pocl: kernel.cl
|
||||||
|
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCLCC_PATH)/lib:$(DRIVER_PATH)/simx $(POCLCC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
|
||||||
|
|
||||||
|
$(PROJECT): $(SRCS)
|
||||||
|
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||||
|
|
||||||
|
run-fpga: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||||
|
|
||||||
|
run-ase: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||||
|
|
||||||
|
run-simx: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||||
|
|
||||||
|
run-rtlsim: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||||
|
|
||||||
|
.depend: $(SRCS)
|
||||||
|
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf $(PROJECT) *.o *.dump .depend
|
||||||
|
|
||||||
|
ifneq ($(MAKECMDGOALS),clean)
|
||||||
|
-include .depend
|
||||||
|
endif
|
||||||
387
benchmarks/new_opencl/transpose/main.cc
Normal file
387
benchmarks/new_opencl/transpose/main.cc
Normal file
@@ -0,0 +1,387 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Matrix transpose with Cuda
|
||||||
|
* Host code.
|
||||||
|
|
||||||
|
* This example transposes arbitrary-size matrices. It compares a naive
|
||||||
|
* transpose kernel that suffers from non-coalesced writes, to an optimized
|
||||||
|
* transpose with fully coalesced memory access and no bank conflicts. On
|
||||||
|
* a G80 GPU, the optimized transpose can be more than 10x faster for large
|
||||||
|
* matrices.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// standard utility and system includes
|
||||||
|
#include "oclUtils.h"
|
||||||
|
#include "shrQATest.h"
|
||||||
|
|
||||||
|
#define BLOCK_DIM 16
|
||||||
|
|
||||||
|
// max GPU's to manage for multi-GPU parallel compute
|
||||||
|
const unsigned int MAX_GPU_COUNT = 8;
|
||||||
|
|
||||||
|
// global variables
|
||||||
|
cl_platform_id cpPlatform;
|
||||||
|
cl_uint uiNumDevices;
|
||||||
|
cl_device_id* cdDevices;
|
||||||
|
cl_context cxGPUContext;
|
||||||
|
cl_kernel ckKernel[MAX_GPU_COUNT];
|
||||||
|
cl_command_queue commandQueue[MAX_GPU_COUNT];
|
||||||
|
cl_program rv_program;
|
||||||
|
|
||||||
|
// forward declarations
|
||||||
|
// *********************************************************************
|
||||||
|
int runTest( int argc, const char** argv);
|
||||||
|
extern "C" void computeGold( float* reference, float* idata,
|
||||||
|
const unsigned int size_x, const unsigned int size_y );
|
||||||
|
|
||||||
|
// Main Program
|
||||||
|
// *********************************************************************
|
||||||
|
int main( int argc, const char** argv)
|
||||||
|
{
|
||||||
|
shrQAStart(argc, (char **)argv);
|
||||||
|
|
||||||
|
// set logfile name and start logs
|
||||||
|
shrSetLogFileName ("oclTranspose.txt");
|
||||||
|
shrLog("%s Starting...\n\n", argv[0]);
|
||||||
|
|
||||||
|
// run the main test
|
||||||
|
int result = runTest(argc, argv);
|
||||||
|
//oclCheckError(result, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
|
||||||
|
{
|
||||||
|
cl_mem d_odata[MAX_GPU_COUNT];
|
||||||
|
cl_mem d_idata[MAX_GPU_COUNT];
|
||||||
|
cl_kernel ckKernel[MAX_GPU_COUNT];
|
||||||
|
|
||||||
|
size_t szGlobalWorkSize[2];
|
||||||
|
size_t szLocalWorkSize[2];
|
||||||
|
cl_int ciErrNum;
|
||||||
|
|
||||||
|
// Create buffers for each GPU
|
||||||
|
// Each GPU will compute sizePerGPU rows of the result
|
||||||
|
size_t sizePerGPU = shrRoundUp(BLOCK_DIM, (size_x+ciDeviceCount-1) / ciDeviceCount);
|
||||||
|
|
||||||
|
// size of memory required to store the matrix
|
||||||
|
const size_t mem_size = sizeof(float) * size_x * size_y;
|
||||||
|
|
||||||
|
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
||||||
|
// allocate device memory and copy host to device memory
|
||||||
|
d_idata[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||||
|
mem_size, h_idata, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// create buffer to store output
|
||||||
|
d_odata[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY ,
|
||||||
|
sizePerGPU*size_y*sizeof(float), NULL, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// create the naive transpose kernel
|
||||||
|
ckKernel[i] = clCreateKernel(rv_program, kernelName, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// set the args values for the naive kernel
|
||||||
|
size_t offset = i * sizePerGPU;
|
||||||
|
ciErrNum = clSetKernelArg(ckKernel[i], 0, sizeof(cl_mem), (void *) &d_odata[i]);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[i], 1, sizeof(cl_mem), (void *) &d_idata[0]);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[i], 2, sizeof(int), &offset);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[i], 3, sizeof(int), &size_x);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[i], 4, sizeof(int), &size_y);
|
||||||
|
if(useLocalMem)
|
||||||
|
{
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[i], 5, (BLOCK_DIM + 1) * BLOCK_DIM * sizeof(float), 0 );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// set up execution configuration
|
||||||
|
szLocalWorkSize[0] = BLOCK_DIM;
|
||||||
|
szLocalWorkSize[1] = BLOCK_DIM;
|
||||||
|
szGlobalWorkSize[0] = sizePerGPU;
|
||||||
|
szGlobalWorkSize[1] = shrRoundUp(BLOCK_DIM, size_y);
|
||||||
|
|
||||||
|
// execute the kernel numIterations times
|
||||||
|
int numIterations = 100;
|
||||||
|
shrLog("\nProcessing a %d by %d matrix of floats...\n\n", size_x, size_y);
|
||||||
|
for (int i = -1; i < numIterations; ++i)
|
||||||
|
{
|
||||||
|
// Start time measurement after warmup
|
||||||
|
if( i == 0 ) shrDeltaT(0);
|
||||||
|
|
||||||
|
for(unsigned int k=0; k < ciDeviceCount; ++k){
|
||||||
|
ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL,
|
||||||
|
szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
|
||||||
|
}
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Block CPU till GPU is done
|
||||||
|
for(unsigned int k=0; k < ciDeviceCount; ++k){
|
||||||
|
ciErrNum |= clFinish(commandQueue[k]);
|
||||||
|
}
|
||||||
|
double time = shrDeltaT(0)/(double)numIterations;
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// Copy back to host
|
||||||
|
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
||||||
|
size_t offset = i * sizePerGPU;
|
||||||
|
size_t size = MIN(size_x - i * sizePerGPU, sizePerGPU);
|
||||||
|
|
||||||
|
ciErrNum |= clEnqueueReadBuffer(commandQueue[i], d_odata[i], CL_TRUE, 0,
|
||||||
|
size * size_y * sizeof(float), &h_odata[offset * size_y],
|
||||||
|
0, NULL, NULL);
|
||||||
|
}
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
||||||
|
ciErrNum |= clReleaseMemObject(d_idata[i]);
|
||||||
|
ciErrNum |= clReleaseMemObject(d_odata[i]);
|
||||||
|
ciErrNum |= clReleaseKernel(ckKernel[i]);
|
||||||
|
}
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
return time;
|
||||||
|
}
|
||||||
|
uint8_t *kernel_bin = NULL;
|
||||||
|
|
||||||
|
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||||
|
if (nullptr == filename || nullptr == data || 0 == size)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
FILE* fp = fopen(filename, "r");
|
||||||
|
if (NULL == fp) {
|
||||||
|
fprintf(stderr, "Failed to load kernel.");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
fseek(fp , 0 , SEEK_END);
|
||||||
|
long fsize = ftell(fp);
|
||||||
|
rewind(fp);
|
||||||
|
|
||||||
|
*data = (uint8_t*)malloc(fsize);
|
||||||
|
*size = fread(*data, 1, fsize, fp);
|
||||||
|
|
||||||
|
fclose(fp);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
//! Run a simple test for CUDA
|
||||||
|
// *********************************************************************
|
||||||
|
int runTest( const int argc, const char** argv)
|
||||||
|
{
|
||||||
|
cl_int ciErrNum;
|
||||||
|
cl_uint ciDeviceCount;
|
||||||
|
unsigned int size_x = 2048;
|
||||||
|
unsigned int size_y = 2048;
|
||||||
|
|
||||||
|
int temp;
|
||||||
|
if( shrGetCmdLineArgumenti( argc, argv,"width", &temp) ){
|
||||||
|
size_x = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( shrGetCmdLineArgumenti( argc, argv,"height", &temp) ){
|
||||||
|
size_y = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// size of memory required to store the matrix
|
||||||
|
const size_t mem_size = sizeof(float) * size_x * size_y;
|
||||||
|
|
||||||
|
//Get the NVIDIA platform
|
||||||
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
//Get the devices
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
//Create the context
|
||||||
|
cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
|
||||||
|
{
|
||||||
|
ciDeviceCount = 0;
|
||||||
|
// User specified GPUs
|
||||||
|
char* deviceList;
|
||||||
|
char* deviceStr;
|
||||||
|
|
||||||
|
shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);
|
||||||
|
|
||||||
|
#ifdef WIN32
|
||||||
|
char* next_token;
|
||||||
|
deviceStr = strtok_s (deviceList," ,.-", &next_token);
|
||||||
|
#else
|
||||||
|
deviceStr = strtok (deviceList," ,.-");
|
||||||
|
#endif
|
||||||
|
ciDeviceCount = 0;
|
||||||
|
while(deviceStr != NULL)
|
||||||
|
{
|
||||||
|
// get and print the device for this queue
|
||||||
|
cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
|
||||||
|
if( device == (cl_device_id)-1 ) {
|
||||||
|
shrLog(" Invalid Device: %s\n\n", deviceStr);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
shrLog("Device %d: ", atoi(deviceStr));
|
||||||
|
oclPrintDevName(LOGBOTH, device);
|
||||||
|
shrLog("\n");
|
||||||
|
|
||||||
|
// create command queue
|
||||||
|
commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
|
||||||
|
return ciErrNum;
|
||||||
|
}
|
||||||
|
|
||||||
|
++ciDeviceCount;
|
||||||
|
|
||||||
|
#ifdef WIN32
|
||||||
|
deviceStr = strtok_s (NULL," ,.-", &next_token);
|
||||||
|
#else
|
||||||
|
deviceStr = strtok (NULL," ,.-");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
free(deviceList);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Find out how many GPU's to compute on all available GPUs
|
||||||
|
size_t nDeviceBytes;
|
||||||
|
ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
|
||||||
|
ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);
|
||||||
|
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
|
||||||
|
return ciErrNum;
|
||||||
|
}
|
||||||
|
else if (ciDeviceCount == 0)
|
||||||
|
{
|
||||||
|
shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// create command-queues
|
||||||
|
for(unsigned int i = 0; i < ciDeviceCount; ++i)
|
||||||
|
{
|
||||||
|
// get and print the device for this queue
|
||||||
|
cl_device_id device = oclGetDev(cxGPUContext, i);
|
||||||
|
shrLog("Device %d: ", i);
|
||||||
|
oclPrintDevName(LOGBOTH, device);
|
||||||
|
shrLog("\n");
|
||||||
|
|
||||||
|
// create command queue
|
||||||
|
commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
|
||||||
|
return ciErrNum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate and initalize host memory
|
||||||
|
float* h_idata = (float*)malloc(mem_size);
|
||||||
|
float* h_odata = (float*) malloc(mem_size);
|
||||||
|
srand(15235911);
|
||||||
|
shrFillArray(h_idata, (size_x * size_y));
|
||||||
|
|
||||||
|
// Program Setup
|
||||||
|
size_t program_length;
|
||||||
|
char* source_path = shrFindFilePath("transpose.cl", argv[0]);
|
||||||
|
//oclCheckError(source_path != NULL, shrTRUE);
|
||||||
|
char *source = oclLoadProgSource(source_path, "", &program_length);
|
||||||
|
//oclCheckError(source != NULL, shrTRUE);
|
||||||
|
size_t kernel_size;
|
||||||
|
cl_int binary_status = 0;
|
||||||
|
cl_device_id device_id;
|
||||||
|
// create the program
|
||||||
|
rv_program = clCreateProgramWithBinary(cxGPUContext, 1, &device_id, &kernel_size, &kernel_bin, &binary_status, NULL);
|
||||||
|
//rv_program = clCreateProgramWithSource(cxGPUContext, 1,
|
||||||
|
// (const char **)&source, &program_length, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// build the program
|
||||||
|
ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
// write out standard error, Build Log and PTX, then return error
|
||||||
|
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||||
|
oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
|
||||||
|
oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclTranspose.ptx");
|
||||||
|
return(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run Naive Kernel
|
||||||
|
#ifdef GPU_PROFILING
|
||||||
|
// Matrix Copy kernel runs to measure reference performance.
|
||||||
|
double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||||
|
double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||||
|
double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
double naiveTime = transposeGPU("transpose_naive", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||||
|
double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||||
|
|
||||||
|
#ifdef GPU_PROFILING
|
||||||
|
// log times
|
||||||
|
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-simple copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||||
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/simpleCopyTime), simpleCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||||
|
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-shared memory copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||||
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/sharedCopyTime), sharedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||||
|
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-uncoalesced copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||||
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/uncoalescedCopyTime), uncoalescedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||||
|
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-naive, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||||
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/naiveTime), naiveTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||||
|
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-optimized, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||||
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/optimizedTime), optimizedTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// compute reference solution and cross check results
|
||||||
|
float* reference = (float*)malloc( mem_size);
|
||||||
|
computeGold( reference, h_idata, size_x, size_y);
|
||||||
|
shrLog("\nComparing results with CPU computation... \n\n");
|
||||||
|
shrBOOL res = shrComparef( reference, h_odata, size_x * size_y);
|
||||||
|
|
||||||
|
// cleanup memory
|
||||||
|
free(h_idata);
|
||||||
|
free(h_odata);
|
||||||
|
free(reference);
|
||||||
|
free(source);
|
||||||
|
free(source_path);
|
||||||
|
|
||||||
|
// cleanup OpenCL
|
||||||
|
ciErrNum = clReleaseProgram(rv_program);
|
||||||
|
for(unsigned int i = 0; i < ciDeviceCount; ++i)
|
||||||
|
{
|
||||||
|
ciErrNum |= clReleaseCommandQueue(commandQueue[i]);
|
||||||
|
}
|
||||||
|
ciErrNum |= clReleaseContext(cxGPUContext);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// pass or fail (cumulative... all tests in the loop)
|
||||||
|
shrQAFinishExit(argc, (const char **)argv, (1 == res) ? QA_PASSED : QA_FAILED);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
198
benchmarks/new_opencl/transpose/oclUtils.h
Normal file
198
benchmarks/new_opencl/transpose/oclUtils.h
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef OCL_UTILS_H
|
||||||
|
#define OCL_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// Common headers: Cross-API utililties and OpenCL header
|
||||||
|
#include "shrUtils.h"
|
||||||
|
|
||||||
|
// All OpenCL headers
|
||||||
|
#if defined (__APPLE__) || defined(MACOSX)
|
||||||
|
#include <OpenCL/opencl.h>
|
||||||
|
#else
|
||||||
|
#include <CL/opencl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Includes
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||||
|
// extensions from <CL/cl_ext.h>
|
||||||
|
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||||
|
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||||
|
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||||
|
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||||
|
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||||
|
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||||
|
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// reminders for build output window and log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including shrUtils.h")
|
||||||
|
#pragma message ("Note: including opencl.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// SDK Revision #
|
||||||
|
#define OCL_SDKREVISION "7027912"
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param clSelectedPlatformID OpenCL platform ID
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print info about the device
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and return device capability
|
||||||
|
//!
|
||||||
|
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" int oclGetDevCap(cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print the device name
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the first device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the nth device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id or -1 when out of range
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//! @param device_idx index of the device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of device with maximal FLOPS from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Loads a Program file and prepends the cPreamble to the code.
|
||||||
|
//!
|
||||||
|
//! @return the source string if succeeded, 0 otherwise
|
||||||
|
//! @param cFilename program filename
|
||||||
|
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||||
|
//! @param szFinalLength returned length of the code string
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the binary (PTX) of the program associated with the device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param binary returned code
|
||||||
|
//! @param length length of returned code
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param const char* cPtxFileName optional PTX file name
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||||
|
|
||||||
|
// Helper function for De-allocating cl objects
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL error string from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclErrorString(cl_int error);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||||
|
{
|
||||||
|
// An error condition is defined by the sample/test value not equal to the reference
|
||||||
|
if (iReference != iSample)
|
||||||
|
{
|
||||||
|
// If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
|
||||||
|
iSample = (iSample == 0) ? -9999 : iSample;
|
||||||
|
|
||||||
|
// Log the error info
|
||||||
|
shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
|
||||||
|
|
||||||
|
// Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code.
|
||||||
|
if (pCleanup != NULL)
|
||||||
|
{
|
||||||
|
pCleanup(iSample);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||||
|
exit(iSample);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
238
benchmarks/new_opencl/transpose/shrQATest.h
Normal file
238
benchmarks/new_opencl/transpose/shrQATest.h
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_QATEST_H
|
||||||
|
#define SHR_QATEST_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#pragma message ("Note: including time.h")
|
||||||
|
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#include <time.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <time.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRCASECMP _stricmp
|
||||||
|
#else
|
||||||
|
#define STRCASECMP strcasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRNCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRNCASECMP _strnicmp
|
||||||
|
#else
|
||||||
|
#define STRNCASECMP strncasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// Standardized QA Start/Finish for CUDA SDK tests
|
||||||
|
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||||
|
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||||
|
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||||
|
|
||||||
|
inline int findExeNameStart(const char *exec_name)
|
||||||
|
{
|
||||||
|
int exename_start = (int)strlen(exec_name);
|
||||||
|
|
||||||
|
while( (exename_start > 0) &&
|
||||||
|
(exec_name[exename_start] != '\\') &&
|
||||||
|
(exec_name[exename_start] != '/') )
|
||||||
|
{
|
||||||
|
exename_start--;
|
||||||
|
}
|
||||||
|
if (exec_name[exename_start] == '\\' ||
|
||||||
|
exec_name[exename_start] == '/')
|
||||||
|
{
|
||||||
|
return exename_start+1;
|
||||||
|
} else {
|
||||||
|
return exename_start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int __shrQAStart(int argc, char **argv)
|
||||||
|
{
|
||||||
|
bool bQATest = false;
|
||||||
|
// First clear the output buffer
|
||||||
|
fflush(stdout);
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
char *string_argv = &argv[i][string_start];
|
||||||
|
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We don't want to print the entire path, so we search for the first
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
return exename_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
enum eQAstatus {
|
||||||
|
QA_FAILED = 0,
|
||||||
|
QA_PASSED = 1,
|
||||||
|
QA_WAIVED = 2
|
||||||
|
};
|
||||||
|
|
||||||
|
inline void __ExitInTime(int seconds)
|
||||||
|
{
|
||||||
|
fprintf(stdout, "> exiting in %d seconds: ", seconds);
|
||||||
|
fflush(stdout);
|
||||||
|
time_t t;
|
||||||
|
int count;
|
||||||
|
for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
|
||||||
|
fprintf(stdout, "%d...", count);
|
||||||
|
#ifdef WIN32
|
||||||
|
Sleep(1000);
|
||||||
|
#else
|
||||||
|
sleep(1);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
fprintf(stdout,"done!\n\n");
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||||
|
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bNoPrompt = true;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bNoPrompt = false;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
} else {
|
||||||
|
if (!bNoPrompt) {
|
||||||
|
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||||
|
fflush(stdout);
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
bool bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish(argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
642
benchmarks/new_opencl/transpose/shrUtils.h
Normal file
642
benchmarks/new_opencl/transpose/shrUtils.h
Normal file
@@ -0,0 +1,642 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_UTILS_H
|
||||||
|
#define SHR_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// reminders for output window and build log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Other headers needed for both Windows and Linux
|
||||||
|
#include <math.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// Un-comment the following #define to enable profiling code in SDK apps
|
||||||
|
//#define GPU_PROFILING
|
||||||
|
|
||||||
|
// Beginning of GPU Architecture definitions
|
||||||
|
inline int ConvertSMVer2Cores(int major, int minor)
|
||||||
|
{
|
||||||
|
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
|
||||||
|
typedef struct {
|
||||||
|
int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
|
||||||
|
int Cores;
|
||||||
|
} sSMtoCores;
|
||||||
|
|
||||||
|
sSMtoCores nGpuArchCoresPerSM[] =
|
||||||
|
{ { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class
|
||||||
|
{ 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class
|
||||||
|
{ 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class
|
||||||
|
{ 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class
|
||||||
|
{ 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
|
||||||
|
{ 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
|
||||||
|
{ 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class
|
||||||
|
{ -1, -1 }
|
||||||
|
};
|
||||||
|
|
||||||
|
int index = 0;
|
||||||
|
while (nGpuArchCoresPerSM[index].SM != -1) {
|
||||||
|
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
|
||||||
|
return nGpuArchCoresPerSM[index].Cores;
|
||||||
|
}
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
// end of GPU Architecture definitions
|
||||||
|
|
||||||
|
|
||||||
|
// Defines and enum for use with logging functions
|
||||||
|
// *********************************************************************
|
||||||
|
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||||
|
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||||
|
enum LOGMODES
|
||||||
|
{
|
||||||
|
LOGCONSOLE = 1, // bit to signal "log to console"
|
||||||
|
LOGFILE = 2, // bit to signal "log to file"
|
||||||
|
LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
|
||||||
|
APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
|
||||||
|
MASTER = 8, // bit to signal master .csv log output
|
||||||
|
ERRORMSG = 16, // bit to signal "pre-pend Error"
|
||||||
|
CLOSELOG = 32 // bit to close log file, if open, after any requested file write
|
||||||
|
};
|
||||||
|
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||||
|
|
||||||
|
// Standardized boolean
|
||||||
|
enum shrBOOL
|
||||||
|
{
|
||||||
|
shrFALSE = 0,
|
||||||
|
shrTRUE = 1
|
||||||
|
};
|
||||||
|
|
||||||
|
// Standardized MAX, MIN and CLAMP
|
||||||
|
#define MAX(a, b) ((a > b) ? a : b)
|
||||||
|
#define MIN(a, b) ((a < b) ? a : b)
|
||||||
|
#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip of input a
|
||||||
|
#define TOPCLAMP(a, b) (a < b ? a:b) // single top side clip of input a
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... extended version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... short version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||||
|
|
||||||
|
// Simple argument checker macro
|
||||||
|
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
|
||||||
|
|
||||||
|
// Define for user-customized error handling
|
||||||
|
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||||
|
|
||||||
|
// Function to deallocate memory allocated within shrUtils
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFree(void* ptr);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Helper function to log standardized information to Console, to File or to both
|
||||||
|
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||||
|
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||||
|
//!
|
||||||
|
//! Automatically opens file and stores handle if needed and not done yet
|
||||||
|
//! Closes file and nulls handle on request
|
||||||
|
//!
|
||||||
|
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||||
|
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||||
|
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||||
|
//! @param 2 dValue:
|
||||||
|
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||||
|
//! Negative val is an error code and this give error preformatting.
|
||||||
|
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||||
|
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||||
|
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||||
|
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||||
|
//! @param 4... variable args: like printf or fprintf. Must match format specifer type above.
|
||||||
|
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLog(const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||||
|
// Maintains state for 3 independent counters
|
||||||
|
//! Example: double dElapsedTime = shrDeltaTime(0);
|
||||||
|
//!
|
||||||
|
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||||
|
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" double shrDeltaT(int iCounterID);
|
||||||
|
|
||||||
|
// Optional LogFileNameOverride function
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||||
|
|
||||||
|
// Helper function to init data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
// Helper function to print data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Find the path for a filename
|
||||||
|
//! @return the path if succeeded, otherwise 0
|
||||||
|
//! @param filename name of the file
|
||||||
|
//! @param executablePath optional absolute path of the executable
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing single precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing double precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing single precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||||
|
const float epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing double precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||||
|
const double epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PPM image file (with unsigned char as data element type), padding
|
||||||
|
//! 4th component
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param OutData handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//!
|
||||||
|
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||||
|
unsigned int *w, unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||||
|
//! 4 bytes)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||||
|
unsigned int *w,unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Command line arguments: General notes
|
||||||
|
// * All command line arguments begin with '--' followed by the token;
|
||||||
|
// token and value are seperated by '='; example --samples=50
|
||||||
|
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||||
|
// (without whitespaces)
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Check if command line argument \a flag-name is given
|
||||||
|
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||||
|
//! otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param flag_name name of command line flag
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||||
|
const char* flag_name);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||||
|
const char* arg_name, int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type unsigned int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||||
|
const char* arg_name, unsigned int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type float
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||||
|
const char* arg_name, float* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type string
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument list those element are strings
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val command line argument list
|
||||||
|
//! @param len length of the list / number of elements
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val,
|
||||||
|
unsigned int* len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays
|
||||||
|
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||||
|
const unsigned int len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integer arrays
|
||||||
|
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||||
|
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned char arrays
|
||||||
|
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integers with a tolernance for # of byte errors
|
||||||
|
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integer arrays witha n epsilon tolerance for equality
|
||||||
|
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||||
|
//! threshold for # pixel errors
|
||||||
|
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||||
|
//! equality
|
||||||
|
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param data filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! $param verboseErrors output details of image mismatch to std::err
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param data filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! $param verboseErrors output details of image mismatch to std::err
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||||
|
|
||||||
|
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||||
|
{
|
||||||
|
if (iReference != iSample)
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||||
|
if (pCleanup != NULL)
|
||||||
|
{
|
||||||
|
pCleanup(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Standardized Exit
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||||
|
{
|
||||||
|
#ifdef WIN32
|
||||||
|
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#else
|
||||||
|
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||||
|
}
|
||||||
|
fflush(stderr);
|
||||||
|
exit(iExitCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
108
benchmarks/new_opencl/transpose/transpose.cl
Normal file
108
benchmarks/new_opencl/transpose/transpose.cl
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Matrix transpose with OpenCL
|
||||||
|
* Device code.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define BLOCK_DIM 16
|
||||||
|
|
||||||
|
// This kernel is optimized to ensure all global reads and writes are coalesced,
|
||||||
|
// and to avoid bank conflicts in shared memory. This kernel is up to 11x faster
|
||||||
|
// than the naive kernel below. Note that the shared memory array is sized to
|
||||||
|
// (BLOCK_DIM+1)*BLOCK_DIM. This pads each row of the 2D block in shared memory
|
||||||
|
// so that bank conflicts do not occur when threads address the array column-wise.
|
||||||
|
__kernel void transpose(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
|
||||||
|
{
|
||||||
|
// read the matrix tile into shared memory
|
||||||
|
unsigned int xIndex = get_global_id(0);
|
||||||
|
unsigned int yIndex = get_global_id(1);
|
||||||
|
|
||||||
|
if((xIndex + offset < width) && (yIndex < height))
|
||||||
|
{
|
||||||
|
unsigned int index_in = yIndex * width + xIndex + offset;
|
||||||
|
block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
|
||||||
|
}
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
// write the transposed matrix tile to global memory
|
||||||
|
xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0);
|
||||||
|
yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1);
|
||||||
|
if((xIndex < height) && (yIndex + offset < width))
|
||||||
|
{
|
||||||
|
unsigned int index_out = yIndex * height + xIndex;
|
||||||
|
odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// This naive transpose kernel suffers from completely non-coalesced writes.
|
||||||
|
// It can be up to 10x slower than the kernel above for large matrices.
|
||||||
|
__kernel void transpose_naive(__global float *odata, __global float* idata, int offset, int width, int height)
|
||||||
|
{
|
||||||
|
unsigned int xIndex = get_global_id(0);
|
||||||
|
unsigned int yIndex = get_global_id(1);
|
||||||
|
|
||||||
|
if (xIndex + offset < width && yIndex < height)
|
||||||
|
{
|
||||||
|
unsigned int index_in = xIndex + offset + width * yIndex;
|
||||||
|
unsigned int index_out = yIndex + height * xIndex;
|
||||||
|
odata[index_out] = idata[index_in];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
__kernel void simple_copy(__global float *odata, __global float* idata, int offset, int width, int height)
|
||||||
|
{
|
||||||
|
unsigned int xIndex = get_global_id(0);
|
||||||
|
unsigned int yIndex = get_global_id(1);
|
||||||
|
|
||||||
|
if (xIndex + offset < width && yIndex < height)
|
||||||
|
{
|
||||||
|
unsigned int index_in = xIndex + offset + width * yIndex;
|
||||||
|
odata[index_in] = idata[index_in];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel void shared_copy(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
|
||||||
|
{
|
||||||
|
// read the matrix tile into shared memory
|
||||||
|
unsigned int xIndex = get_global_id(0);
|
||||||
|
unsigned int yIndex = get_global_id(1);
|
||||||
|
|
||||||
|
unsigned int index_in = yIndex * width + xIndex + offset;
|
||||||
|
if((xIndex + offset< width) && (yIndex < height))
|
||||||
|
{
|
||||||
|
block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
|
||||||
|
}
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if((xIndex < height) && (yIndex+ offset < width))
|
||||||
|
{
|
||||||
|
odata[index_in] = block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
__kernel void uncoalesced_copy(__global float *odata, __global float* idata, int offset, int width, int height)
|
||||||
|
{
|
||||||
|
unsigned int xIndex = get_global_id(0);
|
||||||
|
unsigned int yIndex = get_global_id(1);
|
||||||
|
|
||||||
|
if (xIndex + offset < width && yIndex < height)
|
||||||
|
{
|
||||||
|
unsigned int index_in = yIndex + height * (xIndex+ offset);
|
||||||
|
odata[index_in] = idata[index_in];
|
||||||
|
}
|
||||||
|
}
|
||||||
38
benchmarks/new_opencl/transpose/transpose_gold.cpp
Normal file
38
benchmarks/new_opencl/transpose/transpose_gold.cpp
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Small Matrix transpose with Cuda (Example for a 16x16 matrix)
|
||||||
|
* Reference solution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// export C interface
|
||||||
|
extern "C"
|
||||||
|
void computeGold( float* reference, float* idata,
|
||||||
|
const unsigned int size_x, const unsigned int size_y );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compute reference data set
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void
|
||||||
|
computeGold( float* reference, float* idata,
|
||||||
|
const unsigned int size_x, const unsigned int size_y )
|
||||||
|
{
|
||||||
|
// transpose matrix
|
||||||
|
for( unsigned int y = 0; y < size_y; ++y)
|
||||||
|
{
|
||||||
|
for( unsigned int x = 0; x < size_x; ++x)
|
||||||
|
{
|
||||||
|
reference[(x * size_y) + y] = idata[(y * size_x) + x];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Reference in New Issue
Block a user