Vector mask changes
This commit is contained in:
814
benchmarks/opencl/bfs/CLHelper.h
Executable file
814
benchmarks/opencl/bfs/CLHelper.h
Executable file
@@ -0,0 +1,814 @@
|
|||||||
|
//------------------------------------------
|
||||||
|
//--cambine:helper function for OpenCL
|
||||||
|
//--programmer: Jianbin Fang
|
||||||
|
//--date: 27/12/2010
|
||||||
|
//------------------------------------------
|
||||||
|
#ifndef _CL_HELPER_
|
||||||
|
#define _CL_HELPER_
|
||||||
|
|
||||||
|
#include <CL/cl.h>
|
||||||
|
#include <vector>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
using std::ifstream;
|
||||||
|
using std::cerr;
|
||||||
|
using std::endl;
|
||||||
|
using std::cout;
|
||||||
|
//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable
|
||||||
|
#define WORK_DIM 2 //work-items dimensions
|
||||||
|
|
||||||
|
struct oclHandleStruct
|
||||||
|
{
|
||||||
|
cl_context context;
|
||||||
|
cl_device_id *devices;
|
||||||
|
cl_command_queue queue;
|
||||||
|
cl_program program;
|
||||||
|
cl_int cl_status;
|
||||||
|
std::string error_str;
|
||||||
|
std::vector<cl_kernel> kernel;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct oclHandleStruct oclHandles;
|
||||||
|
|
||||||
|
char kernel_file[100] = "Kernels.cl";
|
||||||
|
int total_kernels = 2;
|
||||||
|
string kernel_names[2] = {"BFS_1", "BFS_2"};
|
||||||
|
int work_group_size = 512;
|
||||||
|
int device_id_inused = 0; //deviced id used (default : 0)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Converts the contents of a file into a string
|
||||||
|
*/
|
||||||
|
string FileToString(const string fileName)
|
||||||
|
{
|
||||||
|
ifstream f(fileName.c_str(), ifstream::in | ifstream::binary);
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
size_t size;
|
||||||
|
char* str;
|
||||||
|
string s;
|
||||||
|
|
||||||
|
if(f.is_open())
|
||||||
|
{
|
||||||
|
size_t fileSize;
|
||||||
|
f.seekg(0, ifstream::end);
|
||||||
|
size = fileSize = f.tellg();
|
||||||
|
f.seekg(0, ifstream::beg);
|
||||||
|
|
||||||
|
str = new char[size+1];
|
||||||
|
if (!str) throw(string("Could not allocate memory"));
|
||||||
|
|
||||||
|
f.read(str, fileSize);
|
||||||
|
f.close();
|
||||||
|
str[size] = '\0';
|
||||||
|
|
||||||
|
s = str;
|
||||||
|
delete [] str;
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch(std::string msg)
|
||||||
|
{
|
||||||
|
cerr << "Exception caught in FileToString(): " << msg << endl;
|
||||||
|
if(f.is_open())
|
||||||
|
f.close();
|
||||||
|
}
|
||||||
|
catch(...)
|
||||||
|
{
|
||||||
|
cerr << "Exception caught in FileToString()" << endl;
|
||||||
|
if(f.is_open())
|
||||||
|
f.close();
|
||||||
|
}
|
||||||
|
string errorMsg = "FileToString()::Error: Unable to open file "
|
||||||
|
+ fileName;
|
||||||
|
throw(errorMsg);
|
||||||
|
}
|
||||||
|
//---------------------------------------
|
||||||
|
//Read command line parameters
|
||||||
|
//
|
||||||
|
void _clCmdParams(int argc, char* argv[]){
|
||||||
|
for (int i =0; i < argc; ++i)
|
||||||
|
{
|
||||||
|
switch (argv[i][1])
|
||||||
|
{
|
||||||
|
case 'g': //--g stands for size of work group
|
||||||
|
if (++i < argc)
|
||||||
|
{
|
||||||
|
sscanf(argv[i], "%u", &work_group_size);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::cerr << "Could not read argument after option " << argv[i-1] << std::endl;
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 'd': //--d stands for device id used in computaion
|
||||||
|
if (++i < argc)
|
||||||
|
{
|
||||||
|
sscanf(argv[i], "%u", &device_id_inused);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::cerr << "Could not read argument after option " << argv[i-1] << std::endl;
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------------
|
||||||
|
//Initlize CL objects
|
||||||
|
//--description: there are 5 steps to initialize all the OpenCL objects needed
|
||||||
|
//--revised on 04/01/2011: get the number of devices and
|
||||||
|
// devices have no relationship with context
|
||||||
|
void _clInit()
|
||||||
|
{
|
||||||
|
int DEVICE_ID_INUSED = device_id_inused;
|
||||||
|
cl_int resultCL;
|
||||||
|
|
||||||
|
oclHandles.context = NULL;
|
||||||
|
oclHandles.devices = NULL;
|
||||||
|
oclHandles.queue = NULL;
|
||||||
|
oclHandles.program = NULL;
|
||||||
|
|
||||||
|
cl_uint deviceListSize;
|
||||||
|
|
||||||
|
//-----------------------------------------------
|
||||||
|
//--cambine-1: find the available platforms and select one
|
||||||
|
|
||||||
|
cl_uint numPlatforms;
|
||||||
|
cl_platform_id targetPlatform = NULL;
|
||||||
|
|
||||||
|
resultCL = clGetPlatformIDs(0, NULL, &numPlatforms);
|
||||||
|
if (resultCL != CL_SUCCESS)
|
||||||
|
throw (string("InitCL()::Error: Getting number of platforms (clGetPlatformIDs)"));
|
||||||
|
//printf("number of platforms:%d\n",numPlatforms); //by cambine
|
||||||
|
|
||||||
|
if (!(numPlatforms > 0))
|
||||||
|
throw (string("InitCL()::Error: No platforms found (clGetPlatformIDs)"));
|
||||||
|
|
||||||
|
cl_platform_id* allPlatforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id));
|
||||||
|
|
||||||
|
resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL);
|
||||||
|
if (resultCL != CL_SUCCESS)
|
||||||
|
throw (string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)"));
|
||||||
|
|
||||||
|
/* Select the target platform. Default: first platform */
|
||||||
|
targetPlatform = allPlatforms[0];
|
||||||
|
for (int i = 0; i < numPlatforms; i++)
|
||||||
|
{
|
||||||
|
char pbuff[128];
|
||||||
|
resultCL = clGetPlatformInfo( allPlatforms[i],
|
||||||
|
CL_PLATFORM_VENDOR,
|
||||||
|
sizeof(pbuff),
|
||||||
|
pbuff,
|
||||||
|
NULL);
|
||||||
|
if (resultCL != CL_SUCCESS)
|
||||||
|
throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)"));
|
||||||
|
|
||||||
|
//printf("vedor is %s\n",pbuff);
|
||||||
|
|
||||||
|
}
|
||||||
|
free(allPlatforms);
|
||||||
|
|
||||||
|
//-----------------------------------------------
|
||||||
|
//--cambine-2: create an OpenCL context
|
||||||
|
cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)targetPlatform, 0 };
|
||||||
|
oclHandles.context = clCreateContextFromType(cprops,
|
||||||
|
CL_DEVICE_TYPE_GPU,
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
&resultCL);
|
||||||
|
|
||||||
|
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
|
||||||
|
throw (string("InitCL()::Error: Creating Context (clCreateContextFromType)"));
|
||||||
|
//-----------------------------------------------
|
||||||
|
//--cambine-3: detect OpenCL devices
|
||||||
|
/* First, get the size of device list */
|
||||||
|
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceListSize);
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS){
|
||||||
|
throw(string("exception in _clInit -> clGetDeviceIDs"));
|
||||||
|
}
|
||||||
|
if (deviceListSize == 0)
|
||||||
|
throw(string("InitCL()::Error: No devices found."));
|
||||||
|
|
||||||
|
//std::cout<<"device number:"<<deviceListSize<<std::endl;
|
||||||
|
|
||||||
|
/* Now, allocate the device list */
|
||||||
|
oclHandles.devices = (cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id));
|
||||||
|
|
||||||
|
if (oclHandles.devices == 0)
|
||||||
|
throw(string("InitCL()::Error: Could not allocate memory."));
|
||||||
|
|
||||||
|
/* Next, get the device list data */
|
||||||
|
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, deviceListSize, \
|
||||||
|
oclHandles.devices, NULL);
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS){
|
||||||
|
throw(string("exception in _clInit -> clGetDeviceIDs-2"));
|
||||||
|
}
|
||||||
|
//-----------------------------------------------
|
||||||
|
//--cambine-4: Create an OpenCL command queue
|
||||||
|
oclHandles.queue = clCreateCommandQueue(oclHandles.context,
|
||||||
|
oclHandles.devices[DEVICE_ID_INUSED],
|
||||||
|
0,
|
||||||
|
&resultCL);
|
||||||
|
|
||||||
|
if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
|
||||||
|
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
|
||||||
|
//-----------------------------------------------
|
||||||
|
//--cambine-5: Load CL file, build CL program object, create CL kernel object
|
||||||
|
std::string source_str = FileToString(kernel_file);
|
||||||
|
const char * source = source_str.c_str();
|
||||||
|
size_t sourceSize[] = { source_str.length() };
|
||||||
|
|
||||||
|
oclHandles.program = clCreateProgramWithSource(oclHandles.context,
|
||||||
|
1,
|
||||||
|
&source,
|
||||||
|
sourceSize,
|
||||||
|
&resultCL);
|
||||||
|
|
||||||
|
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
|
||||||
|
throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)"));
|
||||||
|
//insert debug information
|
||||||
|
//std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines
|
||||||
|
//options += " -cl-nv-opt-level=3";
|
||||||
|
resultCL = clBuildProgram(oclHandles.program, deviceListSize, oclHandles.devices, NULL, NULL,NULL);
|
||||||
|
|
||||||
|
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
|
||||||
|
{
|
||||||
|
cerr << "InitCL()::Error: In clBuildProgram" << endl;
|
||||||
|
|
||||||
|
size_t length;
|
||||||
|
resultCL = clGetProgramBuildInfo(oclHandles.program,
|
||||||
|
oclHandles.devices[DEVICE_ID_INUSED],
|
||||||
|
CL_PROGRAM_BUILD_LOG,
|
||||||
|
0,
|
||||||
|
NULL,
|
||||||
|
&length);
|
||||||
|
if(resultCL != CL_SUCCESS)
|
||||||
|
throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
|
||||||
|
|
||||||
|
char* buffer = (char*)malloc(length);
|
||||||
|
resultCL = clGetProgramBuildInfo(oclHandles.program,
|
||||||
|
oclHandles.devices[DEVICE_ID_INUSED],
|
||||||
|
CL_PROGRAM_BUILD_LOG,
|
||||||
|
length,
|
||||||
|
buffer,
|
||||||
|
NULL);
|
||||||
|
if(resultCL != CL_SUCCESS)
|
||||||
|
throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
|
||||||
|
|
||||||
|
cerr << buffer << endl;
|
||||||
|
free(buffer);
|
||||||
|
|
||||||
|
throw(string("InitCL()::Error: Building Program (clBuildProgram)"));
|
||||||
|
}
|
||||||
|
|
||||||
|
//get program information in intermediate representation
|
||||||
|
#ifdef PTX_MSG
|
||||||
|
size_t binary_sizes[deviceListSize];
|
||||||
|
char * binaries[deviceListSize];
|
||||||
|
//figure out number of devices and the sizes of the binary for each device.
|
||||||
|
oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*deviceListSize, &binary_sizes, NULL );
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS){
|
||||||
|
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2"));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout<<"--cambine:"<<binary_sizes<<std::endl;
|
||||||
|
//copy over all of the generated binaries.
|
||||||
|
for(int i=0;i<deviceListSize;i++)
|
||||||
|
binaries[i] = (char *)malloc( sizeof(char)*(binary_sizes[i]+1));
|
||||||
|
oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARIES, sizeof(char *)*deviceListSize, binaries, NULL );
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS){
|
||||||
|
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-3"));
|
||||||
|
}
|
||||||
|
for(int i=0;i<deviceListSize;i++)
|
||||||
|
binaries[i][binary_sizes[i]] = '\0';
|
||||||
|
std::cout<<"--cambine:writing ptd information..."<<std::endl;
|
||||||
|
FILE * ptx_file = fopen("cl.ptx","w");
|
||||||
|
if(ptx_file==NULL){
|
||||||
|
throw(string("exceptions in allocate ptx file."));
|
||||||
|
}
|
||||||
|
fprintf(ptx_file,"%s",binaries[DEVICE_ID_INUSED]);
|
||||||
|
fclose(ptx_file);
|
||||||
|
std::cout<<"--cambine:writing ptd information done."<<std::endl;
|
||||||
|
for(int i=0;i<deviceListSize;i++)
|
||||||
|
free(binaries[i]);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (int nKernel = 0; nKernel < total_kernels; nKernel++)
|
||||||
|
{
|
||||||
|
/* get a kernel object handle for a kernel with the given name */
|
||||||
|
cl_kernel kernel = clCreateKernel(oclHandles.program,
|
||||||
|
(kernel_names[nKernel]).c_str(),
|
||||||
|
&resultCL);
|
||||||
|
|
||||||
|
if ((resultCL != CL_SUCCESS) || (kernel == NULL))
|
||||||
|
{
|
||||||
|
string errorMsg = "InitCL()::Error: Creating Kernel (clCreateKernel) \"" + kernel_names[nKernel] + "\"";
|
||||||
|
throw(errorMsg);
|
||||||
|
}
|
||||||
|
|
||||||
|
oclHandles.kernel.push_back(kernel);
|
||||||
|
}
|
||||||
|
//get resource alocation information
|
||||||
|
#ifdef RES_MSG
|
||||||
|
char * build_log;
|
||||||
|
size_t ret_val_size;
|
||||||
|
oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS){
|
||||||
|
throw(string("exceptions in _InitCL -> getting resource information"));
|
||||||
|
}
|
||||||
|
|
||||||
|
build_log = (char *)malloc(ret_val_size+1);
|
||||||
|
oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS){
|
||||||
|
throw(string("exceptions in _InitCL -> getting resources allocation information-2"));
|
||||||
|
}
|
||||||
|
build_log[ret_val_size] = '\0';
|
||||||
|
std::cout<<"--cambine:"<<build_log<<std::endl;
|
||||||
|
free(build_log);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------------
|
||||||
|
//release CL objects
|
||||||
|
void _clRelease()
|
||||||
|
{
|
||||||
|
char errorFlag = false;
|
||||||
|
|
||||||
|
for (int nKernel = 0; nKernel < oclHandles.kernel.size(); nKernel++)
|
||||||
|
{
|
||||||
|
if (oclHandles.kernel[nKernel] != NULL)
|
||||||
|
{
|
||||||
|
cl_int resultCL = clReleaseKernel(oclHandles.kernel[nKernel]);
|
||||||
|
if (resultCL != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
cerr << "ReleaseCL()::Error: In clReleaseKernel" << endl;
|
||||||
|
errorFlag = true;
|
||||||
|
}
|
||||||
|
oclHandles.kernel[nKernel] = NULL;
|
||||||
|
}
|
||||||
|
oclHandles.kernel.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (oclHandles.program != NULL)
|
||||||
|
{
|
||||||
|
cl_int resultCL = clReleaseProgram(oclHandles.program);
|
||||||
|
if (resultCL != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
cerr << "ReleaseCL()::Error: In clReleaseProgram" << endl;
|
||||||
|
errorFlag = true;
|
||||||
|
}
|
||||||
|
oclHandles.program = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (oclHandles.queue != NULL)
|
||||||
|
{
|
||||||
|
cl_int resultCL = clReleaseCommandQueue(oclHandles.queue);
|
||||||
|
if (resultCL != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
cerr << "ReleaseCL()::Error: In clReleaseCommandQueue" << endl;
|
||||||
|
errorFlag = true;
|
||||||
|
}
|
||||||
|
oclHandles.queue = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
free(oclHandles.devices);
|
||||||
|
|
||||||
|
if (oclHandles.context != NULL)
|
||||||
|
{
|
||||||
|
cl_int resultCL = clReleaseContext(oclHandles.context);
|
||||||
|
if (resultCL != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
cerr << "ReleaseCL()::Error: In clReleaseContext" << endl;
|
||||||
|
errorFlag = true;
|
||||||
|
}
|
||||||
|
oclHandles.context = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (errorFlag) throw(string("ReleaseCL()::Error encountered."));
|
||||||
|
}
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//--cambine:create buffer and then copy data from host to device
|
||||||
|
cl_mem _clCreateAndCpyMem(int size, void * h_mem_source) throw(string){
|
||||||
|
cl_mem d_mem;
|
||||||
|
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, \
|
||||||
|
size, h_mem_source, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateAndCpyMem()"));
|
||||||
|
#endif
|
||||||
|
return d_mem;
|
||||||
|
}
|
||||||
|
//-------------------------------------------------------
|
||||||
|
//--cambine: create read only buffer for devices
|
||||||
|
//--date: 17/01/2011
|
||||||
|
cl_mem _clMallocRW(int size, void * h_mem_ptr) throw(string){
|
||||||
|
cl_mem d_mem;
|
||||||
|
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, h_mem_ptr, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clMallocRW"));
|
||||||
|
#endif
|
||||||
|
return d_mem;
|
||||||
|
}
|
||||||
|
//-------------------------------------------------------
|
||||||
|
//--cambine: create read and write buffer for devices
|
||||||
|
//--date: 17/01/2011
|
||||||
|
cl_mem _clMalloc(int size, void * h_mem_ptr) throw(string){
|
||||||
|
cl_mem d_mem;
|
||||||
|
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, size, h_mem_ptr, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clMalloc"));
|
||||||
|
#endif
|
||||||
|
return d_mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------
|
||||||
|
//--cambine: transfer data from host to device
|
||||||
|
//--date: 17/01/2011
|
||||||
|
void _clMemcpyH2D(cl_mem d_mem, int size, const void *h_mem_ptr) throw(string){
|
||||||
|
oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_ptr, 0, NULL, NULL);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clMemcpyH2D"));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//--cambine:create buffer and then copy data from host to device with pinned
|
||||||
|
// memory
|
||||||
|
cl_mem _clCreateAndCpyPinnedMem(int size, float* h_mem_source) throw(string){
|
||||||
|
cl_mem d_mem, d_mem_pinned;
|
||||||
|
float * h_mem_pinned = NULL;
|
||||||
|
d_mem_pinned = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, \
|
||||||
|
size, NULL, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateAndCpyMem()->d_mem_pinned"));
|
||||||
|
#endif
|
||||||
|
//------------
|
||||||
|
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, \
|
||||||
|
size, NULL, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateAndCpyMem() -> d_mem "));
|
||||||
|
#endif
|
||||||
|
//----------
|
||||||
|
h_mem_pinned = (cl_float *)clEnqueueMapBuffer(oclHandles.queue, d_mem_pinned, CL_TRUE, \
|
||||||
|
CL_MAP_WRITE, 0, size, 0, NULL, \
|
||||||
|
NULL, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer"));
|
||||||
|
#endif
|
||||||
|
int element_number = size/sizeof(float);
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(int i=0;i<element_number;i++){
|
||||||
|
h_mem_pinned[i] = h_mem_source[i];
|
||||||
|
}
|
||||||
|
//----------
|
||||||
|
oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem, \
|
||||||
|
CL_TRUE, 0, size, h_mem_pinned, \
|
||||||
|
0, NULL, NULL);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueWriteBuffer"));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return d_mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//--cambine:create write only buffer on device
|
||||||
|
cl_mem _clMallocWO(int size) throw(string){
|
||||||
|
cl_mem d_mem;
|
||||||
|
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateMem()"));
|
||||||
|
#endif
|
||||||
|
return d_mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//transfer data from device to host
|
||||||
|
void _clMemcpyD2H(cl_mem d_mem, int size, void * h_mem) throw(string){
|
||||||
|
oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem, 0,0,0);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clCpyMemD2H -> ";
|
||||||
|
switch(oclHandles.cl_status){
|
||||||
|
case CL_INVALID_COMMAND_QUEUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_CONTEXT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_CONTEXT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_MEM_OBJECT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_VALUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_VALUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_EVENT_WAIT_LIST:
|
||||||
|
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
|
||||||
|
break;
|
||||||
|
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||||
|
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unknown reason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//set kernel arguments
|
||||||
|
void _clSetArgs(int kernel_id, int arg_idx, void * d_mem, int size = 0) throw(string){
|
||||||
|
if(!size){
|
||||||
|
oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, sizeof(d_mem), &d_mem);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
|
||||||
|
switch(oclHandles.cl_status){
|
||||||
|
case CL_INVALID_KERNEL:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_INDEX:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_VALUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_MEM_OBJECT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_SAMPLER:
|
||||||
|
oclHandles.error_str += "CL_INVALID_SAMPLER";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unknown reason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
|
||||||
|
switch(oclHandles.cl_status){
|
||||||
|
case CL_INVALID_KERNEL:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_INDEX:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_VALUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_MEM_OBJECT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_SAMPLER:
|
||||||
|
oclHandles.error_str += "CL_INVALID_SAMPLER";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unknown reason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void _clFinish() throw(string){
|
||||||
|
oclHandles.cl_status = clFinish(oclHandles.queue);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clFinish";
|
||||||
|
switch(oclHandles.cl_status){
|
||||||
|
case CL_INVALID_COMMAND_QUEUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unknown reasons";
|
||||||
|
break;
|
||||||
|
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS){
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//--cambine:enqueue kernel
|
||||||
|
void _clInvokeKernel(int kernel_id, int work_items, int work_group_size) throw(string){
|
||||||
|
cl_uint work_dim = WORK_DIM;
|
||||||
|
cl_event e[1];
|
||||||
|
if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size
|
||||||
|
work_items = work_items + (work_group_size-(work_items%work_group_size));
|
||||||
|
size_t local_work_size[] = {work_group_size, 1};
|
||||||
|
size_t global_work_size[] = {work_items, 1};
|
||||||
|
oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
|
||||||
|
global_work_size, local_work_size, 0 , 0, &(e[0]) );
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
|
||||||
|
switch(oclHandles.cl_status)
|
||||||
|
{
|
||||||
|
case CL_INVALID_PROGRAM_EXECUTABLE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_COMMAND_QUEUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_KERNEL:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_CONTEXT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_CONTEXT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_KERNEL_ARGS:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_DIMENSION:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_GLOBAL_WORK_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_GROUP_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_ITEM_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_GLOBAL_OFFSET:
|
||||||
|
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||||
|
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_EVENT_WAIT_LIST:
|
||||||
|
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unkown reseason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
//_clFinish();
|
||||||
|
// oclHandles.cl_status = clWaitForEvents(1, &e[0]);
|
||||||
|
// #ifdef ERRMSG
|
||||||
|
// if (oclHandles.cl_status!= CL_SUCCESS)
|
||||||
|
// throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
|
||||||
|
// #endif
|
||||||
|
}
|
||||||
|
void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x, int group_y) throw(string){
|
||||||
|
cl_uint work_dim = WORK_DIM;
|
||||||
|
size_t local_work_size[] = {group_x, group_y};
|
||||||
|
size_t global_work_size[] = {range_x, range_y};
|
||||||
|
cl_event e[1];
|
||||||
|
/*if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size
|
||||||
|
work_items = work_items + (work_group_size-(work_items%work_group_size));*/
|
||||||
|
oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
|
||||||
|
global_work_size, local_work_size, 0 , 0, &(e[0]) );
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
|
||||||
|
switch(oclHandles.cl_status)
|
||||||
|
{
|
||||||
|
case CL_INVALID_PROGRAM_EXECUTABLE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_COMMAND_QUEUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_KERNEL:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_CONTEXT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_CONTEXT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_KERNEL_ARGS:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_DIMENSION:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_GLOBAL_WORK_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_GROUP_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_ITEM_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_GLOBAL_OFFSET:
|
||||||
|
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||||
|
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_EVENT_WAIT_LIST:
|
||||||
|
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unkown reseason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
//_clFinish();
|
||||||
|
/*oclHandles.cl_status = clWaitForEvents(1, &e[0]);
|
||||||
|
|
||||||
|
#ifdef ERRMSG
|
||||||
|
|
||||||
|
if (oclHandles.cl_status!= CL_SUCCESS)
|
||||||
|
|
||||||
|
throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
|
||||||
|
|
||||||
|
#endif*/
|
||||||
|
}
|
||||||
|
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//release OpenCL objects
|
||||||
|
void _clFree(cl_mem ob) throw(string){
|
||||||
|
if(ob!=NULL)
|
||||||
|
oclHandles.cl_status = clReleaseMemObject(ob);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clFree() ->";
|
||||||
|
switch(oclHandles.cl_status)
|
||||||
|
{
|
||||||
|
case CL_INVALID_MEM_OBJECT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unkown reseason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (oclHandles.cl_status!= CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#endif //_CL_HELPER_
|
||||||
@@ -29,12 +29,20 @@ CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sectio
|
|||||||
|
|
||||||
LIBS = -lOpenCL
|
LIBS = -lOpenCL
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
PROJECT=saxpy
|
PROJECT=saxpy
|
||||||
|
=======
|
||||||
|
PROJECT=bfs
|
||||||
|
>>>>>>> f3700051a4da6cd017e5ce41f2732f3fc3e86e2d
|
||||||
|
|
||||||
all: $(PROJECT).dump $(PROJECT).hex
|
all: $(PROJECT).dump $(PROJECT).hex
|
||||||
|
|
||||||
lib$(PROJECT).a: kernel.cl
|
lib$(PROJECT).a: kernel.cl
|
||||||
|
<<<<<<< HEAD
|
||||||
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
|
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
|
||||||
|
=======
|
||||||
|
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
|
||||||
|
>>>>>>> f3700051a4da6cd017e5ce41f2732f3fc3e86e2d
|
||||||
|
|
||||||
$(PROJECT).elf: main.cc lib$(PROJECT).a
|
$(PROJECT).elf: main.cc lib$(PROJECT).a
|
||||||
$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf
|
$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf
|
||||||
|
|||||||
53
benchmarks/opencl/bfs/kernel.cl
Executable file
53
benchmarks/opencl/bfs/kernel.cl
Executable file
@@ -0,0 +1,53 @@
|
|||||||
|
/* ============================================================
|
||||||
|
//--cambine: kernel funtion of Breadth-First-Search
|
||||||
|
//--author: created by Jianbin Fang
|
||||||
|
//--date: 06/12/2010
|
||||||
|
============================================================ */
|
||||||
|
|
||||||
|
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
|
||||||
|
|
||||||
|
//Structure to hold a node information
|
||||||
|
typedef struct{
|
||||||
|
int starting;
|
||||||
|
int no_of_edges;
|
||||||
|
} Node;
|
||||||
|
|
||||||
|
//--7 parameters
|
||||||
|
__kernel void BFS_1( const __global Node* g_graph_nodes,
|
||||||
|
const __global int* g_graph_edges,
|
||||||
|
__global char* g_graph_mask,
|
||||||
|
__global char* g_updating_graph_mask,
|
||||||
|
__global char* g_graph_visited,
|
||||||
|
__global int* g_cost,
|
||||||
|
const int no_of_nodes){
|
||||||
|
int tid = get_global_id(0);
|
||||||
|
if( tid<no_of_nodes && g_graph_mask[tid]){
|
||||||
|
g_graph_mask[tid]=false;
|
||||||
|
for(int i=g_graph_nodes[tid].starting; i<(g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting); i++){
|
||||||
|
int id = g_graph_edges[i];
|
||||||
|
if(!g_graph_visited[id]){
|
||||||
|
g_cost[id]=g_cost[tid]+1;
|
||||||
|
g_updating_graph_mask[id]=true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//--5 parameters
|
||||||
|
__kernel void BFS_2(__global char* g_graph_mask,
|
||||||
|
__global char* g_updating_graph_mask,
|
||||||
|
__global char* g_graph_visited,
|
||||||
|
__global char* g_over,
|
||||||
|
const int no_of_nodes
|
||||||
|
) {
|
||||||
|
int tid = get_global_id(0);
|
||||||
|
if( tid<no_of_nodes && g_updating_graph_mask[tid]){
|
||||||
|
|
||||||
|
g_graph_mask[tid]=true;
|
||||||
|
g_graph_visited[tid]=true;
|
||||||
|
*g_over=true;
|
||||||
|
g_updating_graph_mask[tid]=false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
BIN
benchmarks/opencl/bfs/libbfs.a
Normal file
BIN
benchmarks/opencl/bfs/libbfs.a
Normal file
Binary file not shown.
299
benchmarks/opencl/bfs/main.cc
Executable file
299
benchmarks/opencl/bfs/main.cc
Executable file
@@ -0,0 +1,299 @@
|
|||||||
|
//--by Jianbin Fang
|
||||||
|
|
||||||
|
#define __CL_ENABLE_EXCEPTIONS
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#ifdef PROFILING
|
||||||
|
#include "timer.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "CLHelper.h"
|
||||||
|
#include "util.h"
|
||||||
|
|
||||||
|
#define MAX_THREADS_PER_BLOCK 256
|
||||||
|
|
||||||
|
//Structure to hold a node information
|
||||||
|
struct Node
|
||||||
|
{
|
||||||
|
int starting;
|
||||||
|
int no_of_edges;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
//----------------------------------------------------------
|
||||||
|
//--bfs on cpu
|
||||||
|
//--programmer: jianbin
|
||||||
|
//--date: 26/01/2011
|
||||||
|
//--note: width is changed to the new_width
|
||||||
|
//----------------------------------------------------------
|
||||||
|
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
|
||||||
|
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
|
||||||
|
char *h_graph_visited, int *h_cost_ref){
|
||||||
|
char stop;
|
||||||
|
int k = 0;
|
||||||
|
do{
|
||||||
|
//if no thread changes this value then the loop stops
|
||||||
|
stop=false;
|
||||||
|
for(int tid = 0; tid < no_of_nodes; tid++ )
|
||||||
|
{
|
||||||
|
if (h_graph_mask[tid] == true){
|
||||||
|
h_graph_mask[tid]=false;
|
||||||
|
for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++){
|
||||||
|
int id = h_graph_edges[i]; //--cambine: node id is connected with node tid
|
||||||
|
if(!h_graph_visited[id]){ //--cambine: if node id has not been visited, enter the body below
|
||||||
|
h_cost_ref[id]=h_cost_ref[tid]+1;
|
||||||
|
h_updating_graph_mask[id]=true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int tid=0; tid< no_of_nodes ; tid++ )
|
||||||
|
{
|
||||||
|
if (h_updating_graph_mask[tid] == true){
|
||||||
|
h_graph_mask[tid]=true;
|
||||||
|
h_graph_visited[tid]=true;
|
||||||
|
stop=true;
|
||||||
|
h_updating_graph_mask[tid]=false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
k++;
|
||||||
|
}
|
||||||
|
while(stop);
|
||||||
|
}
|
||||||
|
//----------------------------------------------------------
|
||||||
|
//--breadth first search on GPUs
|
||||||
|
//----------------------------------------------------------
|
||||||
|
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
|
||||||
|
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
|
||||||
|
char *h_graph_visited, int *h_cost)
|
||||||
|
throw(std::string){
|
||||||
|
|
||||||
|
//int number_elements = height*width;
|
||||||
|
char h_over;
|
||||||
|
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \
|
||||||
|
d_graph_visited, d_cost, d_over;
|
||||||
|
try{
|
||||||
|
//--1 transfer data from host to device
|
||||||
|
_clInit();
|
||||||
|
d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
|
||||||
|
d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
|
||||||
|
d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask);
|
||||||
|
d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask);
|
||||||
|
d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited);
|
||||||
|
|
||||||
|
|
||||||
|
d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
|
||||||
|
d_over = _clMallocRW(sizeof(char), &h_over);
|
||||||
|
|
||||||
|
_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
|
||||||
|
_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);
|
||||||
|
_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask);
|
||||||
|
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask);
|
||||||
|
_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited);
|
||||||
|
_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);
|
||||||
|
|
||||||
|
//--2 invoke kernel
|
||||||
|
#ifdef PROFILING
|
||||||
|
timer kernel_timer;
|
||||||
|
double kernel_time = 0.0;
|
||||||
|
kernel_timer.reset();
|
||||||
|
kernel_timer.start();
|
||||||
|
#endif
|
||||||
|
do{
|
||||||
|
h_over = false;
|
||||||
|
_clMemcpyH2D(d_over, sizeof(char), &h_over);
|
||||||
|
//--kernel 0
|
||||||
|
int kernel_id = 0;
|
||||||
|
int kernel_idx = 0;
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_cost);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
|
||||||
|
|
||||||
|
//int work_items = no_of_nodes;
|
||||||
|
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
|
||||||
|
|
||||||
|
//--kernel 1
|
||||||
|
kernel_id = 1;
|
||||||
|
kernel_idx = 0;
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_over);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
|
||||||
|
|
||||||
|
//work_items = no_of_nodes;
|
||||||
|
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
|
||||||
|
|
||||||
|
_clMemcpyD2H(d_over,sizeof(char), &h_over);
|
||||||
|
}while(h_over);
|
||||||
|
|
||||||
|
_clFinish();
|
||||||
|
#ifdef PROFILING
|
||||||
|
kernel_timer.stop();
|
||||||
|
kernel_time = kernel_timer.getTimeInSeconds();
|
||||||
|
#endif
|
||||||
|
//--3 transfer data from device to host
|
||||||
|
_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
|
||||||
|
//--statistics
|
||||||
|
#ifdef PROFILING
|
||||||
|
std::cout<<"kernel time(s):"<<kernel_time<<std::endl;
|
||||||
|
#endif
|
||||||
|
//--4 release cl resources.
|
||||||
|
_clFree(d_graph_nodes);
|
||||||
|
_clFree(d_graph_edges);
|
||||||
|
_clFree(d_graph_mask);
|
||||||
|
_clFree(d_updating_graph_mask);
|
||||||
|
_clFree(d_graph_visited);
|
||||||
|
_clFree(d_cost);
|
||||||
|
_clFree(d_over);
|
||||||
|
_clRelease();
|
||||||
|
}
|
||||||
|
catch(std::string msg){
|
||||||
|
_clFree(d_graph_nodes);
|
||||||
|
_clFree(d_graph_edges);
|
||||||
|
_clFree(d_graph_mask);
|
||||||
|
_clFree(d_updating_graph_mask);
|
||||||
|
_clFree(d_graph_visited);
|
||||||
|
_clFree(d_cost);
|
||||||
|
_clFree(d_over);
|
||||||
|
_clRelease();
|
||||||
|
std::string e_str = "in run_transpose_gpu -> ";
|
||||||
|
e_str += msg;
|
||||||
|
throw(e_str);
|
||||||
|
}
|
||||||
|
return ;
|
||||||
|
}
|
||||||
|
void Usage(int argc, char**argv){
|
||||||
|
|
||||||
|
fprintf(stderr,"Usage: %s <input_file>\n", argv[0]);
|
||||||
|
|
||||||
|
}
|
||||||
|
//----------------------------------------------------------
|
||||||
|
//--cambine: main function
|
||||||
|
//--author: created by Jianbin Fang
|
||||||
|
//--date: 25/01/2011
|
||||||
|
//----------------------------------------------------------
|
||||||
|
int main(int argc, char * argv[])
|
||||||
|
{
|
||||||
|
int no_of_nodes;
|
||||||
|
int edge_list_size;
|
||||||
|
FILE *fp;
|
||||||
|
Node* h_graph_nodes;
|
||||||
|
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
|
||||||
|
try{
|
||||||
|
char *input_f;
|
||||||
|
if(argc!=2){
|
||||||
|
Usage(argc, argv);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
input_f = argv[1];
|
||||||
|
printf("Reading File\n");
|
||||||
|
//Read in Graph from a file
|
||||||
|
fp = fopen(input_f,"r");
|
||||||
|
if(!fp){
|
||||||
|
printf("Error Reading graph file\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int source = 0;
|
||||||
|
|
||||||
|
fscanf(fp,"%d",&no_of_nodes);
|
||||||
|
|
||||||
|
int num_of_blocks = 1;
|
||||||
|
int num_of_threads_per_block = no_of_nodes;
|
||||||
|
|
||||||
|
//Make execution Parameters according to the number of nodes
|
||||||
|
//Distribute threads across multiple Blocks if necessary
|
||||||
|
if(no_of_nodes>MAX_THREADS_PER_BLOCK){
|
||||||
|
num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK);
|
||||||
|
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
|
||||||
|
}
|
||||||
|
work_group_size = num_of_threads_per_block;
|
||||||
|
// allocate host memory
|
||||||
|
h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
|
||||||
|
h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
|
||||||
|
h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
|
||||||
|
h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes);
|
||||||
|
|
||||||
|
int start, edgeno;
|
||||||
|
// initalize the memory
|
||||||
|
for(int i = 0; i < no_of_nodes; i++){
|
||||||
|
fscanf(fp,"%d %d",&start,&edgeno);
|
||||||
|
h_graph_nodes[i].starting = start;
|
||||||
|
h_graph_nodes[i].no_of_edges = edgeno;
|
||||||
|
h_graph_mask[i]=false;
|
||||||
|
h_updating_graph_mask[i]=false;
|
||||||
|
h_graph_visited[i]=false;
|
||||||
|
}
|
||||||
|
//read the source node from the file
|
||||||
|
fscanf(fp,"%d",&source);
|
||||||
|
source=0;
|
||||||
|
//set the source node as true in the mask
|
||||||
|
h_graph_mask[source]=true;
|
||||||
|
h_graph_visited[source]=true;
|
||||||
|
fscanf(fp,"%d",&edge_list_size);
|
||||||
|
int id,cost;
|
||||||
|
int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);
|
||||||
|
for(int i=0; i < edge_list_size ; i++){
|
||||||
|
fscanf(fp,"%d",&id);
|
||||||
|
fscanf(fp,"%d",&cost);
|
||||||
|
h_graph_edges[i] = id;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(fp)
|
||||||
|
fclose(fp);
|
||||||
|
// allocate mem for the result on host side
|
||||||
|
int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes);
|
||||||
|
int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes);
|
||||||
|
for(int i=0;i<no_of_nodes;i++){
|
||||||
|
h_cost[i]=-1;
|
||||||
|
h_cost_ref[i] = -1;
|
||||||
|
}
|
||||||
|
h_cost[source]=0;
|
||||||
|
h_cost_ref[source]=0;
|
||||||
|
//---------------------------------------------------------
|
||||||
|
//--gpu entry
|
||||||
|
run_bfs_gpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
|
||||||
|
//---------------------------------------------------------
|
||||||
|
//--cpu entry
|
||||||
|
// initalize the memory again
|
||||||
|
for(int i = 0; i < no_of_nodes; i++){
|
||||||
|
h_graph_mask[i]=false;
|
||||||
|
h_updating_graph_mask[i]=false;
|
||||||
|
h_graph_visited[i]=false;
|
||||||
|
}
|
||||||
|
//set the source node as true in the mask
|
||||||
|
source=0;
|
||||||
|
h_graph_mask[source]=true;
|
||||||
|
h_graph_visited[source]=true;
|
||||||
|
run_bfs_cpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost_ref);
|
||||||
|
//---------------------------------------------------------
|
||||||
|
//--result varification
|
||||||
|
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
|
||||||
|
//release host memory
|
||||||
|
free(h_graph_nodes);
|
||||||
|
free(h_graph_mask);
|
||||||
|
free(h_updating_graph_mask);
|
||||||
|
free(h_graph_visited);
|
||||||
|
|
||||||
|
}
|
||||||
|
catch(std::string msg){
|
||||||
|
std::cout<<"--cambine: exception in main ->"<<msg<<std::endl;
|
||||||
|
//release host memory
|
||||||
|
free(h_graph_nodes);
|
||||||
|
free(h_graph_mask);
|
||||||
|
free(h_updating_graph_mask);
|
||||||
|
free(h_graph_visited);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
1
benchmarks/opencl/bfs/run
Executable file
1
benchmarks/opencl/bfs/run
Executable file
@@ -0,0 +1 @@
|
|||||||
|
./bfs ../../data/bfs/graph1MW_6.txt
|
||||||
78
benchmarks/opencl/bfs/timer.cc
Executable file
78
benchmarks/opencl/bfs/timer.cc
Executable file
@@ -0,0 +1,78 @@
|
|||||||
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iomanip>
|
||||||
|
|
||||||
|
#include "timer.h"
|
||||||
|
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz();
|
||||||
|
|
||||||
|
|
||||||
|
double timer::get_CPU_speed_in_MHz()
|
||||||
|
{
|
||||||
|
#if defined __linux__
|
||||||
|
ifstream infile("/proc/cpuinfo");
|
||||||
|
char buffer[256], *colon;
|
||||||
|
|
||||||
|
while (infile.good()) {
|
||||||
|
infile.getline(buffer, 256);
|
||||||
|
|
||||||
|
if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0)
|
||||||
|
return atof(colon + 2);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void timer::print_time(ostream &str, const char *which, double time) const
|
||||||
|
{
|
||||||
|
static const char *units[] = { " ns", " us", " ms", " s", " ks", 0 };
|
||||||
|
const char **unit = units;
|
||||||
|
|
||||||
|
time = 1000.0 * time / CPU_speed_in_MHz;
|
||||||
|
|
||||||
|
while (time >= 999.5 && unit[1] != 0) {
|
||||||
|
time /= 1000.0;
|
||||||
|
++ unit;
|
||||||
|
}
|
||||||
|
|
||||||
|
str << which << " = " << setprecision(3) << setw(4) << time << *unit;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
ostream &timer::print(ostream &str)
|
||||||
|
{
|
||||||
|
str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right;
|
||||||
|
|
||||||
|
if (CPU_speed_in_MHz == 0)
|
||||||
|
str << "could not determine CPU speed\n";
|
||||||
|
else if (count > 0) {
|
||||||
|
double total = static_cast<double>(total_time);
|
||||||
|
|
||||||
|
print_time(str, "avg", total / static_cast<double>(count));
|
||||||
|
print_time(str, ", total", total);
|
||||||
|
str << ", count = " << setw(9) << count << '\n';
|
||||||
|
}
|
||||||
|
else
|
||||||
|
str << "not used\n";
|
||||||
|
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
ostream &operator << (ostream &str, class timer &timer)
|
||||||
|
{
|
||||||
|
return timer.print(str);
|
||||||
|
}
|
||||||
|
|
||||||
|
double timer::getTimeInSeconds()
|
||||||
|
{
|
||||||
|
double total = static_cast<double>(total_time);
|
||||||
|
double res = (total / 1000000.0) / CPU_speed_in_MHz;
|
||||||
|
return res;
|
||||||
|
}
|
||||||
128
benchmarks/opencl/bfs/timer.h
Executable file
128
benchmarks/opencl/bfs/timer.h
Executable file
@@ -0,0 +1,128 @@
|
|||||||
|
#ifndef timer_h
|
||||||
|
#define timer_h
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
|
||||||
|
class timer {
|
||||||
|
public:
|
||||||
|
timer(const char *name = 0);
|
||||||
|
timer(const char *name, std::ostream &write_on_exit);
|
||||||
|
|
||||||
|
~timer();
|
||||||
|
|
||||||
|
void start(), stop();
|
||||||
|
void reset();
|
||||||
|
std::ostream &print(std::ostream &);
|
||||||
|
|
||||||
|
double getTimeInSeconds();
|
||||||
|
|
||||||
|
private:
|
||||||
|
void print_time(std::ostream &, const char *which, double time) const;
|
||||||
|
|
||||||
|
union {
|
||||||
|
long long total_time;
|
||||||
|
struct {
|
||||||
|
#if defined __PPC__
|
||||||
|
int high, low;
|
||||||
|
#else
|
||||||
|
int low, high;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
unsigned long long count;
|
||||||
|
const char *const name;
|
||||||
|
std::ostream *const write_on_exit;
|
||||||
|
|
||||||
|
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
std::ostream &operator << (std::ostream &, class timer &);
|
||||||
|
|
||||||
|
|
||||||
|
inline void timer::reset()
|
||||||
|
{
|
||||||
|
total_time = 0;
|
||||||
|
count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline timer::timer(const char *name)
|
||||||
|
:
|
||||||
|
name(name),
|
||||||
|
write_on_exit(0)
|
||||||
|
{
|
||||||
|
reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline timer::timer(const char *name, std::ostream &write_on_exit)
|
||||||
|
:
|
||||||
|
name(name),
|
||||||
|
write_on_exit(&write_on_exit)
|
||||||
|
{
|
||||||
|
reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline timer::~timer()
|
||||||
|
{
|
||||||
|
if (write_on_exit != 0)
|
||||||
|
print(*write_on_exit);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void timer::start()
|
||||||
|
{
|
||||||
|
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
|
||||||
|
unsigned eax, edx;
|
||||||
|
|
||||||
|
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
|
||||||
|
|
||||||
|
total_time -= ((unsigned long long) edx << 32) + eax;
|
||||||
|
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
|
||||||
|
asm volatile
|
||||||
|
(
|
||||||
|
"rdtsc\n\t"
|
||||||
|
"subl %%eax, %0\n\t"
|
||||||
|
"sbbl %%edx, %1"
|
||||||
|
:
|
||||||
|
"+m" (low), "+m" (high)
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"eax", "edx"
|
||||||
|
);
|
||||||
|
#else
|
||||||
|
#error Compiler/Architecture not recognized
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void timer::stop()
|
||||||
|
{
|
||||||
|
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
|
||||||
|
unsigned eax, edx;
|
||||||
|
|
||||||
|
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
|
||||||
|
|
||||||
|
total_time += ((unsigned long long) edx << 32) + eax;
|
||||||
|
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
|
||||||
|
asm volatile
|
||||||
|
(
|
||||||
|
"rdtsc\n\t"
|
||||||
|
"addl %%eax, %0\n\t"
|
||||||
|
"adcl %%edx, %1"
|
||||||
|
:
|
||||||
|
"+m" (low), "+m" (high)
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"eax", "edx"
|
||||||
|
);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
++ count;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
72
benchmarks/opencl/bfs/util.h
Executable file
72
benchmarks/opencl/bfs/util.h
Executable file
@@ -0,0 +1,72 @@
|
|||||||
|
#ifndef _C_UTIL_
|
||||||
|
#define _C_UTIL_
|
||||||
|
#include <math.h>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
//--initialize array with maximum limit
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
template<typename datatype>
|
||||||
|
void fill(datatype *A, const int n, const datatype maxi){
|
||||||
|
for (int j = 0; j < n; j++)
|
||||||
|
{
|
||||||
|
A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//--print matrix
|
||||||
|
template<typename datatype>
|
||||||
|
void print_matrix(datatype *A, int height, int width){
|
||||||
|
for(int i=0; i<height; i++){
|
||||||
|
for(int j=0; j<width; j++){
|
||||||
|
int idx = i*width + j;
|
||||||
|
std::cout<<A[idx]<<" ";
|
||||||
|
}
|
||||||
|
std::cout<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
//--verify results
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
#define MAX_RELATIVE_ERROR .002
|
||||||
|
template<typename datatype>
|
||||||
|
void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
|
||||||
|
|
||||||
|
char passed = true;
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (int i=0; i<size; i++){
|
||||||
|
if (fabs(cpuResults[i] - gpuResults[i]) / cpuResults[i] > MAX_RELATIVE_ERROR){
|
||||||
|
passed = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (passed){
|
||||||
|
std::cout << "--cambine:passed:-)" << endl;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
std::cout << "--cambine: failed:-(" << endl;
|
||||||
|
}
|
||||||
|
return ;
|
||||||
|
}
|
||||||
|
template<typename datatype>
|
||||||
|
void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
|
||||||
|
|
||||||
|
char passed = true;
|
||||||
|
//#pragma omp parallel for
|
||||||
|
for (int i=0; i<size; i++){
|
||||||
|
if (cpu_results[i]!=gpu_results[i]){
|
||||||
|
passed = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (passed){
|
||||||
|
std::cout << "--cambine:passed:-)" << endl;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
std::cout << "--cambine: failed:-(" << endl;
|
||||||
|
}
|
||||||
|
return ;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
429896
emulator/emulator.debug
429896
emulator/emulator.debug
File diff suppressed because it is too large
Load Diff
BIN
emulator/enc.o
BIN
emulator/enc.o
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -53,32 +53,18 @@ namespace Harp {
|
|||||||
};
|
};
|
||||||
|
|
||||||
class Core;
|
class Core;
|
||||||
// class ConsoleMemDevice : public MemDevice {
|
class ConsoleMemDevice : public MemDevice {
|
||||||
// public:
|
public:
|
||||||
// ConsoleMemDevice(Size wS, std::ostream &o, Core &core, bool batch = false);
|
ConsoleMemDevice(Size wS, std::ostream &o, Core &core, bool batch = false) {}
|
||||||
// ~ConsoleMemDevice() {}
|
~ConsoleMemDevice() {}
|
||||||
|
|
||||||
// //virtual Size wordSize() const { return wordSize; }
|
//virtual Size wordSize() const { return wordSize; }
|
||||||
// virtual Size size() const { return wordSize; }
|
virtual Size size() const { return 1; }
|
||||||
// virtual Word read(Addr) { pthread_mutex_lock(&cBufLock);
|
virtual Word read(Addr) { Word(5); }
|
||||||
// char c = cBuf.front();
|
virtual void write(Addr a, Word w) { }
|
||||||
// cBuf.pop();
|
|
||||||
// pthread_mutex_unlock(&cBufLock);
|
|
||||||
// return Word(c); }
|
|
||||||
// virtual void write(Addr a, Word w) { output << char(w); }
|
|
||||||
|
|
||||||
// void poll();
|
void poll() {}
|
||||||
|
};
|
||||||
// friend void *Harp::consoleInputThread(void *);
|
|
||||||
|
|
||||||
// private:
|
|
||||||
// std::ostream &output;
|
|
||||||
// Size wordSize;
|
|
||||||
// Core &core;
|
|
||||||
|
|
||||||
// std::queue<char> cBuf;
|
|
||||||
// pthread_mutex_t cBufLock;
|
|
||||||
// };
|
|
||||||
|
|
||||||
class DiskControllerMemDevice : public MemDevice {
|
class DiskControllerMemDevice : public MemDevice {
|
||||||
public:
|
public:
|
||||||
|
|||||||
@@ -578,9 +578,15 @@ void Instruction::executeOn(Warp &c) {
|
|||||||
reg[rdest] = ((immsrc << 12) & 0xfffff000) + (c.pc - 4);
|
reg[rdest] = ((immsrc << 12) & 0xfffff000) + (c.pc - 4);
|
||||||
break;
|
break;
|
||||||
case JAL_INST:
|
case JAL_INST:
|
||||||
//std::cout << "JAL_INST\n";
|
std::cout << "JAL_INST\n";
|
||||||
if (!pcSet) nextPc = (c.pc - 4) + immsrc;
|
if (!pcSet) nextPc = (c.pc - 4) + immsrc;
|
||||||
if (!pcSet) {/*std::cout << "JAL... SETTING PC: " << nextPc << "\n"; */}
|
|
||||||
|
if (!pcSet)
|
||||||
|
{
|
||||||
|
std::cout << "JAL... immsrc: " << hex << immsrc << "\n";
|
||||||
|
std::cout << "JAL... pc base: " << hex << (c.pc - 4) << "\n";
|
||||||
|
std::cout << "JAL... SETTING PC: " << nextPc << "\n";
|
||||||
|
}
|
||||||
if (rdest != 0)
|
if (rdest != 0)
|
||||||
{
|
{
|
||||||
reg[rdest] = c.pc;
|
reg[rdest] = c.pc;
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
emulator/mem.o
BIN
emulator/mem.o
Binary file not shown.
@@ -2,4 +2,5 @@ echo start > results.txt
|
|||||||
|
|
||||||
# echo ../kernel/vortex_test.hex
|
# echo ../kernel/vortex_test.hex
|
||||||
./harptool -E -a rv32i --core ../runtime/mains/simple/vx_simple_main.hex -s -b 1> emulator.debug
|
./harptool -E -a rv32i --core ../runtime/mains/simple/vx_simple_main.hex -s -b 1> emulator.debug
|
||||||
|
# ./harptool -E -a rv32i --core ../benchmarks/opencl/sgemm/sgemm.hex -s -b 1> emulator.debug
|
||||||
# ./harptool -E -a rv32i --core ../runtime/mains/vector_test/vx_vector_main.hex -s -b 1> emulator.debug
|
# ./harptool -E -a rv32i --core ../runtime/mains/vector_test/vx_vector_main.hex -s -b 1> emulator.debug
|
||||||
|
|||||||
70
rtl/VX_alu.v
70
rtl/VX_alu.v
@@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
`include "VX_define.v"
|
`include "VX_define.v"
|
||||||
|
|
||||||
module VX_alu(
|
module VX_alu(
|
||||||
@@ -13,6 +12,71 @@ module VX_alu(
|
|||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
`ifdef SYN_FUNC
|
||||||
|
wire which_in2;
|
||||||
|
|
||||||
|
wire[31:0] ALU_in1;
|
||||||
|
wire[31:0] ALU_in2;
|
||||||
|
wire[63:0] ALU_in1_mult;
|
||||||
|
wire[63:0] ALU_in2_mult;
|
||||||
|
wire[31:0] upper_immed;
|
||||||
|
wire[31:0] div_result;
|
||||||
|
wire[31:0] rem_result;
|
||||||
|
|
||||||
|
|
||||||
|
assign which_in2 = in_rs2_src == `RS2_IMMED;
|
||||||
|
|
||||||
|
assign ALU_in1 = in_1;
|
||||||
|
|
||||||
|
assign ALU_in2 = which_in2 ? in_itype_immed : in_2;
|
||||||
|
|
||||||
|
|
||||||
|
assign upper_immed = {in_upper_immed, {12{1'b0}}};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//always @(posedge `MUL) begin
|
||||||
|
|
||||||
|
|
||||||
|
/* verilator lint_off UNUSED */
|
||||||
|
|
||||||
|
|
||||||
|
wire[63:0] alu_in1_signed = {{32{ALU_in1[31]}}, ALU_in1};
|
||||||
|
wire[63:0] alu_in2_signed = {{32{ALU_in2[31]}}, ALU_in2};
|
||||||
|
assign ALU_in1_mult = (in_alu_op == `MULHU || in_alu_op == `DIVU || in_alu_op == `REMU) ? {32'b0, ALU_in1} : alu_in1_signed;
|
||||||
|
assign ALU_in2_mult = (in_alu_op == `MULHU || in_alu_op == `MULHSU || in_alu_op == `DIVU || in_alu_op == `REMU) ? {32'b0, ALU_in2} : alu_in2_signed;
|
||||||
|
wire[63:0] mult_result = ALU_in1_mult * ALU_in2_mult;
|
||||||
|
|
||||||
|
/* verilator lint_on UNUSED */
|
||||||
|
|
||||||
|
always @(in_alu_op or ALU_in1 or ALU_in2) begin
|
||||||
|
case(in_alu_op)
|
||||||
|
`ADD: out_alu_result = $signed(ALU_in1) + $signed(ALU_in2);
|
||||||
|
`SUB: out_alu_result = $signed(ALU_in1) - $signed(ALU_in2);
|
||||||
|
`SLLA: out_alu_result = ALU_in1 << ALU_in2[4:0];
|
||||||
|
`SLT: out_alu_result = ($signed(ALU_in1) < $signed(ALU_in2)) ? 32'h1 : 32'h0;
|
||||||
|
`SLTU: out_alu_result = ALU_in1 < ALU_in2 ? 32'h1 : 32'h0;
|
||||||
|
`XOR: out_alu_result = ALU_in1 ^ ALU_in2;
|
||||||
|
`SRL: out_alu_result = ALU_in1 >> ALU_in2[4:0];
|
||||||
|
`SRA: out_alu_result = $signed(ALU_in1) >>> ALU_in2[4:0];
|
||||||
|
`OR: out_alu_result = ALU_in1 | ALU_in2;
|
||||||
|
`AND: out_alu_result = ALU_in2 & ALU_in1;
|
||||||
|
`SUBU: out_alu_result = (ALU_in1 >= ALU_in2) ? 32'h0 : 32'hffffffff;
|
||||||
|
`LUI_ALU: out_alu_result = upper_immed;
|
||||||
|
`AUIPC_ALU: out_alu_result = $signed(in_curr_PC) + $signed(upper_immed);
|
||||||
|
`MUL: out_alu_result = mult_result[31:0];
|
||||||
|
`MULH: out_alu_result = mult_result[63:32];
|
||||||
|
`MULHSU: out_alu_result = mult_result[63:32];
|
||||||
|
`MULHU: out_alu_result = mult_result[63:32];
|
||||||
|
`DIV: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : $signed($signed(ALU_in1) / $signed(ALU_in2));
|
||||||
|
`DIVU: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : ALU_in1 / ALU_in2;
|
||||||
|
`REM: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : $signed($signed(ALU_in1) % $signed(ALU_in2));
|
||||||
|
`REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : ALU_in1 % ALU_in2;
|
||||||
|
default: out_alu_result = 32'h0;
|
||||||
|
endcase // in_alu_op
|
||||||
|
end
|
||||||
|
|
||||||
|
`else
|
||||||
wire which_in2;
|
wire which_in2;
|
||||||
|
|
||||||
wire[31:0] ALU_in1;
|
wire[31:0] ALU_in1;
|
||||||
@@ -69,7 +133,7 @@ module VX_alu(
|
|||||||
`REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : ALU_in1 % ALU_in2;
|
`REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : ALU_in1 % ALU_in2;
|
||||||
default: out_alu_result = 32'h0;
|
default: out_alu_result = 32'h0;
|
||||||
endcase // in_alu_op
|
endcase // in_alu_op
|
||||||
end
|
end
|
||||||
|
`endif
|
||||||
|
|
||||||
endmodule // VX_alu
|
endmodule // VX_alu
|
||||||
@@ -1,18 +1,18 @@
|
|||||||
|
`include "./VX_define_synth.v"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
`define NT 4
|
|
||||||
`define NT_M1 (`NT-1)
|
`define NT_M1 (`NT-1)
|
||||||
|
|
||||||
// NW_M1 is actually log2(NW)
|
// NW_M1 is actually log2(NW)
|
||||||
//`define NW_M1 (4-1)
|
|
||||||
|
|
||||||
`define NW 8
|
|
||||||
`define NW_M1 (`CLOG2(`NW))
|
`define NW_M1 (`CLOG2(`NW))
|
||||||
|
|
||||||
// Uncomment the below line if NW=1
|
// Uncomment the below line if NW=1
|
||||||
// `define ONLY
|
// `define ONLY
|
||||||
|
|
||||||
// `define SYN 1
|
// `define SYN 1
|
||||||
//`define ASIC 1
|
// `define ASIC 1
|
||||||
|
`define SYN_FUNC 1
|
||||||
|
|
||||||
`define NUM_BARRIERS 4
|
`define NUM_BARRIERS 4
|
||||||
|
|
||||||
|
|||||||
2
rtl/VX_define_synth.v
Normal file
2
rtl/VX_define_synth.v
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
`define NT 4
|
||||||
|
`define NW 8
|
||||||
156
rtl/VX_gpr.v
156
rtl/VX_gpr.v
@@ -85,83 +85,87 @@ module VX_gpr (
|
|||||||
|
|
||||||
wire[`NT_M1:0][31:0] to_write = (VX_writeback_inter.rd != 0) ? VX_writeback_inter.write_data : 0;
|
wire[`NT_M1:0][31:0] to_write = (VX_writeback_inter.rd != 0) ? VX_writeback_inter.write_data : 0;
|
||||||
|
|
||||||
/* verilator lint_off PINCONNECTEMPTY */
|
genvar curr_base_thread;
|
||||||
rf2_32x128_wm1 first_ram (
|
for (curr_base_thread = 0; curr_base_thread < 'NT; curr_base_thread=curr_base_thread+4)
|
||||||
.CENYA(),
|
begin
|
||||||
.AYA(),
|
/* verilator lint_off PINCONNECTEMPTY */
|
||||||
.CENYB(),
|
rf2_32x128_wm1 first_ram (
|
||||||
.WENYB(),
|
.CENYA(),
|
||||||
.AYB(),
|
.AYA(),
|
||||||
.QA(temp_a),
|
.CENYB(),
|
||||||
.SOA(),
|
.WENYB(),
|
||||||
.SOB(),
|
.AYB(),
|
||||||
.CLKA(clk),
|
.QA(temp_a[(curr_base_thread+3):(curr_base_thread)]),
|
||||||
.CENA(cena_1),
|
.SOA(),
|
||||||
.AA(VX_gpr_read.rs1),
|
.SOB(),
|
||||||
.CLKB(clk),
|
.CLKA(clk),
|
||||||
.CENB(cenb),
|
.CENA(cena_1),
|
||||||
.WENB(write_bit_mask),
|
.AA(VX_gpr_read.rs1[(curr_base_thread+3):(curr_base_thread)]),
|
||||||
.AB(VX_writeback_inter.rd),
|
.CLKB(clk),
|
||||||
.DB(to_write),
|
.CENB(cenb),
|
||||||
.EMAA(3'b011),
|
.WENB(write_bit_mask[(curr_base_thread+3):(curr_base_thread)]),
|
||||||
.EMASA(1'b0),
|
.AB(VX_writeback_inter.rd[(curr_base_thread+3):(curr_base_thread)]),
|
||||||
.EMAB(3'b011),
|
.DB(to_write[(curr_base_thread+3):(curr_base_thread)]),
|
||||||
.TENA(1'b1),
|
.EMAA(3'b011),
|
||||||
.TCENA(1'b0),
|
.EMASA(1'b0),
|
||||||
.TAA(5'b0),
|
.EMAB(3'b011),
|
||||||
.TENB(1'b1),
|
.TENA(1'b1),
|
||||||
.TCENB(1'b0),
|
.TCENA(1'b0),
|
||||||
.TWENB(128'b0),
|
.TAA(5'b0),
|
||||||
.TAB(5'b0),
|
.TENB(1'b1),
|
||||||
.TDB(128'b0),
|
.TCENB(1'b0),
|
||||||
.RET1N(1'b1),
|
.TWENB(128'b0),
|
||||||
.SIA(2'b0),
|
.TAB(5'b0),
|
||||||
.SEA(1'b0),
|
.TDB(128'b0),
|
||||||
.DFTRAMBYP(1'b0),
|
.RET1N(1'b1),
|
||||||
.SIB(2'b0),
|
.SIA(2'b0),
|
||||||
.SEB(1'b0),
|
.SEA(1'b0),
|
||||||
.COLLDISN(1'b1)
|
.DFTRAMBYP(1'b0),
|
||||||
);
|
.SIB(2'b0),
|
||||||
/* verilator lint_on PINCONNECTEMPTY */
|
.SEB(1'b0),
|
||||||
|
.COLLDISN(1'b1)
|
||||||
|
);
|
||||||
|
/* verilator lint_on PINCONNECTEMPTY */
|
||||||
|
|
||||||
/* verilator lint_off PINCONNECTEMPTY */
|
/* verilator lint_off PINCONNECTEMPTY */
|
||||||
rf2_32x128_wm1 second_ram (
|
rf2_32x128_wm1 second_ram (
|
||||||
.CENYA(),
|
.CENYA(),
|
||||||
.AYA(),
|
.AYA(),
|
||||||
.CENYB(),
|
.CENYB(),
|
||||||
.WENYB(),
|
.WENYB(),
|
||||||
.AYB(),
|
.AYB(),
|
||||||
.QA(temp_b),
|
.QA(temp_b[(curr_base_thread+3):(curr_base_thread)]),
|
||||||
.SOA(),
|
.SOA(),
|
||||||
.SOB(),
|
.SOB(),
|
||||||
.CLKA(clk),
|
.CLKA(clk),
|
||||||
.CENA(cena_2),
|
.CENA(cena_2),
|
||||||
.AA(VX_gpr_read.rs2),
|
.AA(VX_gpr_read.rs2[(curr_base_thread+3):(curr_base_thread)]),
|
||||||
.CLKB(clk),
|
.CLKB(clk),
|
||||||
.CENB(cenb),
|
.CENB(cenb),
|
||||||
.WENB(write_bit_mask),
|
.WENB(write_bit_mask[(curr_base_thread+3):(curr_base_thread)]),
|
||||||
.AB(VX_writeback_inter.rd),
|
.AB(VX_writeback_inter.rd[(curr_base_thread+3):(curr_base_thread)]),
|
||||||
.DB(to_write),
|
.DB(to_write[(curr_base_thread+3):(curr_base_thread)]),
|
||||||
.EMAA(3'b011),
|
.EMAA(3'b011),
|
||||||
.EMASA(1'b0),
|
.EMASA(1'b0),
|
||||||
.EMAB(3'b011),
|
.EMAB(3'b011),
|
||||||
.TENA(1'b1),
|
.TENA(1'b1),
|
||||||
.TCENA(1'b0),
|
.TCENA(1'b0),
|
||||||
.TAA(5'b0),
|
.TAA(5'b0),
|
||||||
.TENB(1'b1),
|
.TENB(1'b1),
|
||||||
.TCENB(1'b0),
|
.TCENB(1'b0),
|
||||||
.TWENB(128'b0),
|
.TWENB(128'b0),
|
||||||
.TAB(5'b0),
|
.TAB(5'b0),
|
||||||
.TDB(128'b0),
|
.TDB(128'b0),
|
||||||
.RET1N(1'b1),
|
.RET1N(1'b1),
|
||||||
.SIA(2'b0),
|
.SIA(2'b0),
|
||||||
.SEA(1'b0),
|
.SEA(1'b0),
|
||||||
.DFTRAMBYP(1'b0),
|
.DFTRAMBYP(1'b0),
|
||||||
.SIB(2'b0),
|
.SIB(2'b0),
|
||||||
.SEB(1'b0),
|
.SEB(1'b0),
|
||||||
.COLLDISN(1'b1)
|
.COLLDISN(1'b1)
|
||||||
);
|
);
|
||||||
/* verilator lint_on PINCONNECTEMPTY */
|
/* verilator lint_on PINCONNECTEMPTY */
|
||||||
|
end
|
||||||
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
|||||||
@@ -63,14 +63,40 @@ module VX_writeback (
|
|||||||
|
|
||||||
wire zero = 0;
|
wire zero = 0;
|
||||||
|
|
||||||
|
wire[`NT-1:0][31:0] use_wb_data;
|
||||||
|
|
||||||
|
reg prev_is_mem;
|
||||||
|
|
||||||
|
always @(posedge clk, posedge reset) begin
|
||||||
|
if (reset)
|
||||||
|
begin
|
||||||
|
prev_is_mem = 0;
|
||||||
|
end begin
|
||||||
|
prev_is_mem = mem_wb && !no_slot_mem;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
VX_generic_register #(.N(39 + `NW_M1 + 1 + `NT*33)) wb_register(
|
VX_generic_register #(.N(39 + `NW_M1 + 1 + `NT*33)) wb_register(
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset(reset),
|
.reset(reset),
|
||||||
.stall(zero),
|
.stall(zero),
|
||||||
.flush(zero),
|
.flush(zero),
|
||||||
.in ({VX_writeback_tempp.write_data, VX_writeback_tempp.wb_valid, VX_writeback_tempp.rd, VX_writeback_tempp.wb, VX_writeback_tempp.wb_warp_num, VX_writeback_tempp.wb_pc}),
|
.in ({VX_writeback_tempp.write_data, VX_writeback_tempp.wb_valid, VX_writeback_tempp.rd, VX_writeback_tempp.wb, VX_writeback_tempp.wb_warp_num, VX_writeback_tempp.wb_pc}),
|
||||||
.out ({VX_writeback_inter.write_data, VX_writeback_inter.wb_valid, VX_writeback_inter.rd, VX_writeback_inter.wb, VX_writeback_inter.wb_warp_num, VX_writeback_inter.wb_pc})
|
.out ({use_wb_data , VX_writeback_inter.wb_valid, VX_writeback_inter.rd, VX_writeback_inter.wb, VX_writeback_inter.wb_warp_num, VX_writeback_inter.wb_pc})
|
||||||
);
|
);
|
||||||
|
|
||||||
|
`ifdef SYN
|
||||||
|
assign VX_writeback_inter.write_data = prev_is_mem ? VX_writeback_tempp.write_data : use_wb_data;
|
||||||
|
`else
|
||||||
|
assign VX_writeback_inter.write_data = use_wb_data;
|
||||||
|
`endif
|
||||||
|
|
||||||
|
|
||||||
|
endmodule // VX_writeback
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
endmodule // VX_writeback
|
|
||||||
8
rtl/cache/VX_d_cache.v
vendored
8
rtl/cache/VX_d_cache.v
vendored
@@ -304,9 +304,15 @@ module VX_d_cache
|
|||||||
// 0;
|
// 0;
|
||||||
|
|
||||||
wire[1:0] byte_select = bank_addr[1:0];
|
wire[1:0] byte_select = bank_addr[1:0];
|
||||||
|
wire[TAG_SIZE_END:TAG_SIZE_START] cache_tag = bank_addr[ADDR_TAG_END:ADDR_TAG_START];
|
||||||
|
|
||||||
|
`ifdef SYN_FUNC
|
||||||
|
wire[OFFSET_SIZE_END:OFFSET_SIZE_START] cache_offset = 0;
|
||||||
|
wire[IND_SIZE_END:IND_SIZE_START] cache_index = 0;
|
||||||
|
`else
|
||||||
wire[OFFSET_SIZE_END:OFFSET_SIZE_START] cache_offset = bank_addr[ADDR_OFFSET_END:ADDR_OFFSET_START];
|
wire[OFFSET_SIZE_END:OFFSET_SIZE_START] cache_offset = bank_addr[ADDR_OFFSET_END:ADDR_OFFSET_START];
|
||||||
wire[IND_SIZE_END:IND_SIZE_START] cache_index = bank_addr[ADDR_IND_END:ADDR_IND_START];
|
wire[IND_SIZE_END:IND_SIZE_START] cache_index = bank_addr[ADDR_IND_END:ADDR_IND_START];
|
||||||
wire[TAG_SIZE_END:TAG_SIZE_START] cache_tag = bank_addr[ADDR_TAG_END:ADDR_TAG_START];
|
`endif
|
||||||
|
|
||||||
|
|
||||||
wire normal_valid_in = valid_per_bank[bank_id];
|
wire normal_valid_in = valid_per_bank[bank_id];
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ SRC = \
|
|||||||
vortex_dpi.cpp \
|
vortex_dpi.cpp \
|
||||||
vortex_tb.v \
|
vortex_tb.v \
|
||||||
../VX_define.v \
|
../VX_define.v \
|
||||||
|
../VX_define_synth.v \
|
||||||
../interfaces/VX_branch_response_inter.v \
|
../interfaces/VX_branch_response_inter.v \
|
||||||
../interfaces/VX_csr_req_inter.v \
|
../interfaces/VX_csr_req_inter.v \
|
||||||
../interfaces/VX_csr_wb_inter.v \
|
../interfaces/VX_csr_wb_inter.v \
|
||||||
@@ -78,7 +79,9 @@ SRC = \
|
|||||||
../../models/memory/cln28hpm/rf2_128x128_wm1/rf2_128x128_wm1.v \
|
../../models/memory/cln28hpm/rf2_128x128_wm1/rf2_128x128_wm1.v \
|
||||||
../../models/memory/cln28hpm/rf2_256x128_wm1/rf2_256x128_wm1.v \
|
../../models/memory/cln28hpm/rf2_256x128_wm1/rf2_256x128_wm1.v \
|
||||||
../../models/memory/cln28hpm/rf2_256x19_wm0/rf2_256x19_wm0.v \
|
../../models/memory/cln28hpm/rf2_256x19_wm0/rf2_256x19_wm0.v \
|
||||||
../../models/memory/cln28hpm/rf2_32x128_wm1/rf2_32x128_wm1.v
|
../../models/memory/cln28hpm/rf2_32x128_wm1/rf2_32x128_wm1.v \
|
||||||
|
../../models/memory/cln28hpm/rf2_32x19_wm0/rf2_32x19_wm0.v
|
||||||
|
|
||||||
# ../../models/memory/cln28hpc/rf2_32x128_wm1/rf2_32x128_wm1.v
|
# ../../models/memory/cln28hpc/rf2_32x128_wm1/rf2_32x128_wm1.v
|
||||||
|
|
||||||
# vortex_dpi.h
|
# vortex_dpi.h
|
||||||
|
|||||||
20
runtime/mains/vecadd/vecadd.cl
Normal file
20
runtime/mains/vecadd/vecadd.cl
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
|
||||||
|
#include "../../intrinsics/vx_intrinsics.h"
|
||||||
|
|
||||||
|
kernel void
|
||||||
|
vecadd (__global const int *a,
|
||||||
|
__global const int *b,
|
||||||
|
__global int *c)
|
||||||
|
{
|
||||||
|
int gid = get_global_id(0);
|
||||||
|
|
||||||
|
__if (gid < 2)
|
||||||
|
{
|
||||||
|
c[gid] = a[gid] + b[gid];
|
||||||
|
}
|
||||||
|
__else
|
||||||
|
{
|
||||||
|
c[gid] = b[gid] - a[gid];
|
||||||
|
}
|
||||||
|
__endif
|
||||||
|
}
|
||||||
@@ -5,13 +5,13 @@
|
|||||||
// sftp, ftp or ftps
|
// sftp, ftp or ftps
|
||||||
"type": "sftp",
|
"type": "sftp",
|
||||||
|
|
||||||
"save_before_upload": true,
|
"save_before_upload": false,
|
||||||
"upload_on_save": true,
|
"upload_on_save": false,
|
||||||
"sync_down_on_open": false,
|
"sync_down_on_open": false,
|
||||||
"sync_skip_deletes": false,
|
"sync_skip_deletes": false,
|
||||||
"sync_same_age": true,
|
"sync_same_age": false,
|
||||||
"confirm_downloads": false,
|
"confirm_downloads": false,
|
||||||
"confirm_sync": true,
|
"confirm_sync": false,
|
||||||
"confirm_overwrite_newer": false,
|
"confirm_overwrite_newer": false,
|
||||||
|
|
||||||
"host": "ece-rschsrv01.ece.gatech.edu",
|
"host": "ece-rschsrv01.ece.gatech.edu",
|
||||||
|
|||||||
@@ -405,11 +405,11 @@ void Core::fetch()
|
|||||||
printTrace(&inst_in_fetch, "Fetch");
|
printTrace(&inst_in_fetch, "Fetch");
|
||||||
|
|
||||||
// #ifdef PRINT_ACTIVE_THREADS
|
// #ifdef PRINT_ACTIVE_THREADS
|
||||||
// for (unsigned j = 0; j < w[schedule_w].tmask.size(); ++j) {
|
for (unsigned j = 0; j < w[schedule_w].tmask.size(); ++j) {
|
||||||
// if (w[schedule_w].activeThreads > j && w[schedule_w].tmask[j]) cout << " 1";
|
if (w[schedule_w].activeThreads > j && w[schedule_w].tmask[j]) cout << " 1";
|
||||||
// else cout << " 0";
|
else cout << " 0";
|
||||||
// if (j != w[schedule_w].tmask.size()-1 || schedule_w != w.size()-1) cout << ',';
|
if (j != w[schedule_w].tmask.size()-1 || schedule_w != w.size()-1) cout << ',';
|
||||||
// }
|
}
|
||||||
// #endif
|
// #endif
|
||||||
|
|
||||||
|
|
||||||
@@ -430,7 +430,7 @@ void Core::decode()
|
|||||||
INIT_TRACE(inst_in_fetch);
|
INIT_TRACE(inst_in_fetch);
|
||||||
}
|
}
|
||||||
|
|
||||||
printTrace(&inst_in_decode, "Decode");
|
//printTrace(&inst_in_decode, "Decode");
|
||||||
}
|
}
|
||||||
|
|
||||||
void Core::scheduler()
|
void Core::scheduler()
|
||||||
@@ -442,7 +442,7 @@ void Core::scheduler()
|
|||||||
INIT_TRACE(inst_in_decode);
|
INIT_TRACE(inst_in_decode);
|
||||||
}
|
}
|
||||||
|
|
||||||
printTrace(&inst_in_scheduler, "scheduler");
|
//printTrace(&inst_in_scheduler, "scheduler");
|
||||||
}
|
}
|
||||||
|
|
||||||
void Core::load_store()
|
void Core::load_store()
|
||||||
@@ -496,7 +496,7 @@ void Core::load_store()
|
|||||||
|
|
||||||
if (inst_in_lsu.mem_stall_cycles > 0) inst_in_lsu.mem_stall_cycles--;
|
if (inst_in_lsu.mem_stall_cycles > 0) inst_in_lsu.mem_stall_cycles--;
|
||||||
|
|
||||||
printTrace(&inst_in_lsu, "LSU");
|
//printTrace(&inst_in_lsu, "LSU");
|
||||||
}
|
}
|
||||||
|
|
||||||
void Core::execute_unit()
|
void Core::execute_unit()
|
||||||
@@ -548,7 +548,7 @@ void Core::execute_unit()
|
|||||||
|
|
||||||
// }
|
// }
|
||||||
|
|
||||||
printTrace(&inst_in_exe, "execute_unit");
|
//printTrace(&inst_in_exe, "execute_unit");
|
||||||
// INIT_TRACE(inst_in_exe);
|
// INIT_TRACE(inst_in_exe);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -604,7 +604,7 @@ void Core::writeback()
|
|||||||
|
|
||||||
// if (!serviced_exe && !serviced_mem) INIT_TRACE(inst_in_wb);
|
// if (!serviced_exe && !serviced_mem) INIT_TRACE(inst_in_wb);
|
||||||
|
|
||||||
printTrace(&inst_in_wb, "Writeback");
|
//printTrace(&inst_in_wb, "Writeback");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -712,12 +712,12 @@ void Warp::step(trace_inst_t * trace_inst) {
|
|||||||
|
|
||||||
|
|
||||||
// At Debug Level 3, print debug info after each instruction.
|
// At Debug Level 3, print debug info after each instruction.
|
||||||
#ifdef USE_DEBUG
|
// #ifdef USE_DEBUG
|
||||||
if (USE_DEBUG >= 3) {
|
// if (USE_DEBUG >= 3) {
|
||||||
D(3, "Register state:");
|
D(3, "Register state:");
|
||||||
for (unsigned i = 0; i < reg[0].size(); ++i) {
|
for (unsigned i = 0; i < reg[0].size(); ++i) {
|
||||||
D_RAW(" %r" << setfill(' ') << setw(2) << dec << i << ':');
|
D_RAW(" %r" << setfill(' ') << setw(2) << dec << i << ':');
|
||||||
for (unsigned j = 0; j < reg.size(); ++j)
|
for (unsigned j = 0; j < (this->activeThreads); ++j)
|
||||||
D_RAW(' ' << setfill('0') << setw(8) << hex << reg[j][i] << setfill(' ') << ' ');
|
D_RAW(' ' << setfill('0') << setw(8) << hex << reg[j][i] << setfill(' ') << ' ');
|
||||||
D_RAW('(' << shadowReg[i] << ')' << endl);
|
D_RAW('(' << shadowReg[i] << ')' << endl);
|
||||||
}
|
}
|
||||||
@@ -729,8 +729,8 @@ void Warp::step(trace_inst_t * trace_inst) {
|
|||||||
D_RAW(endl);
|
D_RAW(endl);
|
||||||
D_RAW(endl);
|
D_RAW(endl);
|
||||||
D_RAW(endl);
|
D_RAW(endl);
|
||||||
}
|
// }
|
||||||
#endif
|
// #endif
|
||||||
|
|
||||||
// Clean up.
|
// Clean up.
|
||||||
delete inst;
|
delete inst;
|
||||||
|
|||||||
@@ -219,7 +219,13 @@ Instruction *WordDecoder::decode(const std::vector<Byte> &v, Size &idx, trace_in
|
|||||||
|
|
||||||
imeed = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20);
|
imeed = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20);
|
||||||
|
|
||||||
inst.setSrcImm(signExt(imeed, 20, j_immed_mask));
|
if (bit_20)
|
||||||
|
{
|
||||||
|
imeed |= ~j_immed_mask;
|
||||||
|
}
|
||||||
|
|
||||||
|
// inst.setSrcImm(signExt(imeed, 20, j_immed_mask));
|
||||||
|
inst.setSrcImm(imeed);
|
||||||
usedImm = true;
|
usedImm = true;
|
||||||
|
|
||||||
trace_inst->valid_inst = true;
|
trace_inst->valid_inst = true;
|
||||||
|
|||||||
@@ -1508,6 +1508,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
case 2:
|
case 2:
|
||||||
{
|
{
|
||||||
Word VLMAX = (c.vtype.vlmul * c.VLEN)/c.vtype.vsew;
|
Word VLMAX = (c.vtype.vlmul * c.VLEN)/c.vtype.vsew;
|
||||||
|
|
||||||
switch(func6){
|
switch(func6){
|
||||||
case 24: //vmandnot
|
case 24: //vmandnot
|
||||||
{
|
{
|
||||||
@@ -1532,6 +1533,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
} else if(c.vtype.vsew == 16) {
|
} else if(c.vtype.vsew == 16) {
|
||||||
for(uint16_t i = 0; i < c.vl; i++){
|
for(uint16_t i = 0; i < c.vl; i++){
|
||||||
uint16_t *first_ptr = (uint16_t *)vr1[i].val;
|
uint16_t *first_ptr = (uint16_t *)vr1[i].val;
|
||||||
@@ -1549,6 +1551,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
} else if(c.vtype.vsew == 32) {
|
} else if(c.vtype.vsew == 32) {
|
||||||
for(uint32_t i = 0; i < c.vl; i++){
|
for(uint32_t i = 0; i < c.vl; i++){
|
||||||
uint32_t *first_ptr = (uint32_t *)vr1[i].val;
|
uint32_t *first_ptr = (uint32_t *)vr1[i].val;
|
||||||
@@ -1565,6 +1568,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
uint32_t *result_ptr = (uint32_t *) vd[i].val;
|
uint32_t *result_ptr = (uint32_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -1603,6 +1607,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
uint16_t * result_ptr = (uint16_t *) vd[i].val;
|
uint16_t * result_ptr = (uint16_t *) vd[i].val;
|
||||||
*result_ptr = result;
|
*result_ptr = result;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(uint16_t i = c.vl; i < VLMAX; i++){
|
for(uint16_t i = c.vl; i < VLMAX; i++){
|
||||||
uint16_t *result_ptr = (uint16_t *) vd[i].val;
|
uint16_t *result_ptr = (uint16_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
@@ -1620,6 +1625,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
uint32_t * result_ptr = (uint32_t *) vd[i].val;
|
uint32_t * result_ptr = (uint32_t *) vd[i].val;
|
||||||
*result_ptr = result;
|
*result_ptr = result;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(Word i = c.vl; i < VLMAX; i++){
|
for(Word i = c.vl; i < VLMAX; i++){
|
||||||
uint32_t *result_ptr = (uint32_t *) vd[i].val;
|
uint32_t *result_ptr = (uint32_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
@@ -1667,7 +1673,6 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
result_ptr = (uint16_t *) vd[i].val;
|
result_ptr = (uint16_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if(c.vtype.vsew == 32) {
|
} else if(c.vtype.vsew == 32) {
|
||||||
uint32_t *result_ptr;
|
uint32_t *result_ptr;
|
||||||
for(uint32_t i = 0; i < c.vl; i++){
|
for(uint32_t i = 0; i < c.vl; i++){
|
||||||
@@ -1692,11 +1697,11 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
case 27: //vmxor
|
case 27: //vmxor
|
||||||
{
|
{
|
||||||
D(3, "vmxor");
|
D(3, "vmxor");
|
||||||
uint8_t *result_ptr;
|
|
||||||
vector<Reg<char *>> vr1 = c.vreg[rsrc[0]];
|
vector<Reg<char *>> vr1 = c.vreg[rsrc[0]];
|
||||||
vector<Reg<char *>> vr2 = c.vreg[rsrc[1]];
|
vector<Reg<char *>> vr2 = c.vreg[rsrc[1]];
|
||||||
vector<Reg<char *>> vd = c.vreg[rdest];
|
vector<Reg<char *>> vd = c.vreg[rdest];
|
||||||
if(c.vtype.vsew == 8){
|
if(c.vtype.vsew == 8){
|
||||||
|
uint8_t *result_ptr;
|
||||||
for(uint8_t i = 0; i < c.vl; i++){
|
for(uint8_t i = 0; i < c.vl; i++){
|
||||||
uint8_t *first_ptr = (uint8_t *)vr1[i].val;
|
uint8_t *first_ptr = (uint8_t *)vr1[i].val;
|
||||||
uint8_t *second_ptr = (uint8_t *)vr2[i].val;
|
uint8_t *second_ptr = (uint8_t *)vr2[i].val;
|
||||||
@@ -1704,7 +1709,6 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
uint8_t second_value = (*second_ptr & 0x1);
|
uint8_t second_value = (*second_ptr & 0x1);
|
||||||
uint8_t result = (first_value ^ second_value);
|
uint8_t result = (first_value ^ second_value);
|
||||||
cout << "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result << '\n';
|
cout << "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result << '\n';
|
||||||
|
|
||||||
result_ptr = (uint8_t *) vd[i].val;
|
result_ptr = (uint8_t *) vd[i].val;
|
||||||
*result_ptr = result;
|
*result_ptr = result;
|
||||||
}
|
}
|
||||||
@@ -1712,7 +1716,6 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
result_ptr = (uint8_t *) vd[i].val;
|
result_ptr = (uint8_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if(c.vtype.vsew == 16) {
|
} else if(c.vtype.vsew == 16) {
|
||||||
uint16_t *result_ptr;
|
uint16_t *result_ptr;
|
||||||
for(uint16_t i = 0; i < c.vl; i++){
|
for(uint16_t i = 0; i < c.vl; i++){
|
||||||
@@ -1733,6 +1736,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
|
|
||||||
} else if(c.vtype.vsew == 32) {
|
} else if(c.vtype.vsew == 32) {
|
||||||
uint32_t *result_ptr;
|
uint32_t *result_ptr;
|
||||||
|
|
||||||
for(uint32_t i = 0; i < c.vl; i++){
|
for(uint32_t i = 0; i < c.vl; i++){
|
||||||
uint32_t *first_ptr = (uint32_t *)vr1[i].val;
|
uint32_t *first_ptr = (uint32_t *)vr1[i].val;
|
||||||
uint32_t *second_ptr = (uint32_t *)vr2[i].val;
|
uint32_t *second_ptr = (uint32_t *)vr2[i].val;
|
||||||
@@ -1773,7 +1777,6 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
uint8_t *result_ptr = (uint8_t *) vd[i].val;
|
uint8_t *result_ptr = (uint8_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if(c.vtype.vsew == 16) {
|
} else if(c.vtype.vsew == 16) {
|
||||||
for(uint16_t i = 0; i < c.vl; i++){
|
for(uint16_t i = 0; i < c.vl; i++){
|
||||||
uint16_t *first_ptr = (uint16_t *)vr1[i].val;
|
uint16_t *first_ptr = (uint16_t *)vr1[i].val;
|
||||||
@@ -1845,6 +1848,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
uint16_t * result_ptr = (uint16_t *) vd[i].val;
|
uint16_t * result_ptr = (uint16_t *) vd[i].val;
|
||||||
*result_ptr = result;
|
*result_ptr = result;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(uint16_t i = c.vl; i < VLMAX; i++){
|
for(uint16_t i = c.vl; i < VLMAX; i++){
|
||||||
uint16_t *result_ptr = (uint16_t *) vd[i].val;
|
uint16_t *result_ptr = (uint16_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
@@ -1862,10 +1866,12 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
uint32_t * result_ptr = (uint32_t *) vd[i].val;
|
uint32_t * result_ptr = (uint32_t *) vd[i].val;
|
||||||
*result_ptr = result;
|
*result_ptr = result;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(Word i = c.vl; i < VLMAX; i++){
|
for(Word i = c.vl; i < VLMAX; i++){
|
||||||
uint32_t *result_ptr = (uint32_t *) vd[i].val;
|
uint32_t *result_ptr = (uint32_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -1877,6 +1883,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
vector<Reg<char *>> vd = c.vreg[rdest];
|
vector<Reg<char *>> vd = c.vreg[rdest];
|
||||||
if(c.vtype.vsew == 8){
|
if(c.vtype.vsew == 8){
|
||||||
uint8_t *result_ptr;
|
uint8_t *result_ptr;
|
||||||
|
|
||||||
for(uint8_t i = 0; i < c.vl; i++){
|
for(uint8_t i = 0; i < c.vl; i++){
|
||||||
uint8_t *first_ptr = (uint8_t *)vr1[i].val;
|
uint8_t *first_ptr = (uint8_t *)vr1[i].val;
|
||||||
uint8_t *second_ptr = (uint8_t *)vr2[i].val;
|
uint8_t *second_ptr = (uint8_t *)vr2[i].val;
|
||||||
@@ -1892,7 +1899,6 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
result_ptr = (uint8_t *) vd[i].val;
|
result_ptr = (uint8_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if(c.vtype.vsew == 16) {
|
} else if(c.vtype.vsew == 16) {
|
||||||
for(uint16_t i = 0; i < c.vl; i++){
|
for(uint16_t i = 0; i < c.vl; i++){
|
||||||
uint16_t *first_ptr = (uint16_t *)vr1[i].val;
|
uint16_t *first_ptr = (uint16_t *)vr1[i].val;
|
||||||
@@ -1927,6 +1933,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
uint32_t *result_ptr = (uint32_t *) vd[i].val;
|
uint32_t *result_ptr = (uint32_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -1934,6 +1941,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
{
|
{
|
||||||
D(3, "vmxnor");
|
D(3, "vmxnor");
|
||||||
uint8_t *result_ptr;
|
uint8_t *result_ptr;
|
||||||
|
|
||||||
vector<Reg<char *>> vr1 = c.vreg[rsrc[0]];
|
vector<Reg<char *>> vr1 = c.vreg[rsrc[0]];
|
||||||
vector<Reg<char *>> vr2 = c.vreg[rsrc[1]];
|
vector<Reg<char *>> vr2 = c.vreg[rsrc[1]];
|
||||||
vector<Reg<char *>> vd = c.vreg[rdest];
|
vector<Reg<char *>> vd = c.vreg[rdest];
|
||||||
@@ -1953,7 +1961,8 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
result_ptr = (uint8_t *) vd[i].val;
|
result_ptr = (uint8_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
}
|
}
|
||||||
} else if(c.vtype.vsew == 16) {
|
}
|
||||||
|
else if(c.vtype.vsew == 16) {
|
||||||
uint16_t *result_ptr;
|
uint16_t *result_ptr;
|
||||||
for(uint16_t i = 0; i < c.vl; i++){
|
for(uint16_t i = 0; i < c.vl; i++){
|
||||||
uint16_t *first_ptr = (uint16_t *)vr1[i].val;
|
uint16_t *first_ptr = (uint16_t *)vr1[i].val;
|
||||||
@@ -1973,6 +1982,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
|
|
||||||
} else if(c.vtype.vsew == 32) {
|
} else if(c.vtype.vsew == 32) {
|
||||||
uint32_t *result_ptr;
|
uint32_t *result_ptr;
|
||||||
|
|
||||||
for(uint32_t i = 0; i < c.vl; i++){
|
for(uint32_t i = 0; i < c.vl; i++){
|
||||||
uint32_t *first_ptr = (uint32_t *)vr1[i].val;
|
uint32_t *first_ptr = (uint32_t *)vr1[i].val;
|
||||||
uint32_t *second_ptr = (uint32_t *)vr2[i].val;
|
uint32_t *second_ptr = (uint32_t *)vr2[i].val;
|
||||||
@@ -1988,6 +1998,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
result_ptr = (uint32_t *) vd[i].val;
|
result_ptr = (uint32_t *) vd[i].val;
|
||||||
*result_ptr = 0;
|
*result_ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|||||||
32
syn/Makefile
32
syn/Makefile
@@ -1,7 +1,33 @@
|
|||||||
|
|
||||||
|
|
||||||
all: syn
|
SCRIPT_DIR=./scripts
|
||||||
|
|
||||||
|
all: dc
|
||||||
|
|
||||||
|
|
||||||
syn:
|
#syn:
|
||||||
dc_shell-t -f fsyn.tcl 2>&1 | tee vortex_syn.log
|
#dc_shell-t -f esyn.tcl 2>&1 | tee vortex_syn.log
|
||||||
|
#dc_shell -f esyn.tcl 2>&1 | tee vortex_syn.log
|
||||||
|
#dc_shell -f $(SCRIPT_DIR)/dc/dc_script.tcl
|
||||||
|
|
||||||
|
dc:
|
||||||
|
rm -rf rpt
|
||||||
|
mkdir rpt
|
||||||
|
dc_shell -f esyn.tcl 2>&1 | tee vortex_syn.log
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f simv
|
||||||
|
rm -f *.vcd
|
||||||
|
rm -f *.key
|
||||||
|
rm -rf csrc/
|
||||||
|
rm -rf *.rpt
|
||||||
|
rm -rf *.log
|
||||||
|
rm -rf *.svf
|
||||||
|
rm -rf *.ddc
|
||||||
|
rm -rf results_synthesized.v
|
||||||
|
rm -rf results_synthesized.sdc
|
||||||
|
rm -rf alib-52/
|
||||||
|
rm -rf rpt/
|
||||||
|
rm -rf simv.daidir/
|
||||||
|
rm -rf encounter*
|
||||||
|
rm -rf ./synth_out
|
||||||
53
syn/esyn.tcl
Normal file
53
syn/esyn.tcl
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
#set search_path [concat /nethome/dshim8/Desktop/GTCAD-3DPKG-v3/example/tech/cln28hpm/2d_db/ /nethome/dshim8/Desktop/GTCAD-3DPKG-v3/example/tech/cln28hpm/2d_hard_db/ ../rtl/ ../rtl/interfaces ../rtl/pipe_regs ../rtl/shared_memory ../rtl/cache ../models/memory/cln28hpm/2d_hardmacro_db]
|
||||||
|
set search_path [concat ../rtl/ ../rtl/interfaces ../rtl/pipe_regs ../rtl/shared_memory ../rtl/cache ../models/memory/cln28hpm/2d_hardmacro_db]
|
||||||
|
set link_library [concat ./NanGate_15nm_OCL.db]
|
||||||
|
set symbol_library {}
|
||||||
|
set target_library [concat ./NanGate_15nm_OCL.db]
|
||||||
|
|
||||||
|
set verilog_files [ list VX_countones.v VX_priority_encoder_w_mask.v VX_dram_req_rsp_inter.v VX_cache_data_per_index.v VX_Cache_Bank.v VX_cache_data.v VX_d_cache.v VX_bank_valids.v VX_priority_encoder_sm.v VX_shared_memory.v VX_shared_memory_block.v VX_dmem_controller.v VX_generic_priority_encoder.v VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_define_synth.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_priority_encoder.v VX_warp_scheduler.v VX_writeback.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v Vortex.v VX_cache_bank_valid.v \
|
||||||
|
]
|
||||||
|
# set verilog_files [ list Vortex.v VX_countones.v VX_priority_encoder_w_mask.v VX_dram_req_rsp_inter.v cache_set.v VX_Cache_Bank.v VX_Cache_Block_DM.v VX_cache_data.v VX_d_cache.v VX_generic_pc.v VX_bank_valids.v VX_priority_encoder_sm.v VX_shared_memory.v VX_shared_memory_block.v VX_dmem_controller.v VX_generic_priority_encoder.v VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp_scheduler.v VX_writeback.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \
|
||||||
|
# ]
|
||||||
|
|
||||||
|
set top_level Vortex
|
||||||
|
analyze -format sverilog $verilog_files
|
||||||
|
#analyze -format sverilog -error=LINT-66 $verilog_files
|
||||||
|
elaborate Vortex
|
||||||
|
link
|
||||||
|
|
||||||
|
set clk_freq 0.4
|
||||||
|
set clk_period [expr 1000.0 / $clk_freq / 1.0]
|
||||||
|
create_clock [get_ports clk] -period $clk_period
|
||||||
|
set_max_fanout 20 [get_ports clk]
|
||||||
|
set_ideal_network [get_ports clk]
|
||||||
|
|
||||||
|
set_max_fanout 20 [get_ports reset]
|
||||||
|
set_false_path -from [get_ports reset]
|
||||||
|
all_high_fanout -net -threshold 20
|
||||||
|
|
||||||
|
# set_register_merging Vortex FALSE
|
||||||
|
# set compile_seqmap_propagate_constants false
|
||||||
|
# set compile_seqmap_propagate_high_effort false
|
||||||
|
|
||||||
|
check_design
|
||||||
|
compile_ultra -no_autoungroup
|
||||||
|
ungroup -all -flatten
|
||||||
|
uniquify
|
||||||
|
|
||||||
|
define_name_rules verilog -remove_internal_net_bus -remove_port_bus
|
||||||
|
change_names -rule verilog -hierarchy
|
||||||
|
|
||||||
|
# report_qor
|
||||||
|
report_area
|
||||||
|
report_hierarchy
|
||||||
|
report_cell
|
||||||
|
report_reference
|
||||||
|
report_port
|
||||||
|
report_power
|
||||||
|
|
||||||
|
write -hierarchy -format verilog -output Vortex.netlist.v
|
||||||
|
remove_ideal_network [get_ports clk]
|
||||||
|
set_propagated_clock [get_ports clk]
|
||||||
|
write_sdc -version 1.9 Vortex.sdc
|
||||||
|
write_file -format ddc -output Vortex.ddc
|
||||||
|
exit
|
||||||
28
syn/run_mult_synth.sh
Normal file
28
syn/run_mult_synth.sh
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set top_level = Vortex
|
||||||
|
|
||||||
|
source /tools/synopsys/synthesis/j201409/cshrc.syn
|
||||||
|
set cur_dir = `pwd`
|
||||||
|
echo $cur_dir
|
||||||
|
|
||||||
|
for number_of_warps in 2 4 8 16 32; do
|
||||||
|
for number_of_threads in 2 4 8 16 32; do
|
||||||
|
|
||||||
|
echo "Warp Count: $number_of_warps Thread Count: $number_of_threads Launched"
|
||||||
|
echo "\`define NT $number_of_threads" > ../rtl/VX_define_synth.v
|
||||||
|
echo "\`define NW $number_of_warps" >> ../rtl/VX_define_synth.v
|
||||||
|
make dc | tee run.log 1>/dev/null
|
||||||
|
sleep 30
|
||||||
|
moved_filename="${number_of_warps}_Warps__${number_of_threads}_threads__400MHz.log"
|
||||||
|
mv ./vortex_syn.log ../../$moved_filename
|
||||||
|
sleep 30
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
echo "Warp Count: $number_of_warps Thread Count: $number_of_threads Finished"
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
|
||||||
|
echo "Done!"
|
||||||
Reference in New Issue
Block a user