diff --git a/benchmarks/opencl/bfs/CLHelper.h b/benchmarks/opencl/bfs/CLHelper.h new file mode 100755 index 00000000..b9a873e4 --- /dev/null +++ b/benchmarks/opencl/bfs/CLHelper.h @@ -0,0 +1,814 @@ +//------------------------------------------ +//--cambine:helper function for OpenCL +//--programmer: Jianbin Fang +//--date: 27/12/2010 +//------------------------------------------ +#ifndef _CL_HELPER_ +#define _CL_HELPER_ + +#include +#include +#include +#include +#include + +using std::string; +using std::ifstream; +using std::cerr; +using std::endl; +using std::cout; +//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable +#define WORK_DIM 2 //work-items dimensions + +struct oclHandleStruct +{ + cl_context context; + cl_device_id *devices; + cl_command_queue queue; + cl_program program; + cl_int cl_status; + std::string error_str; + std::vector kernel; +}; + +struct oclHandleStruct oclHandles; + +char kernel_file[100] = "Kernels.cl"; +int total_kernels = 2; +string kernel_names[2] = {"BFS_1", "BFS_2"}; +int work_group_size = 512; +int device_id_inused = 0; //deviced id used (default : 0) + +/* + * Converts the contents of a file into a string + */ +string FileToString(const string fileName) +{ + ifstream f(fileName.c_str(), ifstream::in | ifstream::binary); + + try + { + size_t size; + char* str; + string s; + + if(f.is_open()) + { + size_t fileSize; + f.seekg(0, ifstream::end); + size = fileSize = f.tellg(); + f.seekg(0, ifstream::beg); + + str = new char[size+1]; + if (!str) throw(string("Could not allocate memory")); + + f.read(str, fileSize); + f.close(); + str[size] = '\0'; + + s = str; + delete [] str; + return s; + } + } + catch(std::string msg) + { + cerr << "Exception caught in FileToString(): " << msg << endl; + if(f.is_open()) + f.close(); + } + catch(...) + { + cerr << "Exception caught in FileToString()" << endl; + if(f.is_open()) + f.close(); + } + string errorMsg = "FileToString()::Error: Unable to open file " + + fileName; + throw(errorMsg); +} +//--------------------------------------- +//Read command line parameters +// +void _clCmdParams(int argc, char* argv[]){ + for (int i =0; i < argc; ++i) + { + switch (argv[i][1]) + { + case 'g': //--g stands for size of work group + if (++i < argc) + { + sscanf(argv[i], "%u", &work_group_size); + } + else + { + std::cerr << "Could not read argument after option " << argv[i-1] << std::endl; + throw; + } + break; + case 'd': //--d stands for device id used in computaion + if (++i < argc) + { + sscanf(argv[i], "%u", &device_id_inused); + } + else + { + std::cerr << "Could not read argument after option " << argv[i-1] << std::endl; + throw; + } + break; + default: + ; + } + } + +} + +//--------------------------------------- +//Initlize CL objects +//--description: there are 5 steps to initialize all the OpenCL objects needed +//--revised on 04/01/2011: get the number of devices and +// devices have no relationship with context +void _clInit() +{ + int DEVICE_ID_INUSED = device_id_inused; + cl_int resultCL; + + oclHandles.context = NULL; + oclHandles.devices = NULL; + oclHandles.queue = NULL; + oclHandles.program = NULL; + + cl_uint deviceListSize; + + //----------------------------------------------- + //--cambine-1: find the available platforms and select one + + cl_uint numPlatforms; + cl_platform_id targetPlatform = NULL; + + resultCL = clGetPlatformIDs(0, NULL, &numPlatforms); + if (resultCL != CL_SUCCESS) + throw (string("InitCL()::Error: Getting number of platforms (clGetPlatformIDs)")); + //printf("number of platforms:%d\n",numPlatforms); //by cambine + + if (!(numPlatforms > 0)) + throw (string("InitCL()::Error: No platforms found (clGetPlatformIDs)")); + + cl_platform_id* allPlatforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id)); + + resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL); + if (resultCL != CL_SUCCESS) + throw (string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)")); + + /* Select the target platform. Default: first platform */ + targetPlatform = allPlatforms[0]; + for (int i = 0; i < numPlatforms; i++) + { + char pbuff[128]; + resultCL = clGetPlatformInfo( allPlatforms[i], + CL_PLATFORM_VENDOR, + sizeof(pbuff), + pbuff, + NULL); + if (resultCL != CL_SUCCESS) + throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)")); + + //printf("vedor is %s\n",pbuff); + + } + free(allPlatforms); + + //----------------------------------------------- + //--cambine-2: create an OpenCL context + cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)targetPlatform, 0 }; + oclHandles.context = clCreateContextFromType(cprops, + CL_DEVICE_TYPE_GPU, + NULL, + NULL, + &resultCL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL)) + throw (string("InitCL()::Error: Creating Context (clCreateContextFromType)")); + //----------------------------------------------- + //--cambine-3: detect OpenCL devices + /* First, get the size of device list */ + oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceListSize); + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(string("exception in _clInit -> clGetDeviceIDs")); + } + if (deviceListSize == 0) + throw(string("InitCL()::Error: No devices found.")); + + //std::cout<<"device number:"< clGetDeviceIDs-2")); + } + //----------------------------------------------- + //--cambine-4: Create an OpenCL command queue + oclHandles.queue = clCreateCommandQueue(oclHandles.context, + oclHandles.devices[DEVICE_ID_INUSED], + 0, + &resultCL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL)) + throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)")); + //----------------------------------------------- + //--cambine-5: Load CL file, build CL program object, create CL kernel object + std::string source_str = FileToString(kernel_file); + const char * source = source_str.c_str(); + size_t sourceSize[] = { source_str.length() }; + + oclHandles.program = clCreateProgramWithSource(oclHandles.context, + 1, + &source, + sourceSize, + &resultCL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) + throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)")); + //insert debug information + //std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines + //options += " -cl-nv-opt-level=3"; + resultCL = clBuildProgram(oclHandles.program, deviceListSize, oclHandles.devices, NULL, NULL,NULL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) + { + cerr << "InitCL()::Error: In clBuildProgram" << endl; + + size_t length; + resultCL = clGetProgramBuildInfo(oclHandles.program, + oclHandles.devices[DEVICE_ID_INUSED], + CL_PROGRAM_BUILD_LOG, + 0, + NULL, + &length); + if(resultCL != CL_SUCCESS) + throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)")); + + char* buffer = (char*)malloc(length); + resultCL = clGetProgramBuildInfo(oclHandles.program, + oclHandles.devices[DEVICE_ID_INUSED], + CL_PROGRAM_BUILD_LOG, + length, + buffer, + NULL); + if(resultCL != CL_SUCCESS) + throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)")); + + cerr << buffer << endl; + free(buffer); + + throw(string("InitCL()::Error: Building Program (clBuildProgram)")); + } + + //get program information in intermediate representation + #ifdef PTX_MSG + size_t binary_sizes[deviceListSize]; + char * binaries[deviceListSize]; + //figure out number of devices and the sizes of the binary for each device. + oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*deviceListSize, &binary_sizes, NULL ); + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2")); + } + + std::cout<<"--cambine:"< clGetProgramInfo-3")); + } + for(int i=0;i getting resource information")); + } + + build_log = (char *)malloc(ret_val_size+1); + oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(string("exceptions in _InitCL -> getting resources allocation information-2")); + } + build_log[ret_val_size] = '\0'; + std::cout<<"--cambine:"<d_mem_pinned")); + #endif + //------------ + d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, \ + size, NULL, &oclHandles.cl_status); + #ifdef ERRMSG + if(oclHandles.cl_status != CL_SUCCESS) + throw(string("excpetion in _clCreateAndCpyMem() -> d_mem ")); + #endif + //---------- + h_mem_pinned = (cl_float *)clEnqueueMapBuffer(oclHandles.queue, d_mem_pinned, CL_TRUE, \ + CL_MAP_WRITE, 0, size, 0, NULL, \ + NULL, &oclHandles.cl_status); + #ifdef ERRMSG + if(oclHandles.cl_status != CL_SUCCESS) + throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer")); + #endif + int element_number = size/sizeof(float); + #pragma omp parallel for + for(int i=0;i clEnqueueWriteBuffer")); + #endif + + return d_mem; +} + + +//-------------------------------------------------------- +//--cambine:create write only buffer on device +cl_mem _clMallocWO(int size) throw(string){ + cl_mem d_mem; + d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0, &oclHandles.cl_status); + #ifdef ERRMSG + if(oclHandles.cl_status != CL_SUCCESS) + throw(string("excpetion in _clCreateMem()")); + #endif + return d_mem; +} + +//-------------------------------------------------------- +//transfer data from device to host +void _clMemcpyD2H(cl_mem d_mem, int size, void * h_mem) throw(string){ + oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem, 0,0,0); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clCpyMemD2H -> "; + switch(oclHandles.cl_status){ + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_INVALID_CONTEXT: + oclHandles.error_str += "CL_INVALID_CONTEXT"; + break; + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_INVALID_VALUE: + oclHandles.error_str += "CL_INVALID_VALUE"; + break; + case CL_INVALID_EVENT_WAIT_LIST: + oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST"; + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif +} + +//-------------------------------------------------------- +//set kernel arguments +void _clSetArgs(int kernel_id, int arg_idx, void * d_mem, int size = 0) throw(string){ + if(!size){ + oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, sizeof(d_mem), &d_mem); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clSetKernelArg() "; + switch(oclHandles.cl_status){ + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_ARG_INDEX: + oclHandles.error_str += "CL_INVALID_ARG_INDEX"; + break; + case CL_INVALID_ARG_VALUE: + oclHandles.error_str += "CL_INVALID_ARG_VALUE"; + break; + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_INVALID_SAMPLER: + oclHandles.error_str += "CL_INVALID_SAMPLER"; + break; + case CL_INVALID_ARG_SIZE: + oclHandles.error_str += "CL_INVALID_ARG_SIZE"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + } + else{ + oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clSetKernelArg() "; + switch(oclHandles.cl_status){ + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_ARG_INDEX: + oclHandles.error_str += "CL_INVALID_ARG_INDEX"; + break; + case CL_INVALID_ARG_VALUE: + oclHandles.error_str += "CL_INVALID_ARG_VALUE"; + break; + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_INVALID_SAMPLER: + oclHandles.error_str += "CL_INVALID_SAMPLER"; + break; + case CL_INVALID_ARG_SIZE: + oclHandles.error_str += "CL_INVALID_ARG_SIZE"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + } +} +void _clFinish() throw(string){ + oclHandles.cl_status = clFinish(oclHandles.queue); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clFinish"; + switch(oclHandles.cl_status){ + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reasons"; + break; + + } + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(oclHandles.error_str); + } + #endif +} +//-------------------------------------------------------- +//--cambine:enqueue kernel +void _clInvokeKernel(int kernel_id, int work_items, int work_group_size) throw(string){ + cl_uint work_dim = WORK_DIM; + cl_event e[1]; + if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size + work_items = work_items + (work_group_size-(work_items%work_group_size)); + size_t local_work_size[] = {work_group_size, 1}; + size_t global_work_size[] = {work_items, 1}; + oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \ + global_work_size, local_work_size, 0 , 0, &(e[0]) ); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clInvokeKernel() -> "; + switch(oclHandles.cl_status) + { + case CL_INVALID_PROGRAM_EXECUTABLE: + oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE"; + break; + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_CONTEXT: + oclHandles.error_str += "CL_INVALID_CONTEXT"; + break; + case CL_INVALID_KERNEL_ARGS: + oclHandles.error_str += "CL_INVALID_KERNEL_ARGS"; + break; + case CL_INVALID_WORK_DIMENSION: + oclHandles.error_str += "CL_INVALID_WORK_DIMENSION"; + break; + case CL_INVALID_GLOBAL_WORK_SIZE: + oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE"; + break; + case CL_INVALID_WORK_GROUP_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE"; + break; + case CL_INVALID_WORK_ITEM_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE"; + break; + case CL_INVALID_GLOBAL_OFFSET: + oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + break; + case CL_INVALID_EVENT_WAIT_LIST: + oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unkown reseason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + //_clFinish(); + // oclHandles.cl_status = clWaitForEvents(1, &e[0]); + // #ifdef ERRMSG + // if (oclHandles.cl_status!= CL_SUCCESS) + // throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents")); + // #endif +} +void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x, int group_y) throw(string){ + cl_uint work_dim = WORK_DIM; + size_t local_work_size[] = {group_x, group_y}; + size_t global_work_size[] = {range_x, range_y}; + cl_event e[1]; + /*if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size + work_items = work_items + (work_group_size-(work_items%work_group_size));*/ + oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \ + global_work_size, local_work_size, 0 , 0, &(e[0]) ); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clInvokeKernel() -> "; + switch(oclHandles.cl_status) + { + case CL_INVALID_PROGRAM_EXECUTABLE: + oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE"; + break; + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_CONTEXT: + oclHandles.error_str += "CL_INVALID_CONTEXT"; + break; + case CL_INVALID_KERNEL_ARGS: + oclHandles.error_str += "CL_INVALID_KERNEL_ARGS"; + break; + case CL_INVALID_WORK_DIMENSION: + oclHandles.error_str += "CL_INVALID_WORK_DIMENSION"; + break; + case CL_INVALID_GLOBAL_WORK_SIZE: + oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE"; + break; + case CL_INVALID_WORK_GROUP_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE"; + break; + case CL_INVALID_WORK_ITEM_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE"; + break; + case CL_INVALID_GLOBAL_OFFSET: + oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + break; + case CL_INVALID_EVENT_WAIT_LIST: + oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unkown reseason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + //_clFinish(); + /*oclHandles.cl_status = clWaitForEvents(1, &e[0]); + + #ifdef ERRMSG + + if (oclHandles.cl_status!= CL_SUCCESS) + + throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents")); + + #endif*/ +} + +//-------------------------------------------------------- +//release OpenCL objects +void _clFree(cl_mem ob) throw(string){ + if(ob!=NULL) + oclHandles.cl_status = clReleaseMemObject(ob); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clFree() ->"; + switch(oclHandles.cl_status) + { + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unkown reseason"; + break; + } + if (oclHandles.cl_status!= CL_SUCCESS) + throw(oclHandles.error_str); + #endif +} +#endif //_CL_HELPER_ diff --git a/benchmarks/opencl/bfs/Makefile b/benchmarks/opencl/bfs/Makefile index 5ad87059..360d3e86 100644 --- a/benchmarks/opencl/bfs/Makefile +++ b/benchmarks/opencl/bfs/Makefile @@ -29,12 +29,20 @@ CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sectio LIBS = -lOpenCL +<<<<<<< HEAD PROJECT=saxpy +======= +PROJECT=bfs +>>>>>>> f3700051a4da6cd017e5ce41f2732f3fc3e86e2d all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl +<<<<<<< HEAD POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl +======= + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl +>>>>>>> f3700051a4da6cd017e5ce41f2732f3fc3e86e2d $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf diff --git a/benchmarks/opencl/bfs/kernel.cl b/benchmarks/opencl/bfs/kernel.cl new file mode 100755 index 00000000..51ce5a08 --- /dev/null +++ b/benchmarks/opencl/bfs/kernel.cl @@ -0,0 +1,53 @@ +/* ============================================================ +//--cambine: kernel funtion of Breadth-First-Search +//--author: created by Jianbin Fang +//--date: 06/12/2010 +============================================================ */ + +//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable + +//Structure to hold a node information +typedef struct{ + int starting; + int no_of_edges; +} Node; + +//--7 parameters +__kernel void BFS_1( const __global Node* g_graph_nodes, + const __global int* g_graph_edges, + __global char* g_graph_mask, + __global char* g_updating_graph_mask, + __global char* g_graph_visited, + __global int* g_cost, + const int no_of_nodes){ + int tid = get_global_id(0); + if( tid +#include +#include +#include + +#ifdef PROFILING +#include "timer.h" +#endif + +#include "CLHelper.h" +#include "util.h" + +#define MAX_THREADS_PER_BLOCK 256 + +//Structure to hold a node information +struct Node +{ + int starting; + int no_of_edges; +}; + + +//---------------------------------------------------------- +//--bfs on cpu +//--programmer: jianbin +//--date: 26/01/2011 +//--note: width is changed to the new_width +//---------------------------------------------------------- +void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ + int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ + char *h_graph_visited, int *h_cost_ref){ + char stop; + int k = 0; + do{ + //if no thread changes this value then the loop stops + stop=false; + for(int tid = 0; tid < no_of_nodes; tid++ ) + { + if (h_graph_mask[tid] == true){ + h_graph_mask[tid]=false; + for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++){ + int id = h_graph_edges[i]; //--cambine: node id is connected with node tid + if(!h_graph_visited[id]){ //--cambine: if node id has not been visited, enter the body below + h_cost_ref[id]=h_cost_ref[tid]+1; + h_updating_graph_mask[id]=true; + } + } + } + } + + for(int tid=0; tid< no_of_nodes ; tid++ ) + { + if (h_updating_graph_mask[tid] == true){ + h_graph_mask[tid]=true; + h_graph_visited[tid]=true; + stop=true; + h_updating_graph_mask[tid]=false; + } + } + k++; + } + while(stop); +} +//---------------------------------------------------------- +//--breadth first search on GPUs +//---------------------------------------------------------- +void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ + int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ + char *h_graph_visited, int *h_cost) + throw(std::string){ + + //int number_elements = height*width; + char h_over; + cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ + d_graph_visited, d_cost, d_over; + try{ + //--1 transfer data from host to device + _clInit(); + d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); + d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); + d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask); + d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask); + d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited); + + + d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); + d_over = _clMallocRW(sizeof(char), &h_over); + + _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); + _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); + _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask); + _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask); + _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited); + _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); + + //--2 invoke kernel +#ifdef PROFILING + timer kernel_timer; + double kernel_time = 0.0; + kernel_timer.reset(); + kernel_timer.start(); +#endif + do{ + h_over = false; + _clMemcpyH2D(d_over, sizeof(char), &h_over); + //--kernel 0 + int kernel_id = 0; + int kernel_idx = 0; + _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); + _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); + _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); + _clSetArgs(kernel_id, kernel_idx++, d_cost); + _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); + + //int work_items = no_of_nodes; + _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); + + //--kernel 1 + kernel_id = 1; + kernel_idx = 0; + _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); + _clSetArgs(kernel_id, kernel_idx++, d_over); + _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); + + //work_items = no_of_nodes; + _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); + + _clMemcpyD2H(d_over,sizeof(char), &h_over); + }while(h_over); + + _clFinish(); +#ifdef PROFILING + kernel_timer.stop(); + kernel_time = kernel_timer.getTimeInSeconds(); +#endif + //--3 transfer data from device to host + _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); + //--statistics +#ifdef PROFILING + std::cout<<"kernel time(s):"<\n", argv[0]); + +} +//---------------------------------------------------------- +//--cambine: main function +//--author: created by Jianbin Fang +//--date: 25/01/2011 +//---------------------------------------------------------- +int main(int argc, char * argv[]) +{ + int no_of_nodes; + int edge_list_size; + FILE *fp; + Node* h_graph_nodes; + char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited; + try{ + char *input_f; + if(argc!=2){ + Usage(argc, argv); + exit(0); + } + + input_f = argv[1]; + printf("Reading File\n"); + //Read in Graph from a file + fp = fopen(input_f,"r"); + if(!fp){ + printf("Error Reading graph file\n"); + return 0; + } + + int source = 0; + + fscanf(fp,"%d",&no_of_nodes); + + int num_of_blocks = 1; + int num_of_threads_per_block = no_of_nodes; + + //Make execution Parameters according to the number of nodes + //Distribute threads across multiple Blocks if necessary + if(no_of_nodes>MAX_THREADS_PER_BLOCK){ + num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK); + num_of_threads_per_block = MAX_THREADS_PER_BLOCK; + } + work_group_size = num_of_threads_per_block; + // allocate host memory + h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes); + h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes); + h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes); + h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes); + + int start, edgeno; + // initalize the memory + for(int i = 0; i < no_of_nodes; i++){ + fscanf(fp,"%d %d",&start,&edgeno); + h_graph_nodes[i].starting = start; + h_graph_nodes[i].no_of_edges = edgeno; + h_graph_mask[i]=false; + h_updating_graph_mask[i]=false; + h_graph_visited[i]=false; + } + //read the source node from the file + fscanf(fp,"%d",&source); + source=0; + //set the source node as true in the mask + h_graph_mask[source]=true; + h_graph_visited[source]=true; + fscanf(fp,"%d",&edge_list_size); + int id,cost; + int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size); + for(int i=0; i < edge_list_size ; i++){ + fscanf(fp,"%d",&id); + fscanf(fp,"%d",&cost); + h_graph_edges[i] = id; + } + + if(fp) + fclose(fp); + // allocate mem for the result on host side + int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes); + int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes); + for(int i=0;i(h_cost_ref, h_cost, no_of_nodes); + //release host memory + free(h_graph_nodes); + free(h_graph_mask); + free(h_updating_graph_mask); + free(h_graph_visited); + + } + catch(std::string msg){ + std::cout<<"--cambine: exception in main ->"< +#include +#include +#include + +#include "timer.h" + + +using namespace std; + +double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz(); + + +double timer::get_CPU_speed_in_MHz() +{ +#if defined __linux__ + ifstream infile("/proc/cpuinfo"); + char buffer[256], *colon; + + while (infile.good()) { + infile.getline(buffer, 256); + + if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0) + return atof(colon + 2); + } +#endif + + return 0.0; +} + + +void timer::print_time(ostream &str, const char *which, double time) const +{ + static const char *units[] = { " ns", " us", " ms", " s", " ks", 0 }; + const char **unit = units; + + time = 1000.0 * time / CPU_speed_in_MHz; + + while (time >= 999.5 && unit[1] != 0) { + time /= 1000.0; + ++ unit; + } + + str << which << " = " << setprecision(3) << setw(4) << time << *unit; +} + + +ostream &timer::print(ostream &str) +{ + str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right; + + if (CPU_speed_in_MHz == 0) + str << "could not determine CPU speed\n"; + else if (count > 0) { + double total = static_cast(total_time); + + print_time(str, "avg", total / static_cast(count)); + print_time(str, ", total", total); + str << ", count = " << setw(9) << count << '\n'; + } + else + str << "not used\n"; + + return str; +} + + +ostream &operator << (ostream &str, class timer &timer) +{ + return timer.print(str); +} + +double timer::getTimeInSeconds() +{ + double total = static_cast(total_time); + double res = (total / 1000000.0) / CPU_speed_in_MHz; + return res; +} diff --git a/benchmarks/opencl/bfs/timer.h b/benchmarks/opencl/bfs/timer.h new file mode 100755 index 00000000..e5efdc18 --- /dev/null +++ b/benchmarks/opencl/bfs/timer.h @@ -0,0 +1,128 @@ +#ifndef timer_h +#define timer_h + +#include + + +class timer { + public: + timer(const char *name = 0); + timer(const char *name, std::ostream &write_on_exit); + + ~timer(); + + void start(), stop(); + void reset(); + std::ostream &print(std::ostream &); + + double getTimeInSeconds(); + + private: + void print_time(std::ostream &, const char *which, double time) const; + + union { + long long total_time; + struct { +#if defined __PPC__ + int high, low; +#else + int low, high; +#endif + }; + }; + + unsigned long long count; + const char *const name; + std::ostream *const write_on_exit; + + static double CPU_speed_in_MHz, get_CPU_speed_in_MHz(); +}; + + +std::ostream &operator << (std::ostream &, class timer &); + + +inline void timer::reset() +{ + total_time = 0; + count = 0; +} + + +inline timer::timer(const char *name) +: + name(name), + write_on_exit(0) +{ + reset(); +} + + +inline timer::timer(const char *name, std::ostream &write_on_exit) +: + name(name), + write_on_exit(&write_on_exit) +{ + reset(); +} + + +inline timer::~timer() +{ + if (write_on_exit != 0) + print(*write_on_exit); +} + + +inline void timer::start() +{ +#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) + unsigned eax, edx; + + asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); + + total_time -= ((unsigned long long) edx << 32) + eax; +#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) + asm volatile + ( + "rdtsc\n\t" + "subl %%eax, %0\n\t" + "sbbl %%edx, %1" + : + "+m" (low), "+m" (high) + : + : + "eax", "edx" + ); +#else +#error Compiler/Architecture not recognized +#endif +} + + +inline void timer::stop() +{ +#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) + unsigned eax, edx; + + asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); + + total_time += ((unsigned long long) edx << 32) + eax; +#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) + asm volatile + ( + "rdtsc\n\t" + "addl %%eax, %0\n\t" + "adcl %%edx, %1" + : + "+m" (low), "+m" (high) + : + : + "eax", "edx" + ); +#endif + + ++ count; +} + +#endif diff --git a/benchmarks/opencl/bfs/util.h b/benchmarks/opencl/bfs/util.h new file mode 100755 index 00000000..425edfba --- /dev/null +++ b/benchmarks/opencl/bfs/util.h @@ -0,0 +1,72 @@ +#ifndef _C_UTIL_ +#define _C_UTIL_ +#include +#include + +//------------------------------------------------------------------- +//--initialize array with maximum limit +//------------------------------------------------------------------- +template +void fill(datatype *A, const int n, const datatype maxi){ + for (int j = 0; j < n; j++) + { + A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f))); + } +} + +//--print matrix +template +void print_matrix(datatype *A, int height, int width){ + for(int i=0; i +void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){ + + char passed = true; +#pragma omp parallel for + for (int i=0; i MAX_RELATIVE_ERROR){ + passed = false; + } + } + if (passed){ + std::cout << "--cambine:passed:-)" << endl; + } + else{ + std::cout << "--cambine: failed:-(" << endl; + } + return ; +} +template +void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){ + + char passed = true; +//#pragma omp parallel for + for (int i=0; i cBuf; - // pthread_mutex_t cBufLock; - // }; + void poll() {} + }; class DiskControllerMemDevice : public MemDevice { public: diff --git a/emulator/instruction.cpp b/emulator/instruction.cpp index 60b47626..e3e0882d 100644 --- a/emulator/instruction.cpp +++ b/emulator/instruction.cpp @@ -578,9 +578,15 @@ void Instruction::executeOn(Warp &c) { reg[rdest] = ((immsrc << 12) & 0xfffff000) + (c.pc - 4); break; case JAL_INST: - //std::cout << "JAL_INST\n"; + std::cout << "JAL_INST\n"; if (!pcSet) nextPc = (c.pc - 4) + immsrc; - if (!pcSet) {/*std::cout << "JAL... SETTING PC: " << nextPc << "\n"; */} + + if (!pcSet) + { + std::cout << "JAL... immsrc: " << hex << immsrc << "\n"; + std::cout << "JAL... pc base: " << hex << (c.pc - 4) << "\n"; + std::cout << "JAL... SETTING PC: " << nextPc << "\n"; + } if (rdest != 0) { reg[rdest] = c.pc; diff --git a/emulator/instruction.o b/emulator/instruction.o index af5ce43c..0b7430f1 100644 Binary files a/emulator/instruction.o and b/emulator/instruction.o differ diff --git a/emulator/libharplib.a b/emulator/libharplib.a index 85501c38..f58011a5 100644 Binary files a/emulator/libharplib.a and b/emulator/libharplib.a differ diff --git a/emulator/libharplib.so b/emulator/libharplib.so index bc71b37e..129bff5e 100755 Binary files a/emulator/libharplib.so and b/emulator/libharplib.so differ diff --git a/emulator/mem.o b/emulator/mem.o index 5c2520c3..9887ab2b 100644 Binary files a/emulator/mem.o and b/emulator/mem.o differ diff --git a/emulator/test_riscv.sh b/emulator/test_riscv.sh index 03130780..08fb9eb5 100755 --- a/emulator/test_riscv.sh +++ b/emulator/test_riscv.sh @@ -2,4 +2,5 @@ echo start > results.txt # echo ../kernel/vortex_test.hex ./harptool -E -a rv32i --core ../runtime/mains/simple/vx_simple_main.hex -s -b 1> emulator.debug +# ./harptool -E -a rv32i --core ../benchmarks/opencl/sgemm/sgemm.hex -s -b 1> emulator.debug # ./harptool -E -a rv32i --core ../runtime/mains/vector_test/vx_vector_main.hex -s -b 1> emulator.debug diff --git a/rtl/VX_alu.v b/rtl/VX_alu.v index 3b308297..9688aad2 100644 --- a/rtl/VX_alu.v +++ b/rtl/VX_alu.v @@ -1,4 +1,3 @@ - `include "VX_define.v" module VX_alu( @@ -13,6 +12,71 @@ module VX_alu( ); + `ifdef SYN_FUNC + wire which_in2; + + wire[31:0] ALU_in1; + wire[31:0] ALU_in2; + wire[63:0] ALU_in1_mult; + wire[63:0] ALU_in2_mult; + wire[31:0] upper_immed; + wire[31:0] div_result; + wire[31:0] rem_result; + + + assign which_in2 = in_rs2_src == `RS2_IMMED; + + assign ALU_in1 = in_1; + + assign ALU_in2 = which_in2 ? in_itype_immed : in_2; + + + assign upper_immed = {in_upper_immed, {12{1'b0}}}; + + + + //always @(posedge `MUL) begin + + + /* verilator lint_off UNUSED */ + + + wire[63:0] alu_in1_signed = {{32{ALU_in1[31]}}, ALU_in1}; + wire[63:0] alu_in2_signed = {{32{ALU_in2[31]}}, ALU_in2}; + assign ALU_in1_mult = (in_alu_op == `MULHU || in_alu_op == `DIVU || in_alu_op == `REMU) ? {32'b0, ALU_in1} : alu_in1_signed; + assign ALU_in2_mult = (in_alu_op == `MULHU || in_alu_op == `MULHSU || in_alu_op == `DIVU || in_alu_op == `REMU) ? {32'b0, ALU_in2} : alu_in2_signed; + wire[63:0] mult_result = ALU_in1_mult * ALU_in2_mult; + + /* verilator lint_on UNUSED */ + + always @(in_alu_op or ALU_in1 or ALU_in2) begin + case(in_alu_op) + `ADD: out_alu_result = $signed(ALU_in1) + $signed(ALU_in2); + `SUB: out_alu_result = $signed(ALU_in1) - $signed(ALU_in2); + `SLLA: out_alu_result = ALU_in1 << ALU_in2[4:0]; + `SLT: out_alu_result = ($signed(ALU_in1) < $signed(ALU_in2)) ? 32'h1 : 32'h0; + `SLTU: out_alu_result = ALU_in1 < ALU_in2 ? 32'h1 : 32'h0; + `XOR: out_alu_result = ALU_in1 ^ ALU_in2; + `SRL: out_alu_result = ALU_in1 >> ALU_in2[4:0]; + `SRA: out_alu_result = $signed(ALU_in1) >>> ALU_in2[4:0]; + `OR: out_alu_result = ALU_in1 | ALU_in2; + `AND: out_alu_result = ALU_in2 & ALU_in1; + `SUBU: out_alu_result = (ALU_in1 >= ALU_in2) ? 32'h0 : 32'hffffffff; + `LUI_ALU: out_alu_result = upper_immed; + `AUIPC_ALU: out_alu_result = $signed(in_curr_PC) + $signed(upper_immed); + `MUL: out_alu_result = mult_result[31:0]; + `MULH: out_alu_result = mult_result[63:32]; + `MULHSU: out_alu_result = mult_result[63:32]; + `MULHU: out_alu_result = mult_result[63:32]; + `DIV: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : $signed($signed(ALU_in1) / $signed(ALU_in2)); + `DIVU: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : ALU_in1 / ALU_in2; + `REM: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : $signed($signed(ALU_in1) % $signed(ALU_in2)); + `REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : ALU_in1 % ALU_in2; + default: out_alu_result = 32'h0; + endcase // in_alu_op + end + + `else wire which_in2; wire[31:0] ALU_in1; @@ -69,7 +133,7 @@ module VX_alu( `REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : ALU_in1 % ALU_in2; default: out_alu_result = 32'h0; endcase // in_alu_op - end - + end + `endif endmodule // VX_alu \ No newline at end of file diff --git a/rtl/VX_define.v b/rtl/VX_define.v index 8f77fdb3..809ff759 100644 --- a/rtl/VX_define.v +++ b/rtl/VX_define.v @@ -1,18 +1,18 @@ +`include "./VX_define_synth.v" + + -`define NT 4 `define NT_M1 (`NT-1) // NW_M1 is actually log2(NW) -//`define NW_M1 (4-1) - -`define NW 8 `define NW_M1 (`CLOG2(`NW)) // Uncomment the below line if NW=1 // `define ONLY // `define SYN 1 -//`define ASIC 1 +// `define ASIC 1 +`define SYN_FUNC 1 `define NUM_BARRIERS 4 diff --git a/rtl/VX_define_synth.v b/rtl/VX_define_synth.v new file mode 100644 index 00000000..0444fe94 --- /dev/null +++ b/rtl/VX_define_synth.v @@ -0,0 +1,2 @@ +`define NT 4 +`define NW 8 diff --git a/rtl/VX_gpr.v b/rtl/VX_gpr.v index 890e3041..6f239c51 100644 --- a/rtl/VX_gpr.v +++ b/rtl/VX_gpr.v @@ -85,83 +85,87 @@ module VX_gpr ( wire[`NT_M1:0][31:0] to_write = (VX_writeback_inter.rd != 0) ? VX_writeback_inter.write_data : 0; - /* verilator lint_off PINCONNECTEMPTY */ - rf2_32x128_wm1 first_ram ( - .CENYA(), - .AYA(), - .CENYB(), - .WENYB(), - .AYB(), - .QA(temp_a), - .SOA(), - .SOB(), - .CLKA(clk), - .CENA(cena_1), - .AA(VX_gpr_read.rs1), - .CLKB(clk), - .CENB(cenb), - .WENB(write_bit_mask), - .AB(VX_writeback_inter.rd), - .DB(to_write), - .EMAA(3'b011), - .EMASA(1'b0), - .EMAB(3'b011), - .TENA(1'b1), - .TCENA(1'b0), - .TAA(5'b0), - .TENB(1'b1), - .TCENB(1'b0), - .TWENB(128'b0), - .TAB(5'b0), - .TDB(128'b0), - .RET1N(1'b1), - .SIA(2'b0), - .SEA(1'b0), - .DFTRAMBYP(1'b0), - .SIB(2'b0), - .SEB(1'b0), - .COLLDISN(1'b1) - ); - /* verilator lint_on PINCONNECTEMPTY */ + genvar curr_base_thread; + for (curr_base_thread = 0; curr_base_thread < 'NT; curr_base_thread=curr_base_thread+4) + begin + /* verilator lint_off PINCONNECTEMPTY */ + rf2_32x128_wm1 first_ram ( + .CENYA(), + .AYA(), + .CENYB(), + .WENYB(), + .AYB(), + .QA(temp_a[(curr_base_thread+3):(curr_base_thread)]), + .SOA(), + .SOB(), + .CLKA(clk), + .CENA(cena_1), + .AA(VX_gpr_read.rs1[(curr_base_thread+3):(curr_base_thread)]), + .CLKB(clk), + .CENB(cenb), + .WENB(write_bit_mask[(curr_base_thread+3):(curr_base_thread)]), + .AB(VX_writeback_inter.rd[(curr_base_thread+3):(curr_base_thread)]), + .DB(to_write[(curr_base_thread+3):(curr_base_thread)]), + .EMAA(3'b011), + .EMASA(1'b0), + .EMAB(3'b011), + .TENA(1'b1), + .TCENA(1'b0), + .TAA(5'b0), + .TENB(1'b1), + .TCENB(1'b0), + .TWENB(128'b0), + .TAB(5'b0), + .TDB(128'b0), + .RET1N(1'b1), + .SIA(2'b0), + .SEA(1'b0), + .DFTRAMBYP(1'b0), + .SIB(2'b0), + .SEB(1'b0), + .COLLDISN(1'b1) + ); + /* verilator lint_on PINCONNECTEMPTY */ - /* verilator lint_off PINCONNECTEMPTY */ - rf2_32x128_wm1 second_ram ( - .CENYA(), - .AYA(), - .CENYB(), - .WENYB(), - .AYB(), - .QA(temp_b), - .SOA(), - .SOB(), - .CLKA(clk), - .CENA(cena_2), - .AA(VX_gpr_read.rs2), - .CLKB(clk), - .CENB(cenb), - .WENB(write_bit_mask), - .AB(VX_writeback_inter.rd), - .DB(to_write), - .EMAA(3'b011), - .EMASA(1'b0), - .EMAB(3'b011), - .TENA(1'b1), - .TCENA(1'b0), - .TAA(5'b0), - .TENB(1'b1), - .TCENB(1'b0), - .TWENB(128'b0), - .TAB(5'b0), - .TDB(128'b0), - .RET1N(1'b1), - .SIA(2'b0), - .SEA(1'b0), - .DFTRAMBYP(1'b0), - .SIB(2'b0), - .SEB(1'b0), - .COLLDISN(1'b1) - ); - /* verilator lint_on PINCONNECTEMPTY */ + /* verilator lint_off PINCONNECTEMPTY */ + rf2_32x128_wm1 second_ram ( + .CENYA(), + .AYA(), + .CENYB(), + .WENYB(), + .AYB(), + .QA(temp_b[(curr_base_thread+3):(curr_base_thread)]), + .SOA(), + .SOB(), + .CLKA(clk), + .CENA(cena_2), + .AA(VX_gpr_read.rs2[(curr_base_thread+3):(curr_base_thread)]), + .CLKB(clk), + .CENB(cenb), + .WENB(write_bit_mask[(curr_base_thread+3):(curr_base_thread)]), + .AB(VX_writeback_inter.rd[(curr_base_thread+3):(curr_base_thread)]), + .DB(to_write[(curr_base_thread+3):(curr_base_thread)]), + .EMAA(3'b011), + .EMASA(1'b0), + .EMAB(3'b011), + .TENA(1'b1), + .TCENA(1'b0), + .TAA(5'b0), + .TENB(1'b1), + .TCENB(1'b0), + .TWENB(128'b0), + .TAB(5'b0), + .TDB(128'b0), + .RET1N(1'b1), + .SIA(2'b0), + .SEA(1'b0), + .DFTRAMBYP(1'b0), + .SIB(2'b0), + .SEB(1'b0), + .COLLDISN(1'b1) + ); + /* verilator lint_on PINCONNECTEMPTY */ + end `endif diff --git a/rtl/VX_writeback.v b/rtl/VX_writeback.v index 037f7d2e..c9616d43 100644 --- a/rtl/VX_writeback.v +++ b/rtl/VX_writeback.v @@ -63,14 +63,40 @@ module VX_writeback ( wire zero = 0; + wire[`NT-1:0][31:0] use_wb_data; + + reg prev_is_mem; + + always @(posedge clk, posedge reset) begin + if (reset) + begin + prev_is_mem = 0; + end begin + prev_is_mem = mem_wb && !no_slot_mem; + end + end + VX_generic_register #(.N(39 + `NW_M1 + 1 + `NT*33)) wb_register( .clk (clk), .reset(reset), .stall(zero), .flush(zero), .in ({VX_writeback_tempp.write_data, VX_writeback_tempp.wb_valid, VX_writeback_tempp.rd, VX_writeback_tempp.wb, VX_writeback_tempp.wb_warp_num, VX_writeback_tempp.wb_pc}), - .out ({VX_writeback_inter.write_data, VX_writeback_inter.wb_valid, VX_writeback_inter.rd, VX_writeback_inter.wb, VX_writeback_inter.wb_warp_num, VX_writeback_inter.wb_pc}) + .out ({use_wb_data , VX_writeback_inter.wb_valid, VX_writeback_inter.rd, VX_writeback_inter.wb, VX_writeback_inter.wb_warp_num, VX_writeback_inter.wb_pc}) ); + `ifdef SYN + assign VX_writeback_inter.write_data = prev_is_mem ? VX_writeback_tempp.write_data : use_wb_data; + `else + assign VX_writeback_inter.write_data = use_wb_data; + `endif + + +endmodule // VX_writeback + + + + + + -endmodule // VX_writeback \ No newline at end of file diff --git a/rtl/cache/VX_d_cache.v b/rtl/cache/VX_d_cache.v index fd6c9641..78b407f7 100644 --- a/rtl/cache/VX_d_cache.v +++ b/rtl/cache/VX_d_cache.v @@ -304,9 +304,15 @@ module VX_d_cache // 0; wire[1:0] byte_select = bank_addr[1:0]; + wire[TAG_SIZE_END:TAG_SIZE_START] cache_tag = bank_addr[ADDR_TAG_END:ADDR_TAG_START]; + + `ifdef SYN_FUNC + wire[OFFSET_SIZE_END:OFFSET_SIZE_START] cache_offset = 0; + wire[IND_SIZE_END:IND_SIZE_START] cache_index = 0; + `else wire[OFFSET_SIZE_END:OFFSET_SIZE_START] cache_offset = bank_addr[ADDR_OFFSET_END:ADDR_OFFSET_START]; wire[IND_SIZE_END:IND_SIZE_START] cache_index = bank_addr[ADDR_IND_END:ADDR_IND_START]; - wire[TAG_SIZE_END:TAG_SIZE_START] cache_tag = bank_addr[ADDR_TAG_END:ADDR_TAG_START]; + `endif wire normal_valid_in = valid_per_bank[bank_id]; diff --git a/rtl/modelsim/Makefile b/rtl/modelsim/Makefile index e906236f..7a3a4efd 100644 --- a/rtl/modelsim/Makefile +++ b/rtl/modelsim/Makefile @@ -7,6 +7,7 @@ SRC = \ vortex_dpi.cpp \ vortex_tb.v \ ../VX_define.v \ +../VX_define_synth.v \ ../interfaces/VX_branch_response_inter.v \ ../interfaces/VX_csr_req_inter.v \ ../interfaces/VX_csr_wb_inter.v \ @@ -78,7 +79,9 @@ SRC = \ ../../models/memory/cln28hpm/rf2_128x128_wm1/rf2_128x128_wm1.v \ ../../models/memory/cln28hpm/rf2_256x128_wm1/rf2_256x128_wm1.v \ ../../models/memory/cln28hpm/rf2_256x19_wm0/rf2_256x19_wm0.v \ -../../models/memory/cln28hpm/rf2_32x128_wm1/rf2_32x128_wm1.v +../../models/memory/cln28hpm/rf2_32x128_wm1/rf2_32x128_wm1.v \ +../../models/memory/cln28hpm/rf2_32x19_wm0/rf2_32x19_wm0.v + # ../../models/memory/cln28hpc/rf2_32x128_wm1/rf2_32x128_wm1.v # vortex_dpi.h diff --git a/runtime/mains/vecadd/vecadd.cl b/runtime/mains/vecadd/vecadd.cl new file mode 100644 index 00000000..92292044 --- /dev/null +++ b/runtime/mains/vecadd/vecadd.cl @@ -0,0 +1,20 @@ + +#include "../../intrinsics/vx_intrinsics.h" + +kernel void +vecadd (__global const int *a, + __global const int *b, + __global int *c) +{ + int gid = get_global_id(0); + + __if (gid < 2) + { + c[gid] = a[gid] + b[gid]; + } + __else + { + c[gid] = b[gid] - a[gid]; + } + __endif +} diff --git a/sftp-config.json b/sftp-config.json index 01f84e46..80316a7e 100644 --- a/sftp-config.json +++ b/sftp-config.json @@ -5,13 +5,13 @@ // sftp, ftp or ftps "type": "sftp", - "save_before_upload": true, - "upload_on_save": true, + "save_before_upload": false, + "upload_on_save": false, "sync_down_on_open": false, "sync_skip_deletes": false, - "sync_same_age": true, + "sync_same_age": false, "confirm_downloads": false, - "confirm_sync": true, + "confirm_sync": false, "confirm_overwrite_newer": false, "host": "ece-rschsrv01.ece.gatech.edu", diff --git a/simX/core.cpp b/simX/core.cpp index cd649ed5..8a5a507e 100644 --- a/simX/core.cpp +++ b/simX/core.cpp @@ -405,11 +405,11 @@ void Core::fetch() printTrace(&inst_in_fetch, "Fetch"); // #ifdef PRINT_ACTIVE_THREADS - // for (unsigned j = 0; j < w[schedule_w].tmask.size(); ++j) { - // if (w[schedule_w].activeThreads > j && w[schedule_w].tmask[j]) cout << " 1"; - // else cout << " 0"; - // if (j != w[schedule_w].tmask.size()-1 || schedule_w != w.size()-1) cout << ','; - // } + for (unsigned j = 0; j < w[schedule_w].tmask.size(); ++j) { + if (w[schedule_w].activeThreads > j && w[schedule_w].tmask[j]) cout << " 1"; + else cout << " 0"; + if (j != w[schedule_w].tmask.size()-1 || schedule_w != w.size()-1) cout << ','; + } // #endif @@ -430,7 +430,7 @@ void Core::decode() INIT_TRACE(inst_in_fetch); } - printTrace(&inst_in_decode, "Decode"); + //printTrace(&inst_in_decode, "Decode"); } void Core::scheduler() @@ -442,7 +442,7 @@ void Core::scheduler() INIT_TRACE(inst_in_decode); } - printTrace(&inst_in_scheduler, "scheduler"); + //printTrace(&inst_in_scheduler, "scheduler"); } void Core::load_store() @@ -496,7 +496,7 @@ void Core::load_store() if (inst_in_lsu.mem_stall_cycles > 0) inst_in_lsu.mem_stall_cycles--; - printTrace(&inst_in_lsu, "LSU"); + //printTrace(&inst_in_lsu, "LSU"); } void Core::execute_unit() @@ -548,7 +548,7 @@ void Core::execute_unit() // } - printTrace(&inst_in_exe, "execute_unit"); + //printTrace(&inst_in_exe, "execute_unit"); // INIT_TRACE(inst_in_exe); } @@ -604,7 +604,7 @@ void Core::writeback() // if (!serviced_exe && !serviced_mem) INIT_TRACE(inst_in_wb); - printTrace(&inst_in_wb, "Writeback"); + //printTrace(&inst_in_wb, "Writeback"); } @@ -712,12 +712,12 @@ void Warp::step(trace_inst_t * trace_inst) { // At Debug Level 3, print debug info after each instruction. - #ifdef USE_DEBUG - if (USE_DEBUG >= 3) { + // #ifdef USE_DEBUG + // if (USE_DEBUG >= 3) { D(3, "Register state:"); for (unsigned i = 0; i < reg[0].size(); ++i) { D_RAW(" %r" << setfill(' ') << setw(2) << dec << i << ':'); - for (unsigned j = 0; j < reg.size(); ++j) + for (unsigned j = 0; j < (this->activeThreads); ++j) D_RAW(' ' << setfill('0') << setw(8) << hex << reg[j][i] << setfill(' ') << ' '); D_RAW('(' << shadowReg[i] << ')' << endl); } @@ -729,8 +729,8 @@ void Warp::step(trace_inst_t * trace_inst) { D_RAW(endl); D_RAW(endl); D_RAW(endl); - } - #endif + // } + // #endif // Clean up. delete inst; diff --git a/simX/enc.cpp b/simX/enc.cpp index 16007652..09a3253c 100644 --- a/simX/enc.cpp +++ b/simX/enc.cpp @@ -219,7 +219,13 @@ Instruction *WordDecoder::decode(const std::vector &v, Size &idx, trace_in imeed = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20); - inst.setSrcImm(signExt(imeed, 20, j_immed_mask)); + if (bit_20) + { + imeed |= ~j_immed_mask; + } + + // inst.setSrcImm(signExt(imeed, 20, j_immed_mask)); + inst.setSrcImm(imeed); usedImm = true; trace_inst->valid_inst = true; diff --git a/simX/instruction.cpp b/simX/instruction.cpp index c1b21760..12166f32 100644 --- a/simX/instruction.cpp +++ b/simX/instruction.cpp @@ -1508,6 +1508,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { case 2: { Word VLMAX = (c.vtype.vlmul * c.VLEN)/c.vtype.vsew; + switch(func6){ case 24: //vmandnot { @@ -1532,6 +1533,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { *result_ptr = 0; } + } else if(c.vtype.vsew == 16) { for(uint16_t i = 0; i < c.vl; i++){ uint16_t *first_ptr = (uint16_t *)vr1[i].val; @@ -1549,6 +1551,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { *result_ptr = 0; } + } else if(c.vtype.vsew == 32) { for(uint32_t i = 0; i < c.vl; i++){ uint32_t *first_ptr = (uint32_t *)vr1[i].val; @@ -1565,6 +1568,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { uint32_t *result_ptr = (uint32_t *) vd[i].val; *result_ptr = 0; } + } } break; @@ -1603,6 +1607,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { uint16_t * result_ptr = (uint16_t *) vd[i].val; *result_ptr = result; } + for(uint16_t i = c.vl; i < VLMAX; i++){ uint16_t *result_ptr = (uint16_t *) vd[i].val; *result_ptr = 0; @@ -1620,6 +1625,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { uint32_t * result_ptr = (uint32_t *) vd[i].val; *result_ptr = result; } + for(Word i = c.vl; i < VLMAX; i++){ uint32_t *result_ptr = (uint32_t *) vd[i].val; *result_ptr = 0; @@ -1667,7 +1673,6 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { result_ptr = (uint16_t *) vd[i].val; *result_ptr = 0; } - } else if(c.vtype.vsew == 32) { uint32_t *result_ptr; for(uint32_t i = 0; i < c.vl; i++){ @@ -1692,11 +1697,11 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { case 27: //vmxor { D(3, "vmxor"); - uint8_t *result_ptr; vector> vr1 = c.vreg[rsrc[0]]; vector> vr2 = c.vreg[rsrc[1]]; vector> vd = c.vreg[rdest]; if(c.vtype.vsew == 8){ + uint8_t *result_ptr; for(uint8_t i = 0; i < c.vl; i++){ uint8_t *first_ptr = (uint8_t *)vr1[i].val; uint8_t *second_ptr = (uint8_t *)vr2[i].val; @@ -1704,7 +1709,6 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { uint8_t second_value = (*second_ptr & 0x1); uint8_t result = (first_value ^ second_value); cout << "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result << '\n'; - result_ptr = (uint8_t *) vd[i].val; *result_ptr = result; } @@ -1712,7 +1716,6 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { result_ptr = (uint8_t *) vd[i].val; *result_ptr = 0; } - } else if(c.vtype.vsew == 16) { uint16_t *result_ptr; for(uint16_t i = 0; i < c.vl; i++){ @@ -1733,6 +1736,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { } else if(c.vtype.vsew == 32) { uint32_t *result_ptr; + for(uint32_t i = 0; i < c.vl; i++){ uint32_t *first_ptr = (uint32_t *)vr1[i].val; uint32_t *second_ptr = (uint32_t *)vr2[i].val; @@ -1773,7 +1777,6 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { uint8_t *result_ptr = (uint8_t *) vd[i].val; *result_ptr = 0; } - } else if(c.vtype.vsew == 16) { for(uint16_t i = 0; i < c.vl; i++){ uint16_t *first_ptr = (uint16_t *)vr1[i].val; @@ -1845,6 +1848,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { uint16_t * result_ptr = (uint16_t *) vd[i].val; *result_ptr = result; } + for(uint16_t i = c.vl; i < VLMAX; i++){ uint16_t *result_ptr = (uint16_t *) vd[i].val; *result_ptr = 0; @@ -1862,10 +1866,12 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { uint32_t * result_ptr = (uint32_t *) vd[i].val; *result_ptr = result; } + for(Word i = c.vl; i < VLMAX; i++){ uint32_t *result_ptr = (uint32_t *) vd[i].val; *result_ptr = 0; } + } } break; @@ -1877,6 +1883,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { vector> vd = c.vreg[rdest]; if(c.vtype.vsew == 8){ uint8_t *result_ptr; + for(uint8_t i = 0; i < c.vl; i++){ uint8_t *first_ptr = (uint8_t *)vr1[i].val; uint8_t *second_ptr = (uint8_t *)vr2[i].val; @@ -1892,7 +1899,6 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { result_ptr = (uint8_t *) vd[i].val; *result_ptr = 0; } - } else if(c.vtype.vsew == 16) { for(uint16_t i = 0; i < c.vl; i++){ uint16_t *first_ptr = (uint16_t *)vr1[i].val; @@ -1927,6 +1933,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { uint32_t *result_ptr = (uint32_t *) vd[i].val; *result_ptr = 0; } + } } break; @@ -1934,6 +1941,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { { D(3, "vmxnor"); uint8_t *result_ptr; + vector> vr1 = c.vreg[rsrc[0]]; vector> vr2 = c.vreg[rsrc[1]]; vector> vd = c.vreg[rdest]; @@ -1953,7 +1961,8 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { result_ptr = (uint8_t *) vd[i].val; *result_ptr = 0; } - } else if(c.vtype.vsew == 16) { + } + else if(c.vtype.vsew == 16) { uint16_t *result_ptr; for(uint16_t i = 0; i < c.vl; i++){ uint16_t *first_ptr = (uint16_t *)vr1[i].val; @@ -1973,6 +1982,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { } else if(c.vtype.vsew == 32) { uint32_t *result_ptr; + for(uint32_t i = 0; i < c.vl; i++){ uint32_t *first_ptr = (uint32_t *)vr1[i].val; uint32_t *second_ptr = (uint32_t *)vr2[i].val; @@ -1988,6 +1998,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { result_ptr = (uint32_t *) vd[i].val; *result_ptr = 0; } + } } break; diff --git a/syn/Makefile b/syn/Makefile index 1cfe585c..ca3094bc 100644 --- a/syn/Makefile +++ b/syn/Makefile @@ -1,7 +1,33 @@ -all: syn +SCRIPT_DIR=./scripts + +all: dc -syn: - dc_shell-t -f fsyn.tcl 2>&1 | tee vortex_syn.log \ No newline at end of file +#syn: + #dc_shell-t -f esyn.tcl 2>&1 | tee vortex_syn.log + #dc_shell -f esyn.tcl 2>&1 | tee vortex_syn.log + #dc_shell -f $(SCRIPT_DIR)/dc/dc_script.tcl + +dc: + rm -rf rpt + mkdir rpt + dc_shell -f esyn.tcl 2>&1 | tee vortex_syn.log + +clean: + rm -f simv + rm -f *.vcd + rm -f *.key + rm -rf csrc/ + rm -rf *.rpt + rm -rf *.log + rm -rf *.svf + rm -rf *.ddc + rm -rf results_synthesized.v + rm -rf results_synthesized.sdc + rm -rf alib-52/ + rm -rf rpt/ + rm -rf simv.daidir/ + rm -rf encounter* + rm -rf ./synth_out \ No newline at end of file diff --git a/syn/esyn.tcl b/syn/esyn.tcl new file mode 100644 index 00000000..10fa09d9 --- /dev/null +++ b/syn/esyn.tcl @@ -0,0 +1,53 @@ +#set search_path [concat /nethome/dshim8/Desktop/GTCAD-3DPKG-v3/example/tech/cln28hpm/2d_db/ /nethome/dshim8/Desktop/GTCAD-3DPKG-v3/example/tech/cln28hpm/2d_hard_db/ ../rtl/ ../rtl/interfaces ../rtl/pipe_regs ../rtl/shared_memory ../rtl/cache ../models/memory/cln28hpm/2d_hardmacro_db] +set search_path [concat ../rtl/ ../rtl/interfaces ../rtl/pipe_regs ../rtl/shared_memory ../rtl/cache ../models/memory/cln28hpm/2d_hardmacro_db] +set link_library [concat ./NanGate_15nm_OCL.db] +set symbol_library {} +set target_library [concat ./NanGate_15nm_OCL.db] + +set verilog_files [ list VX_countones.v VX_priority_encoder_w_mask.v VX_dram_req_rsp_inter.v VX_cache_data_per_index.v VX_Cache_Bank.v VX_cache_data.v VX_d_cache.v VX_bank_valids.v VX_priority_encoder_sm.v VX_shared_memory.v VX_shared_memory_block.v VX_dmem_controller.v VX_generic_priority_encoder.v VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_define_synth.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_priority_encoder.v VX_warp_scheduler.v VX_writeback.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v Vortex.v VX_cache_bank_valid.v \ + ] +# set verilog_files [ list Vortex.v VX_countones.v VX_priority_encoder_w_mask.v VX_dram_req_rsp_inter.v cache_set.v VX_Cache_Bank.v VX_Cache_Block_DM.v VX_cache_data.v VX_d_cache.v VX_generic_pc.v VX_bank_valids.v VX_priority_encoder_sm.v VX_shared_memory.v VX_shared_memory_block.v VX_dmem_controller.v VX_generic_priority_encoder.v VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp_scheduler.v VX_writeback.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \ +# ] + +set top_level Vortex +analyze -format sverilog $verilog_files +#analyze -format sverilog -error=LINT-66 $verilog_files +elaborate Vortex +link + +set clk_freq 0.4 +set clk_period [expr 1000.0 / $clk_freq / 1.0] +create_clock [get_ports clk] -period $clk_period +set_max_fanout 20 [get_ports clk] +set_ideal_network [get_ports clk] + +set_max_fanout 20 [get_ports reset] +set_false_path -from [get_ports reset] +all_high_fanout -net -threshold 20 + +# set_register_merging Vortex FALSE +# set compile_seqmap_propagate_constants false +# set compile_seqmap_propagate_high_effort false + +check_design +compile_ultra -no_autoungroup +ungroup -all -flatten +uniquify + +define_name_rules verilog -remove_internal_net_bus -remove_port_bus +change_names -rule verilog -hierarchy + +# report_qor +report_area +report_hierarchy +report_cell +report_reference +report_port +report_power + +write -hierarchy -format verilog -output Vortex.netlist.v +remove_ideal_network [get_ports clk] +set_propagated_clock [get_ports clk] +write_sdc -version 1.9 Vortex.sdc +write_file -format ddc -output Vortex.ddc +exit \ No newline at end of file diff --git a/syn/run_mult_synth.sh b/syn/run_mult_synth.sh new file mode 100644 index 00000000..38a32f9a --- /dev/null +++ b/syn/run_mult_synth.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set top_level = Vortex + +source /tools/synopsys/synthesis/j201409/cshrc.syn +set cur_dir = `pwd` +echo $cur_dir + +for number_of_warps in 2 4 8 16 32; do + for number_of_threads in 2 4 8 16 32; do + + echo "Warp Count: $number_of_warps Thread Count: $number_of_threads Launched" + echo "\`define NT $number_of_threads" > ../rtl/VX_define_synth.v + echo "\`define NW $number_of_warps" >> ../rtl/VX_define_synth.v + make dc | tee run.log 1>/dev/null + sleep 30 + moved_filename="${number_of_warps}_Warps__${number_of_threads}_threads__400MHz.log" + mv ./vortex_syn.log ../../$moved_filename + sleep 30 + + + + + echo "Warp Count: $number_of_warps Thread Count: $number_of_threads Finished" + done +done + + +echo "Done!"