diff --git a/.travis.yml b/.travis.yml index a30782e5..81950c5c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,10 +18,10 @@ install: - export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain - export VERILATOR_ROOT=/opt/verilator - export PATH=$VERILATOR_ROOT/bin:$PATH - - make -s - + script: - ./ci/regression.sh + - ./ci/test_compiler.sh after_success: # Gather code coverage diff --git a/benchmarks/opencl/bfs/CLHelper.h b/benchmarks/opencl/bfs/CLHelper.h index 94536d7c..6ac9c609 100755 --- a/benchmarks/opencl/bfs/CLHelper.h +++ b/benchmarks/opencl/bfs/CLHelper.h @@ -233,7 +233,7 @@ free(allPlatforms);*/ //--cambine-4: Create an OpenCL command queue oclHandles.queue = clCreateCommandQueue( oclHandles.context, oclHandles.devices[DEVICE_ID_INUSED], 0, &resultCL); - printf("resultCL=%d, queue=0x%x\n", resultCL, oclHandles.queue); + //printf("resultCL=%d, queue=0x%x\n", resultCL, oclHandles.queue); if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL)) throw(string("InitCL()::Creating Command Queue. 
(clCreateCommandQueue)")); @@ -383,8 +383,8 @@ void _clRelease() { errorFlag = true; } oclHandles.kernel[nKernel] = NULL; + printf("clReleaseKernel()\n"); } - oclHandles.kernel.clear(); } if (oclHandles.program != NULL) { @@ -394,6 +394,7 @@ void _clRelease() { errorFlag = true; } oclHandles.program = NULL; + printf("clReleaseProgram()\n"); } if (oclHandles.queue != NULL) { @@ -403,10 +404,9 @@ void _clRelease() { errorFlag = true; } oclHandles.queue = NULL; + printf("clReleaseCommandQueue()\n"); } - free(oclHandles.devices); - if (oclHandles.context != NULL) { cl_int resultCL = clReleaseContext(oclHandles.context); if (resultCL != CL_SUCCESS) { @@ -414,6 +414,17 @@ void _clRelease() { errorFlag = true; } oclHandles.context = NULL; + printf("clReleaseContext()\n"); + } + + if (oclHandles.devices != NULL) { + cl_int resultCL = clReleaseDevice(oclHandles.devices[0]); + if (resultCL != CL_SUCCESS) { + cerr << "ReleaseCL()::Error: In clReleaseDevice" << endl; + errorFlag = true; + } + free(oclHandles.devices); + printf("clReleaseDevice()\n"); } if (errorFlag) @@ -675,7 +686,7 @@ void _clFinish() throw(string) { void _clInvokeKernel(int kernel_id, int work_items, int work_group_size) throw(string) { cl_uint work_dim = WORK_DIM; - cl_event e[1]; + //cl_event e[1]; if (work_items % work_group_size != 0) // process situations that work_items // cannot be divided by work_group_size work_items = @@ -684,7 +695,7 @@ void _clInvokeKernel(int kernel_id, int work_items, size_t global_work_size[] = {work_items, 1}; oclHandles.cl_status = clEnqueueNDRangeKernel( oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, - global_work_size, local_work_size, 0, 0, &(e[0])); + global_work_size, local_work_size, 0, 0, NULL); #ifdef ERRMSG oclHandles.error_str = "excpetion in _clInvokeKernel() -> "; switch (oclHandles.cl_status) { @@ -749,13 +760,13 @@ void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x, cl_uint work_dim = WORK_DIM; size_t local_work_size[] = 
{group_x, group_y}; size_t global_work_size[] = {range_x, range_y}; - cl_event e[1]; + //cl_event e[1]; /*if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size work_items = work_items + (work_group_size-(work_items%work_group_size));*/ oclHandles.cl_status = clEnqueueNDRangeKernel( oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, - global_work_size, local_work_size, 0, 0, &(e[0])); + global_work_size, local_work_size, 0, 0, NULL); #ifdef ERRMSG oclHandles.error_str = "excpetion in _clInvokeKernel() -> "; switch (oclHandles.cl_status) { diff --git a/benchmarks/opencl/bfs/main.cc b/benchmarks/opencl/bfs/main.cc index 701209d4..1a1cf1d2 100755 --- a/benchmarks/opencl/bfs/main.cc +++ b/benchmarks/opencl/bfs/main.cc @@ -78,14 +78,15 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, char h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, d_graph_visited, d_cost, d_over; + try { //--1 transfer data from host to device _clInit(); + d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges); d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask); - d_updating_graph_mask = - _clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask); + d_updating_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask); d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited); d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost); @@ -94,8 +95,7 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, _clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask); - _clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char), - h_updating_graph_mask); + 
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost); @@ -106,6 +106,7 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, kernel_timer.reset(); kernel_timer.start(); #endif + do { h_over = false; _clMemcpyH2D(d_over, sizeof(char), &h_over); @@ -136,9 +137,8 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clMemcpyD2H(d_over, sizeof(char), &h_over); - } while (h_over); + } while (h_over); - _clFinish(); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); diff --git a/benchmarks/opencl/bfs/util.h b/benchmarks/opencl/bfs/util.h index 425edfba..b67abc0d 100755 --- a/benchmarks/opencl/bfs/util.h +++ b/benchmarks/opencl/bfs/util.h @@ -60,10 +60,10 @@ void compare_results(const datatype *cpu_results, const datatype *gpu_results, c } } if (passed){ - std::cout << "--cambine:passed:-)" << endl; + std::cout << "--cambine: passed: -)" << endl; } else{ - std::cout << "--cambine: failed:-(" << endl; + std::cout << "--cambine: failed :-(" << endl; } return ; } diff --git a/benchmarks/opencl/guassian/clutils.cpp b/benchmarks/opencl/guassian/clutils.cpp index 37cabb7d..c977477a 100755 --- a/benchmarks/opencl/guassian/clutils.cpp +++ b/benchmarks/opencl/guassian/clutils.cpp @@ -69,7 +69,7 @@ static cl_uint numPlatforms; //! All discoverable OpenCL devices (one pointer per platform) static cl_device_id* devices = NULL; -static cl_uint* numDevices; +static cl_uint* numDevices = NULL; //! The chosen OpenCL platform static cl_platform_id platform = NULL; @@ -88,7 +88,6 @@ static cl_command_queue commandQueueNoProf = NULL; //! 
Global status of events static bool eventsEnabled = false; - //------------------------------------------------------- // Initialization and Cleanup //------------------------------------------------------- @@ -239,14 +238,34 @@ static bool eventsEnabled = false; return context; }*/ +static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) { + if (nullptr == filename || nullptr == data || 0 == size) + return -1; + + FILE* fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + fseek(fp , 0 , SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t*)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + + cl_context cl_init_context(int platform, int dev,int quiet) { int printInfo=1; if (platform >= 0 && dev >= 0) printInfo = 0; cl_int status; // Used to iterate through the platforms and devices, respectively - cl_uint numPlatforms; - cl_uint numDevices; - + // These will hold the platform and device we select (can potentially be // multiple, but we're just doing one for now) // cl_platform_id platform = NULL; @@ -376,23 +395,24 @@ cl_context cl_init_context(int platform, int dev,int quiet) { // Getting platform and device information numPlatforms = 1; - numDevices = 1; - int platform_touse = 0; - int device_touse = 0; platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id)); - devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices); - status = clGetPlatformIDs(1, platforms, NULL); + numDevices = (cl_uint*)malloc(sizeof(cl_uint)*numPlatforms); + numDevices[0] = 1; + devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices[0]); + + int platform_touse = 0; + int device_touse = 0; + + status = clGetPlatformIDs(numPlatforms, platforms, NULL); cl_errChk(status, "Oops!", true); - status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, 1, devices, NULL); + status = clGetDeviceIDs(platforms[0], 
CL_DEVICE_TYPE_DEFAULT, numDevices[0], devices, NULL); cl_errChk(status, "Oops!", true); - context = clCreateContext(NULL, 1, devices, NULL, NULL, &status); + context = clCreateContext(NULL, numDevices[0], devices, NULL, NULL, &status); cl_errChk(status, "Oops!", true); device=devices[device_touse]; -#define PROFILING - #ifdef PROFILING commandQueue = clCreateCommandQueue(context, @@ -400,7 +420,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) { #else - clCommandQueue = clCreateCommandQueue(clGPUContext, + commandQueue = clCreateCommandQueue(context, devices[device_touse], NULL, &status); #endif // PROFILING @@ -413,22 +433,34 @@ cl_context cl_init_context(int platform, int dev,int quiet) { /*! Release all resources that the user doesn't have access to. */ -void cl_cleanup() +void cl_cleanup() { + cl_int status; + // Free the command queue - if(commandQueue) { - clReleaseCommandQueue(commandQueue); + if (commandQueue) { + status = clReleaseCommandQueue(commandQueue); + cl_errChk(status, "Oops!", true); + printf("clReleaseCommandQueue()\n"); } // Free the context - if(context) { - clReleaseContext(context); + if (context) { + status = clReleaseContext(context); + cl_errChk(status, "Oops!", true); + printf("clReleaseContext()\n"); + } + + for (int p = 0; p < numPlatforms; ++p) { + for (int d = 0; d < numDevices[p]; ++d) { + status = clReleaseDevice(devices[d]); + cl_errChk(status, "Oops!", true); + printf("clReleaseDevice()\n"); + } } free(devices); free(numDevices); - - // Free the platforms free(platforms); } @@ -443,6 +475,7 @@ void cl_freeKernel(cl_kernel kernel) if(kernel != NULL) { status = clReleaseKernel(kernel); cl_errChk(status, "Releasing kernel object", true); + printf("clReleaseKernel()\n"); } } @@ -457,6 +490,7 @@ void cl_freeMem(cl_mem mem) if(mem != NULL) { status = clReleaseMemObject(mem); cl_errChk(status, "Releasing mem object", true); + printf("clReleaseMemObject()\n"); } } @@ -471,6 +505,7 @@ void cl_freeProgram(cl_program 
program) if(program != NULL) { status = clReleaseProgram(program); cl_errChk(status, "Releasing program object", true); + printf("clReleaseProgram()\n"); } } @@ -782,27 +817,6 @@ void cl_writeToZCBuffer(cl_mem mem, void* data, size_t size) cl_unmapBuffer(mem, ptr); } -static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) { - if (nullptr == filename || nullptr == data || 0 == size) - return -1; - - FILE* fp = fopen(filename, "r"); - if (NULL == fp) { - fprintf(stderr, "Failed to load kernel."); - return -1; - } - fseek(fp , 0 , SEEK_END); - long fsize = ftell(fp); - rewind(fp); - - *data = (uint8_t*)malloc(fsize); - *size = fread(*data, 1, fsize, fp); - - fclose(fp); - - return 0; -} - //------------------------------------------------------- // Program and kernels //------------------------------------------------------- @@ -858,17 +872,17 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos fread(source, 1, size, fp); source[size] = '\0';*/ - // Create the program object - //cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status); - //cl_program clProgramReturn = clCreateProgramWithBuiltInKernels(context, 1, &device, "Fan1;Fan2", &status); - // read kernel binary from file + // read kernel binary from file uint8_t *kernel_bin = NULL; size_t kernel_size; - cl_int binary_status = 0; - status = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size); - cl_errChk(status, "read_kernel_file", true); + cl_int binary_status = 0; + int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size); + cl_errChk(err, "read_kernel_file", true); + + // Create the program object + //cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status); cl_program clProgramReturn = clCreateProgramWithBinary( - context, 1, &device, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status); + context, 1, devices, 
&kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status); free(kernel_bin); cl_errChk(status, "Creating program", true); @@ -1440,4 +1454,4 @@ char* itoa_portable(int value, char* result, int base) { } return result; -} +} \ No newline at end of file diff --git a/benchmarks/opencl/guassian/main.cc b/benchmarks/opencl/guassian/main.cc index 4972a253..d7e235eb 100755 --- a/benchmarks/opencl/guassian/main.cc +++ b/benchmarks/opencl/guassian/main.cc @@ -76,6 +76,9 @@ int main(int argc, char *argv[]) { free(b); free(finalVec); // OpenClGaussianElimination(context,timing); + + cl_cleanup(); + printf("Passed!\n"); return 0; } @@ -142,7 +145,8 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size, writeTime += eventTime(writeEvent, command_queue); clReleaseEvent(writeEvent); - error = clEnqueueWriteBuffer(command_queue, m_dev, + error = clEnqueueWriteBuffer(command_queue, + m_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size * size, m, 0, NULL, @@ -258,6 +262,13 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size, printf("%f\n\n", writeTime + kernelTime + readTime); } + + cl_freeMem(a_dev); + cl_freeMem(b_dev); + cl_freeMem(m_dev); + cl_freeKernel(fan1_kernel); + cl_freeKernel(fan2_kernel); + cl_freeProgram(gaussianElim_program); } float eventTime(cl_event event, cl_command_queue command_queue) { diff --git a/benchmarks/opencl/nearn/clutils.cpp b/benchmarks/opencl/nearn/clutils.cpp index 3c433c0c..c977477a 100755 --- a/benchmarks/opencl/nearn/clutils.cpp +++ b/benchmarks/opencl/nearn/clutils.cpp @@ -69,7 +69,7 @@ static cl_uint numPlatforms; //! All discoverable OpenCL devices (one pointer per platform) static cl_device_id* devices = NULL; -static cl_uint* numDevices; +static cl_uint* numDevices = NULL; //! 
The chosen OpenCL platform static cl_platform_id platform = NULL; @@ -265,9 +265,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) { if (platform >= 0 && dev >= 0) printInfo = 0; cl_int status; // Used to iterate through the platforms and devices, respectively - cl_uint numPlatforms; - cl_uint numDevices; - + // These will hold the platform and device we select (can potentially be // multiple, but we're just doing one for now) // cl_platform_id platform = NULL; @@ -397,23 +395,24 @@ cl_context cl_init_context(int platform, int dev,int quiet) { // Getting platform and device information numPlatforms = 1; - numDevices = 1; - int platform_touse = 0; - int device_touse = 0; platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id)); - devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices); - status = clGetPlatformIDs(1, platforms, NULL); + numDevices = (cl_uint*)malloc(sizeof(cl_uint)*numPlatforms); + numDevices[0] = 1; + devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices[0]); + + int platform_touse = 0; + int device_touse = 0; + + status = clGetPlatformIDs(numPlatforms, platforms, NULL); cl_errChk(status, "Oops!", true); - status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, 1, devices, NULL); + status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, numDevices[0], devices, NULL); cl_errChk(status, "Oops!", true); - context = clCreateContext(NULL, 1, devices, NULL, NULL, &status); + context = clCreateContext(NULL, numDevices[0], devices, NULL, NULL, &status); cl_errChk(status, "Oops!", true); device=devices[device_touse]; -#define PROFILING - #ifdef PROFILING commandQueue = clCreateCommandQueue(context, @@ -421,7 +420,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) { #else - clCommandQueue = clCreateCommandQueue(clGPUContext, + commandQueue = clCreateCommandQueue(context, devices[device_touse], NULL, &status); #endif // PROFILING @@ -434,22 +433,34 @@ cl_context cl_init_context(int 
platform, int dev,int quiet) { /*! Release all resources that the user doesn't have access to. */ -void cl_cleanup() +void cl_cleanup() { + cl_int status; + // Free the command queue - if(commandQueue) { - clReleaseCommandQueue(commandQueue); + if (commandQueue) { + status = clReleaseCommandQueue(commandQueue); + cl_errChk(status, "Oops!", true); + printf("clReleaseCommandQueue()\n"); } // Free the context - if(context) { - clReleaseContext(context); + if (context) { + status = clReleaseContext(context); + cl_errChk(status, "Oops!", true); + printf("clReleaseContext()\n"); + } + + for (int p = 0; p < numPlatforms; ++p) { + for (int d = 0; d < numDevices[p]; ++d) { + status = clReleaseDevice(devices[d]); + cl_errChk(status, "Oops!", true); + printf("clReleaseDevice()\n"); + } } free(devices); free(numDevices); - - // Free the platforms free(platforms); } @@ -464,6 +475,7 @@ void cl_freeKernel(cl_kernel kernel) if(kernel != NULL) { status = clReleaseKernel(kernel); cl_errChk(status, "Releasing kernel object", true); + printf("clReleaseKernel()\n"); } } @@ -478,6 +490,7 @@ void cl_freeMem(cl_mem mem) if(mem != NULL) { status = clReleaseMemObject(mem); cl_errChk(status, "Releasing mem object", true); + printf("clReleaseMemObject()\n"); } } @@ -492,6 +505,7 @@ void cl_freeProgram(cl_program program) if(program != NULL) { status = clReleaseProgram(program); cl_errChk(status, "Releasing program object", true); + printf("clReleaseProgram()\n"); } } diff --git a/benchmarks/opencl/nearn/main.cc b/benchmarks/opencl/nearn/main.cc index 62d08c58..43ce1634 100755 --- a/benchmarks/opencl/nearn/main.cc +++ b/benchmarks/opencl/nearn/main.cc @@ -49,25 +49,27 @@ int main(int argc, char *argv[]) { printf("%s --> Distance=%f\n", records[i].recString, records[i].distance); } free(recordDistances); + + cl_cleanup(); + printf("Passed!\n"); + return 0; } float *OpenClFindNearestNeighbors(cl_context context, int numRecords, std::vector &locations, float lat, float lng, int timing) { - - // 
1. set up kernel - cl_kernel NN_kernel; cl_int status; + + // 1. set up kernel + cl_kernel NN_kernel; cl_program cl_NN_program; cl_NN_program = cl_compileProgram((char *)"nearestNeighbor_kernel.cl", NULL); NN_kernel = clCreateKernel(cl_NN_program, "NearestNeighbor", &status); - status = - cl_errChk(status, (char *)"Error Creating Nearest Neighbor kernel", true); - if (status) - exit(1); + cl_errChk(status, (char *)"Error Creating Nearest Neighbor kernel", true); + // 2. set up memory on device and send ipts data to device // copy ipts(1,2) to device // also need to alloate memory for the distancePoints @@ -78,9 +80,11 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords, d_locations = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(LatLong) * numRecords, NULL, &error); + cl_errChk(error, "ERROR: clCreateBuffer() failed", true); d_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * numRecords, NULL, &error); + cl_errChk(error, "ERROR: clCreateBuffer() failed", true); cl_command_queue command_queue = cl_getCommandQueue(); cl_event writeEvent, kernelEvent, readEvent; @@ -89,6 +93,7 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords, 0, // offset sizeof(LatLong) * numRecords, &locations[0], 0, NULL, &writeEvent); + cl_errChk(error, "ERROR: clEnqueueWriteBuffer() failed", true); // 3. send arguments to device cl_int argchk; @@ -124,8 +129,10 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords, &readEvent); cl_errChk(error, "ERROR with clEnqueueReadBuffer", true); - if (timing) { - clFinish(command_queue); + + clFinish(command_queue); + + if (timing) { cl_ulong eventStart, eventEnd, totalTime = 0; printf("# Records\tWrite(s) [size]\t\tKernel(s)\tRead(s) " "[size]\t\tTotal(s)\n"); @@ -166,8 +173,14 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords, printf("%f\n\n", (float)(totalTime / 1e9)); } // 6. 
return finalized data and release buffers - clReleaseMemObject(d_locations); - clReleaseMemObject(d_distances); + clReleaseEvent(writeEvent); + clReleaseEvent(kernelEvent); + clReleaseEvent(readEvent); + cl_freeMem(d_locations); + cl_freeMem(d_distances); + cl_freeKernel(NN_kernel); + cl_freeProgram(cl_NN_program); + return distances; } diff --git a/benchmarks/opencl/saxpy/Makefile b/benchmarks/opencl/saxpy/Makefile index 9d6f91b6..0414fcff 100644 --- a/benchmarks/opencl/saxpy/Makefile +++ b/benchmarks/opencl/saxpy/Makefile @@ -7,6 +7,8 @@ POCL_RT_PATH ?= /opt/pocl/runtime VORTEX_DRV_PATH ?= $(realpath ../../../driver) VORTEX_RT_PATH ?= $(realpath ../../../runtime) +OPTS ?= -n1024 + K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -float-abi=hard -code-model=small" K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections" K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm" @@ -33,19 +35,19 @@ $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ run-fpga: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) run-asesim: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) run-vlsim: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) run-simx: $(PROJECT) kernel.pocl - 
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) run-rtlsim: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) .depend: $(SRCS) $(CXX) $(CXXFLAGS) -MM $^ > .depend; diff --git a/benchmarks/opencl/saxpy/main.cc b/benchmarks/opencl/saxpy/main.cc index 7d64f03d..cf090486 100644 --- a/benchmarks/opencl/saxpy/main.cc +++ b/benchmarks/opencl/saxpy/main.cc @@ -29,11 +29,9 @@ #include #include #include +#include #include -//#define NUM_DATA 65536 -#define NUM_DATA 1024 - #define CL_CHECK(_expr) \ do { \ cl_int _err = _expr; \ @@ -85,14 +83,18 @@ uint8_t *kernel_bin = NULL; /// // Cleanup any created OpenCL resources // -void Cleanup(cl_context context, cl_command_queue commandQueue, - cl_program program, cl_kernel kernel, cl_mem memObjects[3]) { - for (int i = 0; i < 3; i++) { +void Cleanup(cl_device_id device_id, cl_context context, cl_command_queue commandQueue, + cl_program program, cl_kernel kernel, cl_mem memObjects[2]) { + if (kernel_bin) + free(kernel_bin); + + if (commandQueue != 0) + clReleaseCommandQueue(commandQueue); + + for (int i = 0; i < 2; i++) { if (memObjects[i] != 0) clReleaseMemObject(memObjects[i]); } - if (commandQueue != 0) - clReleaseCommandQueue(commandQueue); if (kernel != 0) clReleaseKernel(kernel); @@ -103,11 +105,40 @@ void Cleanup(cl_context context, cl_command_queue commandQueue, if (context != 0) clReleaseContext(context); - if (kernel_bin) free(kernel_bin); + if (device_id != 0) + clReleaseDevice(device_id); +} + +int size = 1024; + +static void show_usage() { + printf("Usage: [-n size] [-h: help]\n"); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:h?")) != -1) { + switch (c) { 
+ case 'n': + size = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + printf("Workload size=%d\n", size); } int main(int argc, char **argv) { - printf("enter demo main\n"); + // parse command arguments + parse_args(argc, argv); cl_platform_id platform_id; cl_device_id device_id; @@ -126,7 +157,7 @@ int main(int argc, char **argv) { context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, &pfn_notify, NULL, &_err)); cl_command_queue queue; - queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &_err)); + queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, NULL, &_err)); cl_kernel kernel = 0; cl_mem memObjects[2] = {0, 0}; @@ -139,7 +170,7 @@ int main(int argc, char **argv) { context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err)); if (program == NULL) { std::cerr << "Failed to write program binary" << std::endl; - Cleanup(context, queue, program, kernel, memObjects); + Cleanup(device_id, context, queue, program, kernel, memObjects); return 1; } else { std::cout << "Read program from binary." 
<< std::endl; @@ -148,7 +179,7 @@ int main(int argc, char **argv) { // Build program CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); - size_t nbytes = sizeof(float) * NUM_DATA; + size_t nbytes = sizeof(float) * size; printf("attempting to create input buffer\n"); cl_mem input_buffer; @@ -175,13 +206,13 @@ int main(int argc, char **argv) { printf("attempting to enqueue write buffer\n"); float* h_src = (float*)malloc(nbytes); - for (int i = 0; i < NUM_DATA; i++) { + for (int i = 0; i < size; i++) { h_src[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0; } CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL)); free(h_src); - size_t global_work_size[] = {NUM_DATA/2, NUM_DATA/2}; + size_t global_work_size[] = {size/2, size/2}; printf("attempting to enqueue kernel\n"); auto time_start = std::chrono::high_resolution_clock::now(); CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, @@ -196,18 +227,13 @@ int main(int argc, char **argv) { CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL)); /*printf("Result:"); - for (int i = 0; i < NUM_DATA; i++) { + for (int i = 0; i < size; i++) { float data = h_dst[i]; printf(" %f", data); }*/ free(h_dst); - CL_CHECK(clReleaseMemObject(memObjects[0])); - CL_CHECK(clReleaseMemObject(memObjects[1])); - - CL_CHECK(clReleaseKernel(kernel)); - CL_CHECK(clReleaseProgram(program)); - CL_CHECK(clReleaseContext(context)); + Cleanup(device_id, context, queue, program, kernel, memObjects); return 0; } diff --git a/benchmarks/opencl/sfilter/Makefile b/benchmarks/opencl/sfilter/Makefile index 62099e37..6a22e827 100644 --- a/benchmarks/opencl/sfilter/Makefile +++ b/benchmarks/opencl/sfilter/Makefile @@ -7,6 +7,8 @@ POCL_RT_PATH ?= /opt/pocl/runtime VORTEX_DRV_PATH ?= $(realpath ../../../driver) VORTEX_RT_PATH ?= $(realpath ../../../runtime) +OPTS ?= -n16 + K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 
-mattr=+m,+f -float-abi=hard -code-model=small" K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections" K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm" @@ -33,19 +35,19 @@ $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ run-fpga: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) run-asesim: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) run-vlsim: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) run-simx: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) run-rtlsim: $(PROJECT) kernel.pocl - LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) .depend: $(SRCS) $(CXX) $(CXXFLAGS) -MM $^ > .depend; diff --git a/benchmarks/opencl/sfilter/main.cc b/benchmarks/opencl/sfilter/main.cc index 2f6992d1..17f4cea0 100644 --- a/benchmarks/opencl/sfilter/main.cc +++ b/benchmarks/opencl/sfilter/main.cc @@ -35,8 +35,6 @@ #include #include -#define NUM_DATA (16+2) - #define CL_CHECK(_expr) \ do { \ 
cl_int _err = _expr; \ @@ -159,14 +157,18 @@ float poclu_cl_half_to_float(cl_half value) { /// // Cleanup any created OpenCL resources // -void Cleanup(cl_context context, cl_command_queue commandQueue, - cl_program program, cl_kernel kernel, cl_mem memObjects[3]) { - for (int i = 0; i < 3; i++) { +void Cleanup(cl_device_id device_id, cl_context context, cl_command_queue commandQueue, + cl_program program, cl_kernel kernel, cl_mem memObjects[2]) { + if (kernel_bin) + free(kernel_bin); + + if (commandQueue != 0) + clReleaseCommandQueue(commandQueue); + + for (int i = 0; i < 2; i++) { if (memObjects[i] != 0) clReleaseMemObject(memObjects[i]); } - if (commandQueue != 0) - clReleaseCommandQueue(commandQueue); if (kernel != 0) clReleaseKernel(kernel); @@ -177,11 +179,40 @@ void Cleanup(cl_context context, cl_command_queue commandQueue, if (context != 0) clReleaseContext(context); - if (kernel_bin) free(kernel_bin); + if (device_id != 0) + clReleaseDevice(device_id); +} + +int size = 16+2; + +static void show_usage() { + printf("Usage: [-n size] [-h: help]\n"); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:h?")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg)+2; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + printf("Workload size=%d\n", size); } int main(int argc, char **argv) { - printf("enter demo main\n"); + // parse command arguments + parse_args(argc, argv); cl_platform_id platform_id; cl_device_id device_id; @@ -213,7 +244,7 @@ int main(int argc, char **argv) { context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err)); if (program == NULL) { std::cerr << "Failed to write program binary" << std::endl; - Cleanup(context, queue, program, kernel, memObjects); + Cleanup(device_id, context, queue, program, kernel, memObjects); return 1; } else { std::cout << "Read program from binary." 
<< std::endl; @@ -222,7 +253,7 @@ int main(int argc, char **argv) { // Build program CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); - size_t nbytes = sizeof(float) * NUM_DATA * NUM_DATA; + size_t nbytes = sizeof(float) * size * size; printf("attempting to create input buffer\n"); cl_mem input_buffer; @@ -235,7 +266,7 @@ int main(int argc, char **argv) { memObjects[0] = input_buffer; memObjects[1] = output_buffer; - long long ldc = NUM_DATA; + long long ldc = size; float m0 = 1.0; float m1 = 1.0; @@ -265,15 +296,15 @@ int main(int argc, char **argv) { printf("attempting to enqueue write buffer\n"); float* h_src = (float*)malloc(nbytes); - for (int i = 0; i < NUM_DATA * NUM_DATA; i++) { + for (int i = 0; i < size * size; i++) { h_src[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0; } CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL)); free(h_src); size_t global_offset[2] = {1, 1}; - size_t global_work_size[2] = {NUM_DATA - 2, NUM_DATA - 2}; // avoid the edges - const size_t local_work_size[2] = {NUM_DATA - 2, 1}; + size_t global_work_size[2] = {size - 2, size - 2}; // avoid the edges + const size_t local_work_size[2] = {size - 2, 1}; printf("attempting to enqueue kernel\n"); auto time_start = std::chrono::high_resolution_clock::now(); CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, global_offset, @@ -286,20 +317,15 @@ int main(int argc, char **argv) { printf("Download destination buffer\n"); float* h_dst = (float*)malloc(nbytes); CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL)); - + /*printf("Result:"); - for (int i = 0; i < NUM_DATA * NUM_DATA; i++) { + for (int i = 0; i < size; i++) { float data = h_dst[i]; printf(" %f", data); }*/ free(h_dst); - CL_CHECK(clReleaseMemObject(memObjects[0])); - CL_CHECK(clReleaseMemObject(memObjects[1])); - - CL_CHECK(clReleaseKernel(kernel)); - CL_CHECK(clReleaseProgram(program)); - CL_CHECK(clReleaseContext(context)); + 
Cleanup(device_id, context, queue, program, kernel, memObjects); return 0; } diff --git a/benchmarks/opencl/sgemm/main.cc b/benchmarks/opencl/sgemm/main.cc index 1f92a14a..06893876 100644 --- a/benchmarks/opencl/sgemm/main.cc +++ b/benchmarks/opencl/sgemm/main.cc @@ -129,6 +129,8 @@ static void parse_args(int argc, char **argv) { printf("Error: invalid size!\n"); exit(-1); } + + printf("Workload size=%d\n", size); } int main (int argc, char **argv) { @@ -218,7 +220,8 @@ int main (int argc, char **argv) { matmul(h_ref, h_a, h_b, size, size, size); for (int i = 0; i < (size * size); i++) { if (!almost_equal(h_c[i], h_ref[i])) { - printf("*** error: [%d] expected=%f, actual=%f\n", i, h_ref[i], h_c[i]); + if (errors < 100) + printf("*** error: [%d] expected=%f, actual=%f\n", i, h_ref[i], h_c[i]); ++errors; } } diff --git a/benchmarks/opencl/vecadd/main.cc b/benchmarks/opencl/vecadd/main.cc index 14f35877..e38548bf 100644 --- a/benchmarks/opencl/vecadd/main.cc +++ b/benchmarks/opencl/vecadd/main.cc @@ -112,6 +112,8 @@ static void parse_args(int argc, char **argv) { exit(-1); } } + + printf("Workload size=%d\n", size); } int main (int argc, char **argv) { @@ -196,7 +198,8 @@ int main (int argc, char **argv) { for (int i = 0; i < size; ++i) { float ref = h_a[i] + h_b[i]; if (!almost_equal(h_c[i], ref)) { - printf("*** error: [%d] expected=%f, actual=%f, a=%f, b=%f\n", i, ref, h_c[i], h_a[i], h_b[i]); + if (errors < 100) + printf("*** error: [%d] expected=%f, actual=%f, a=%f, b=%f\n", i, ref, h_c[i], h_a[i], h_b[i]); ++errors; } } diff --git a/ci/blackbox.sh b/ci/blackbox.sh index d9d56b7e..002ba500 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -123,11 +123,15 @@ esac if [ -d "$VORTEX_HOME/driver/tests/$APP" ]; then APP_PATH=$VORTEX_HOME/driver/tests/$APP -else +elif [ -d "$VORTEX_HOME/benchmarks/opencl/$APP" ]; +then APP_PATH=$VORTEX_HOME/benchmarks/opencl/$APP +else + echo "Application folder found: $APP" + exit -1 fi -CONFIGS="-DNUM_CLUSTERS=$CLUSTERS 
-DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2 -DL3_ENABLE=$L3 $PERF_FLAG" +CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2 -DL3_ENABLE=$L3 $PERF_FLAG $CONFIGS" echo "CONFIGS=$CONFIGS" diff --git a/ci/regression.sh b/ci/regression.sh index eb00b259..0e0fb248 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -3,25 +3,62 @@ # exit when any command fails set -e +make -s + # Dogfood tests ./ci/test_runtime.sh ./ci/test_riscv_isa.sh ./ci/test_opencl.sh ./ci/test_driver.sh ./ci/test_simx.sh -./ci/test_compiler.sh -# Build tests disabling extensions +# warp/threads configurations +./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=demo +./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo +./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo + +# cores clustering +./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=demo --args="-n1" +./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=1 --app=demo --args="-n1" +./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1" + +# L2/L3 +./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1" +./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1" +./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=demo --args="-n1" + +# build flags +./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1" +./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --debug --app=demo --args="-n1" +./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1" + +# disabling M extension CONFIGS=-DEXT_M_DISABLE make -C 
hw/simulate + +# disabling F extension CONFIGS=-DEXT_F_DISABLE make -C hw/simulate # disable shared memory CONFIGS=-DSM_ENABLE=0 make -C hw/simulate -# Blackbox tests -./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1" -./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --debug --app=demo --args="-n1" -./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=demo --args="-n1" -./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1" -./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1" -./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=demo --args="-n1" \ No newline at end of file +# using FPNEW core +FPU_CORE=FPU_FPNEW ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood + +# test 128-bit MEM block +CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo + +# test 128-bit MEM and DRAM block +CONFIGS="-DMEM_BLOCK_SIZE=16 -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo + +# test 27-bit DRAM address +CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo + +# test 128-bit DRAM block +CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1 -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo + +# test verilator reset values +CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm +CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm + +# test vlsim memory stress +CONFIGS="-DMEM_LATENCY=100 -DMEM_RQ_SIZE=4 -DMEM_STALLS_MODULO=4" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm diff --git 
a/doc/Cache_Subsystem.md b/doc/Cache_Subsystem.md new file mode 100644 index 00000000..9de9aef7 --- /dev/null +++ b/doc/Cache_Subsystem.md @@ -0,0 +1,70 @@ +# Vortex Cache Subsystem + +The Vortex Cache Subsystem has the following main properties: + +- High-bandwidth with bank parallelism +- Snoop protocol to flush data for CPU access +- Generic design: Dcache, Icache, Shared Memory, L2 cache, L3 cache + +### Cache Hierarchy + +![Image of Cache Hierarchy](./Images/cache_hierarchy.png) + +- Cache can be configured to be any level in the hierarchy +- Caches communicate via snooping +- Cache flush from AFU is passed down the hierarchy + +### VX_cache.v (Top Module) + +VX_cache.v is the top module of the cache Verilog code located in the `/hw/rtl/cache` directory. + +![Image of Vortex Cache](./Images/vortex_cache_top_module.png) + +- Configurable (Cache size, number of banks, bank line size, etc.) +- I/O signals + - Core Request + - Core Rsp + - DRAM Req + - DRAM Rsp + - Snoop Req + - Snoop Rsp + - Snoop Forwarding Out + - Snoop Forwarding In +- Bank Select + - Assigns valid and ready signals for each bank +- Snoop Forwarder +- DRAM Request Arbiter + - Prepares cache response for communication with DRAM +- Snoop Response Arbiter + - Sends snoop response +- Core Response Merge + - Cache accesses one line at a time. As a result, each request may not come back in the same response. This module tries to recombine the responses by thread ID. + +### VX_bank.v + +VX_bank.v is the Verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory. 
+ +![Image of Vortex Cache Bank](./Images/vortex_bank.png) + +- Allows for high throughput +- Each bank contains queues to hold requests to the cache +- I/O signals + - Core request + - Core Response + - DRAM Fill Requests + - DRAM Fill Response + - DRAM WB Requests + - Snp Request + - Snp Response +- Request Priority: DRAM fill, miss reserve, core request, snoop request +- Snoop Request Queue +- DRAM Fill Queue +- Core Req Arbiter + - Requests to be processed by the bank +- Tag Data Store + - Registers for valid, dirty, dirtyb, tag, and data + - Length of registers determined by lines in the bank +- Tag Data Access: + - I/O: stall, snoop info, force request miss + - Writes to cache or sends read response; hit or miss determined here + - A missed request goes to the miss reserve if it is not a snoop request or DRAM fill \ No newline at end of file diff --git a/doc/Codebase.md b/doc/Codebase.md new file mode 100644 index 00000000..3d018855 --- /dev/null +++ b/doc/Codebase.md @@ -0,0 +1,35 @@ +# Vortex Codebase + +The directory/file layout of the Vortex codebase is as follows: + +- `benchmarks`: contains opencl, risc-v, and vector tests + - `opencl`: contains basic kernel operation tests (e.g. 
vector add, transpose, dot product) + - `riscv`: contains official riscv tests which are pre-compiled into binaries + - `vector`: tests for vector instructions (not yet implemented) +- `ci`: contains tests to be run during continuous integration (Travis CI) + - driver, opencl, riscv_isa, and runtime tests +- `driver`: contains driver software implementation (software that is run on the host to communicate with the vortex processor) + - `opae`: contains code for driver that runs on FPGA + - `rtlsim`: contains code for driver that runs on local machine (driver built using verilator which converts rtl to c++ binary) + - `simx`: contains code for driver that runs on local machine (vortex) + - `include`: contains vortex.h which has the vortex API that is used by the drivers +- `runtime`: contains software used inside kernel programs to expose GPGPU capabilities + - `include`: contains vortex API needed for runtime + - `linker`: contains linker file for compiling kernels + - `src`: contains implementation of vortex API (from include folder) + - `tests`: contains runtime tests + - `simple`: contains test for GPGPU functionality allowed in vortex +- `simx`: contains simX, the cycle-approximate simulator for vortex +- `miscs`: contains old code that is no longer used +- `hw`: + - `unit_tests`: contains unit tests for RTL of cache and queue + - `syn`: contains all synthesis scripts (quartus and yosys) + - `quartus`: contains code to synthesize cache, core, pipeline, top, and vortex stand-alone + - `simulate`: contains RTL simulator (verilator) + - `testbench.cpp`: runs either the riscv, runtime, or opencl tests + - `opae`: contains source code for the accelerator functional unit (AFU) and code which programs the fpga + - `rtl`: contains rtl source code + - `cache`: contains cache subsystem code + - `fp_cores`: contains floating point unit code + - `interfaces`: contains code that handles communication for each of the units of the microarchitecture + - `libs`: contains 
general-purpose modules (e.g., buffers, encoders, arbiters, pipe registers) \ No newline at end of file diff --git a/doc/Images/cache_hierarchy.png b/doc/Images/cache_hierarchy.png new file mode 100644 index 00000000..876f5fe2 Binary files /dev/null and b/doc/Images/cache_hierarchy.png differ diff --git a/doc/Images/vortex_bank.png b/doc/Images/vortex_bank.png new file mode 100644 index 00000000..8c10995e Binary files /dev/null and b/doc/Images/vortex_bank.png differ diff --git a/doc/Images/vortex_cache_top_module.png b/doc/Images/vortex_cache_top_module.png new file mode 100644 index 00000000..ecb8be98 Binary files /dev/null and b/doc/Images/vortex_cache_top_module.png differ diff --git a/doc/Images/vortex_microarchitecture_v2.png b/doc/Images/vortex_microarchitecture_v2.png new file mode 100644 index 00000000..c0e85891 Binary files /dev/null and b/doc/Images/vortex_microarchitecture_v2.png differ diff --git a/doc/Microarchitecture.md b/doc/Microarchitecture.md new file mode 100644 index 00000000..1b410066 --- /dev/null +++ b/doc/Microarchitecture.md @@ -0,0 +1,94 @@ +# Vortex Microarchitecture + +### Vortex GPGPU Execution Model + +Vortex uses the SIMT (Single Instruction, Multiple Threads) execution model with a single warp issued per cycle. + +- **Threads** + - Smallest unit of computation + - Each thread has its own register file (32 int + 32 fp registers) + - Threads execute in parallel +- **Warps** + - A logical cluster of threads + - Each thread in a warp executes the same instruction + - The PC is shared; maintain thread mask for Writeback + - Warp's execution is time-multiplexed at log steps + - Ex. 
warp 0 executes at cycle 0, warp 1 executes at cycle 1 + +### Vortex RISC-V ISA Extension + +- **Thread Mask Control** + - Control the number of threads to activate during execution + - `TMC` *count*: activate count threads +- **Warp Scheduling** + - Control the number of warps to activate during execution + - `WSPAWN` *count, addr*: activate count warps and jump to addr location +- **Control-Flow Divergence** + - Control threads to activate when a branch diverges + - `SPLIT` *predicate*: apply 'taken' predicate thread mask and save 'not-taken' into IPDOM stack + - `JOIN`: restore 'not-taken' thread mask +- **Warp Synchronization** + - `BAR` *id, count*: stall warps entering barrier *id* until count is reached + +### Vortex Pipeline/Datapath + +![Image of Vortex Microarchitecture](./Images/vortex_microarchitecture_v2.png) + +Vortex has a 5-stage pipeline: FI | ID | Issue | EX | WB. + +- **Fetch** + - Warp Scheduler + - Track stalled & active warps, resolve branches and barriers, maintain split/join IPDOM stack + - Instruction Cache + - Retrieve instruction from cache, issue I-cache requests/responses +- **Decode** + - Decode fetched instructions, notify warp scheduler when the following instructions are decoded: + - Branch, tmc, split/join, wspawn + - Precompute used_regs mask (needed for Issue stage) +- **Issue** + - Scheduling + - In-order issue (operands/execute unit ready), out-of-order commit + - IBuffer + - Store fetched instructions, separate queues per-warp, selects next warp through round-robin scheduling + - Scoreboard + - Track in-use registers + - GPRs (General-Purpose Registers) stage + - Fetch issued instruction operands and send operands to execute unit +- **Execute** + - ALU Unit + - Single-cycle operations (+,-,>>,<<,&,|,^), Branch instructions (Share ALU resources) + - MULDIV Unit + - Multiplier - done in 2 cycles + - Divider - division and remainder, done in 32 cycles + - Implements serial algorithm (Stalls the pipeline) + - FPU Unit + - 
Multi-cycle operations, uses `FPnew` Library on ASIC, uses hard DSPs on FPGA + - CSR Unit + - Store constant status registers - device caps, FPU status flags, performance counters + - Handle external CSR requests (requests from host CPU) + - LSU Unit + - Handle load/store operations, issue D-cache requests, handle D-cache responses + - Commit load responses - saves storage, Scoreboard tracks completion + - GPGPU Unit + - Handle GPGPU instructions + - TMC, WSPAWN, SPLIT, BAR + - JOIN is handled by Warp Scheduler (upon SPLIT response) +- **Commit** + - Commit + - Update CSR flags, update performance counters + - Writeback + - Write result back to GPRs, notify Scoreboard (release in-use register), select candidate instruction (ALU unit has highest priority) +- **Clustering** + - Group multiple cores into clusters (optionally share L2 cache) + - Group multiple clusters (optionally share L3 cache) + - Configurable at build time + - Default configuration: + - #Clusters = 1 + - #Cores = 4 + - #Warps = 4 + - #Threads = 4 +- **FPGA AFU Interface** + - Manage CPU-GPU communication + - Query device caps, load kernel instructions and resource buffers, start kernel execution, read destination buffers + - Local Memory - GPU access to local DRAM + - Reserved I/O addresses - redirect to host CPU, console output \ No newline at end of file diff --git a/doc/Simulation.md b/doc/Simulation.md index b6861628..dfd042bb 100644 --- a/doc/Simulation.md +++ b/doc/Simulation.md @@ -24,10 +24,9 @@ Running tests under specific drivers (rtlsim,simx,fpga) is done using the script - *L3cache* - used to enable the shared l3cache among the Vortex clusters. - *Driver* - used to specify which driver to run the Vortex simulation (either rtlsim, vlsim, fpga, or simx). - *Debug* - used to enable debug mode for the Vortex simulation. -- *Scope* - -- *Perf* - is used to enable the detailed performance counters within the Vortex simulation. 
-- *App* - is used to specify which test/benchmark to run in the Vortex simulation. The main choices are vecadd, sgemm, basic, demo, and dogfood. Other tests/benchmarks are located in the `/benchmarks/opencl` folder though not all of them work wit the current version of Vortex. -- *Args* - +- *Perf* - used to enable the detailed performance counters within the Vortex simulation. +- *App* - used to specify which test/benchmark to run in the Vortex simulation. The main choices are vecadd, sgemm, basic, demo, and dogfood. Other tests/benchmarks are located in the `/benchmarks/opencl` folder though not all of them work wit the current version of Vortex. +- *Args* - used to pass additional arguments to the application. Example use of command line arguments: Run the sgemm benchmark using the vlsim driver with a Vortex configuration of 1 cluster, 4 cores, 4 warps, and 4 threads. diff --git a/doc/Vortex.md b/doc/Vortex.md index 36846b30..97ff40a2 100644 --- a/doc/Vortex.md +++ b/doc/Vortex.md @@ -2,10 +2,12 @@ ### Table of Contents -- Vortex Architecture +- [Vortex Codebase Layout](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Codebase.md) +- [Vortex Microarchitecture and Extended RISC-V ISA](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Microarchitecture.md) +- [Vortex Cache Subsystem](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Cache_Subsystem.md) - Vortex Software - [Vortex Simulation](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Simulation.md) -- [FPGA](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Flubber_FPGA_Startup_Guide.md) +- [FPGA Configuration, Program and Test](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Flubber_FPGA_Startup_Guide.md) - Debugging - Useful Links diff --git a/driver/Makefile b/driver/Makefile index 58de93a2..e015fbf5 100644 --- a/driver/Makefile +++ b/driver/Makefile @@ -1,4 +1,4 @@ -all: stub rtlsim simx opae +all: stub rtlsim simx opae tests stub: $(MAKE) -C stub @@ 
-12,10 +12,14 @@ rtlsim: simx: $(MAKE) -C simx +tests: + $(MAKE) -C tests + clean: $(MAKE) clean -C stub $(MAKE) clean -C opae $(MAKE) clean -C rtlsim $(MAKE) clean -C simx + $(MAKE) clean -C tests -.PHONY: all stub opae rtlsim simx clean \ No newline at end of file +.PHONY: all stub opae rtlsim simx tests clean \ No newline at end of file diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index c265002d..754cea4b 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -33,6 +33,13 @@ extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_ while (offset < size) { auto chunk_size = std::min(buffer_transfer_size, size - offset); std::memcpy(buf_ptr, (uint8_t*)content + offset, chunk_size); + + /*printf("** Upload Kernel to 0x%0x: data=", kernel_base_addr + offset); + for (int i = 0, n = ((chunk_size+7)/8); i < n; ++i) { + printf("%08x", ((uint64_t*)((uint8_t*)content + offset))[n-1-i]); + } + printf("\n");*/ + err = vx_copy_to_dev(buffer, kernel_base_addr + offset, chunk_size, 0); if (err != 0) { vx_buf_release(buffer); @@ -115,10 +122,10 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t smem_writes = 0; uint64_t smem_bank_stalls = 0; // PERF: memory - uint64_t dram_reads = 0; - uint64_t dram_writes = 0; - uint64_t dram_stalls = 0; - uint64_t dram_lat = 0; + uint64_t mem_reads = 0; + uint64_t mem_writes = 0; + uint64_t mem_stalls = 0; + uint64_t mem_lat = 0; #endif for (unsigned core_id = 0; core_id < num_cores; ++core_id) { @@ -255,21 +262,21 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { if (num_cores > 1) fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_st_per_core, smem_bank_utilization); smem_bank_stalls += smem_bank_st_per_core; - // PERF: DRAM - uint64_t dram_reads_per_core, dram_writes_per_core, dram_stalls_per_core, dram_lat_per_core; - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_READS, CSR_MPM_DRAM_READS_H, 
&dram_reads_per_core); - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_WRITES, CSR_MPM_DRAM_WRITES_H, &dram_writes_per_core); - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_ST, CSR_MPM_DRAM_ST_H, &dram_stalls_per_core); - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core); - int dram_utilization = (int)((double(dram_reads_per_core + dram_writes_per_core) / double(dram_reads_per_core + dram_writes_per_core + dram_stalls_per_core)) * 100); - int dram_avg_lat = (int)(double(dram_lat_per_core) / double(dram_reads_per_core)); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, (dram_reads_per_core + dram_writes_per_core), dram_reads_per_core, dram_writes_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dram stalls=%ld (utilization=%d%%)\n", core_id, dram_stalls_per_core, dram_utilization); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dram average latency=%d cycles\n", core_id, dram_avg_lat); - dram_reads += dram_reads_per_core; - dram_writes += dram_writes_per_core; - dram_stalls += dram_stalls_per_core; - dram_lat += dram_lat_per_core; + // PERF: memory + uint64_t mem_reads_per_core, mem_writes_per_core, mem_stalls_per_core, mem_lat_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_READS, CSR_MPM_MEM_READS_H, &mem_reads_per_core); + ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_WRITES, CSR_MPM_MEM_WRITES_H, &mem_writes_per_core); + ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_ST, CSR_MPM_MEM_ST_H, &mem_stalls_per_core); + ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_LAT, CSR_MPM_MEM_LAT_H, &mem_lat_per_core); + int mem_utilization = (int)((double(mem_reads_per_core + mem_writes_per_core) / double(mem_reads_per_core + mem_writes_per_core + mem_stalls_per_core)) * 100); + int mem_avg_lat = (int)(double(mem_lat_per_core) / double(mem_reads_per_core)); + if (num_cores > 1) fprintf(stream, "PERF: core%d: memory requests=%ld 
(reads=%ld, writes=%ld)\n", core_id, (mem_reads_per_core + mem_writes_per_core), mem_reads_per_core, mem_writes_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: memory stalls=%ld (utilization=%d%%)\n", core_id, mem_stalls_per_core, mem_utilization); + if (num_cores > 1) fprintf(stream, "PERF: core%d: memory average latency=%d cycles\n", core_id, mem_avg_lat); + mem_reads += mem_reads_per_core; + mem_writes += mem_writes_per_core; + mem_stalls += mem_stalls_per_core; + mem_lat += mem_lat_per_core; #endif } @@ -282,8 +289,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100); int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100); int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100); - int dram_utilization = (int)((double(dram_reads + dram_writes) / double(dram_reads + dram_writes + dram_stalls)) * 100); - int dram_avg_lat = (int)(double(dram_lat) / double(dram_reads)); + int mem_utilization = (int)((double(mem_reads + mem_writes) / double(mem_reads + mem_writes + mem_stalls)) * 100); + int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads)); fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls); fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls); fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls); @@ -306,9 +313,9 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { fprintf(stream, "PERF: smem reads=%ld\n", smem_reads); fprintf(stream, "PERF: smem writes=%ld\n", smem_writes); fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization); - fprintf(stream, "PERF: dram requests=%ld (reads=%ld, writes=%ld)\n", (dram_reads + dram_writes), dram_reads, dram_writes); - fprintf(stream, "PERF: 
dram stalls=%ld (utilization=%d%%)\n", dram_stalls, dram_utilization); - fprintf(stream, "PERF: dram average latency=%d cycles\n", dram_avg_lat); + fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes); + fprintf(stream, "PERF: memory stalls=%ld (utilization=%d%%)\n", mem_stalls, mem_utilization); + fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat); #endif return ret; diff --git a/driver/opae/vlsim/Makefile b/driver/opae/vlsim/Makefile index 932dac05..d531ca13 100644 --- a/driver/opae/vlsim/Makefile +++ b/driver/opae/vlsim/Makefile @@ -1,8 +1,7 @@ CFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors -CFLAGS += -Wno-aligned-new -Wno-maybe-uninitialized - +CFLAGS += -DUSE_VLSIM -fPIC -Wno-maybe-uninitialized CFLAGS += -I../../../../hw # control RTL debug print states @@ -13,7 +12,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA -DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM +DBG_PRINT_FLAGS += -DDBG_PRINT_MEM DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_PRINT_FLAGS += -DDBG_PRINT_AVS DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE @@ -22,15 +21,9 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_TEX DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CACHE_REQ_INFO -#CONFIGS ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 -#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 -#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1 -CFLAGS += -fPIC - -CFLAGS += -DUSE_VLSIM $(CONFIGS) - +CFLAGS += $(CONFIGS) CFLAGS += -DDUMP_PERF_STATS LDFLAGS += -shared -pthread @@ -49,10 +42,11 @@ TEX_INCLUDE = -I$(RTL_DIR)/tex_unit RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE) RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip -VL_FLAGS += 
-O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS) -VL_FLAGS += -Wno-DECLFILENAME +VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic +VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO VL_FLAGS += --x-initial unique --x-assign unique VL_FLAGS += verilator.vlt +VL_FLAGS += $(CONFIGS) # Enable Verilator multithreaded simulation #THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') @@ -83,16 +77,20 @@ endif VL_FLAGS += -DNOPAE CFLAGS += -DNOPAE -# use DPI FPU -VL_FLAGS += -DFPU_DPI +# FPU backend +FPU_CORE ?= FPU_DPI +VL_FLAGS += -D$(FPU_CORE) PROJECT = libopae-c-vlsim.so all: $(PROJECT) + +vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh + ../../../hw/scripts/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h -$(PROJECT): $(SRCS) +$(PROJECT): $(SRCS) vortex_afu.h verilator --exe --cc $(TOP) --top-module $(TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT) make -j -C obj_dir -f V$(TOP).mk clean: - rm -rf $(PROJECT) obj_dir ../scope-defs.h $(RTL_DIR)/scope-defs.vh + rm -rf $(PROJECT) obj_dir ../scope-defs.h $(RTL_DIR)/scope-defs.vh vortex_afu.h diff --git a/driver/opae/vlsim/opae_sim.cpp b/driver/opae/vlsim/opae_sim.cpp index 0844ee17..9f8df01d 100644 --- a/driver/opae/vlsim/opae_sim.cpp +++ b/driver/opae/vlsim/opae_sim.cpp @@ -10,10 +10,23 @@ #define RESET_DELAY 4 -#define ENABLE_DRAM_STALLS -#define DRAM_LATENCY 24 -#define DRAM_RQ_SIZE 16 -#define DRAM_STALLS_MODULO 16 +#define ENABLE_MEM_STALLS + +#ifndef MEM_LATENCY +#define MEM_LATENCY 24 +#endif + +#ifndef MEM_RQ_SIZE +#define MEM_RQ_SIZE 16 +#endif + +#ifndef MEM_STALLS_MODULO +#define MEM_STALLS_MODULO 16 +#endif + +#ifndef VERILATOR_RESET_VALUE +#define VERILATOR_RESET_VALUE 2 +#endif uint64_t timestamp = 0; @@ -23,7 +36,7 @@ double sc_time_stamp() { opae_sim::opae_sim() { // force random values for unitialized signals - Verilated::randReset(2); + Verilated::randReset(VERILATOR_RESET_VALUE); 
Verilated::randSeed(50); // Turn off assertion before reset @@ -137,16 +150,19 @@ void opae_sim::flush() { void opae_sim::reset() { - host_buffers_.clear(); - dram_reads_.clear(); + host_buffers_.clear(); cci_reads_.clear(); cci_writes_.clear(); vortex_afu_->vcp2af_sRxPort_c0_rspValid = 0; vortex_afu_->vcp2af_sRxPort_c1_rspValid = 0; vortex_afu_->vcp2af_sRxPort_c0_TxAlmFull = 0; vortex_afu_->vcp2af_sRxPort_c1_TxAlmFull = 0; - vortex_afu_->avs_readdatavalid = 0; - vortex_afu_->avs_waitrequest = 0; + + for (int b = 0; b < PLATFORM_PARAM_LOCAL_MEMORY_BANKS; ++b) { + mem_reads_[b].clear(); + vortex_afu_->avs_readdatavalid[b] = 0; + vortex_afu_->avs_waitrequest[b] = 0; + } vortex_afu_->reset = 1; @@ -268,84 +284,89 @@ void opae_sim::sTxPort_bus() { } void opae_sim::avs_bus() { - // update DRAM responses schedule - for (auto& rsp : dram_reads_) { - if (rsp.cycles_left > 0) - rsp.cycles_left -= 1; - } - - // schedule DRAM responses in FIFO order - std::list::iterator dram_rd_it(dram_reads_.end()); - if (!dram_reads_.empty() - && (0 == dram_reads_.begin()->cycles_left)) { - dram_rd_it = dram_reads_.begin(); - } - - // send DRAM response - vortex_afu_->avs_readdatavalid = 0; - if (dram_rd_it != dram_reads_.end()) { - vortex_afu_->avs_readdatavalid = 1; - memcpy(vortex_afu_->avs_readdata, dram_rd_it->data.data(), CACHE_BLOCK_SIZE); - uint32_t addr = dram_rd_it->addr; - dram_reads_.erase(dram_rd_it); - /*printf("%0ld: [sim] DRAM Rd Rsp: addr=%x, pending={", timestamp, addr * CACHE_BLOCK_SIZE); - for (auto& req : dram_reads_) { - if (req.cycles_left != 0) - printf(" !%0x", req.addr * CACHE_BLOCK_SIZE); - else - printf(" %0x", req.addr * CACHE_BLOCK_SIZE); + for (int b = 0; b < PLATFORM_PARAM_LOCAL_MEMORY_BANKS; ++b) { + // update memory responses schedule + for (auto& rsp : mem_reads_[b]) { + if (rsp.cycles_left > 0) + rsp.cycles_left -= 1; } - printf("}\n");*/ - } - // handle DRAM stalls - bool dram_stalled = false; -#ifdef ENABLE_DRAM_STALLS - if (0 == ((timestamp/2) % 
DRAM_STALLS_MODULO)) { - dram_stalled = true; - } else - if (dram_reads_.size() >= DRAM_RQ_SIZE) { - dram_stalled = true; - } -#endif - - // process DRAM requests - if (!dram_stalled) { - assert(!vortex_afu_->avs_read || !vortex_afu_->avs_write); - if (vortex_afu_->avs_write) { - assert(0 == vortex_afu_->mem_bank_select); - uint64_t byteen = vortex_afu_->avs_byteenable; - unsigned base_addr = (vortex_afu_->avs_address * CACHE_BLOCK_SIZE); - uint8_t* data = (uint8_t*)(vortex_afu_->avs_writedata); - for (int i = 0; i < CACHE_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - ram_[base_addr + i] = data[i]; - } - } + // schedule memory responses in FIFO order + std::list::iterator mem_rd_it(mem_reads_[b].end()); + if (!mem_reads_[b].empty() + && (0 == mem_reads_[b].begin()->cycles_left)) { + mem_rd_it = mem_reads_[b].begin(); } - if (vortex_afu_->avs_read) { - assert(0 == vortex_afu_->mem_bank_select); - dram_rd_req_t dram_req; - dram_req.addr = vortex_afu_->avs_address; - ram_.read(vortex_afu_->avs_address * CACHE_BLOCK_SIZE, CACHE_BLOCK_SIZE, dram_req.data.data()); - dram_req.cycles_left = DRAM_LATENCY; - for (auto& rsp : dram_reads_) { - if (dram_req.addr == rsp.addr) { - dram_req.cycles_left = rsp.cycles_left; - break; - } - } - dram_reads_.emplace_back(dram_req); - /*printf("%0ld: [sim] DRAM Rd Req: addr=%x, pending={", timestamp, dram_req.addr * CACHE_BLOCK_SIZE); - for (auto& req : dram_reads_) { + + // send memory response + vortex_afu_->avs_readdatavalid[b] = 0; + if (mem_rd_it != mem_reads_[b].end()) { + vortex_afu_->avs_readdatavalid[b] = 1; + memcpy(vortex_afu_->avs_readdata[b], mem_rd_it->data.data(), MEM_BLOCK_SIZE); + uint32_t addr = mem_rd_it->addr; + mem_reads_[b].erase(mem_rd_it); + /*printf("%0ld: [sim] MEM Rd Rsp: addr=%x, pending={", timestamp, addr * MEM_BLOCK_SIZE); + for (auto& req : mem_reads_[b]) { if (req.cycles_left != 0) - printf(" !%0x", req.addr * CACHE_BLOCK_SIZE); + printf(" !%0x", req.addr * MEM_BLOCK_SIZE); else - printf(" %0x", 
req.addr * CACHE_BLOCK_SIZE); + printf(" %0x", req.addr * MEM_BLOCK_SIZE); } printf("}\n");*/ - } - } + } - vortex_afu_->avs_waitrequest = dram_stalled; + // handle memory stalls + bool mem_stalled = false; + #ifdef ENABLE_MEM_STALLS + if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { + mem_stalled = true; + } else + if (mem_reads_[b].size() >= MEM_RQ_SIZE) { + mem_stalled = true; + } + #endif + + // process memory requests + if (!mem_stalled) { + assert(!vortex_afu_->avs_read[b] || !vortex_afu_->avs_write[b]); + if (vortex_afu_->avs_write[b]) { + uint64_t byteen = vortex_afu_->avs_byteenable[b]; + unsigned base_addr = vortex_afu_->avs_address[b] * MEM_BLOCK_SIZE; + uint8_t* data = (uint8_t*)(vortex_afu_->avs_writedata[b]); + for (int i = 0; i < MEM_BLOCK_SIZE; i++) { + if ((byteen >> i) & 0x1) { + ram_[base_addr + i] = data[i]; + } + } + /*printf("%0ld: [sim] MEM Wr Req: addr=%x, data=", timestamp, base_addr); + for (int i = 0; i < MEM_BLOCK_SIZE; i++) { + printf("%0x", data[(MEM_BLOCK_SIZE-1)-i]); + } + printf("\n");*/ + } + if (vortex_afu_->avs_read[b]) { + mem_rd_req_t mem_req; + mem_req.addr = vortex_afu_->avs_address[b]; + ram_.read(vortex_afu_->avs_address[b] * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE, mem_req.data.data()); + mem_req.cycles_left = MEM_LATENCY; + for (auto& rsp : mem_reads_[b]) { + if (mem_req.addr == rsp.addr) { + mem_req.cycles_left = rsp.cycles_left; + break; + } + } + mem_reads_[b].emplace_back(mem_req); + /*printf("%0ld: [sim] MEM Rd Req: addr=%x, pending={", timestamp, mem_req.addr * MEM_BLOCK_SIZE); + for (auto& req : mem_reads_[b]) { + if (req.cycles_left != 0) + printf(" !%0x", req.addr * MEM_BLOCK_SIZE); + else + printf(" %0x", req.addr * MEM_BLOCK_SIZE); + } + printf("}\n");*/ + } + } + + vortex_afu_->avs_waitrequest[b] = mem_stalled; + } } \ No newline at end of file diff --git a/driver/opae/vlsim/opae_sim.h b/driver/opae/vlsim/opae_sim.h index ad08019e..e8ecd4a3 100644 --- a/driver/opae/vlsim/opae_sim.h +++ b/driver/opae/vlsim/opae_sim.h @@ 
-1,14 +1,16 @@ #pragma once +#include "verilated.h" +//#include "verilated_stub.h" #include "Vvortex_afu_shim.h" #include "Vvortex_afu_shim__Syms.h" -#include "verilated.h" #ifdef VCD_OUTPUT #include #endif #include +#include "vortex_afu.h" #include "ram.h" #include @@ -16,7 +18,10 @@ #include #include -#define CACHE_BLOCK_SIZE 64 +#undef MEM_BLOCK_SIZE +#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8) + +#define CACHE_BLOCK_SIZE 64 class opae_sim { public: @@ -40,9 +45,9 @@ private: typedef struct { int cycles_left; - std::array data; + std::array data; uint32_t addr; - } dram_rd_req_t; + } mem_rd_req_t; typedef struct { int cycles_left; @@ -77,7 +82,7 @@ private: std::unordered_map host_buffers_; - std::list dram_reads_; + std::list mem_reads_ [PLATFORM_PARAM_LOCAL_MEMORY_BANKS]; std::list cci_reads_; diff --git a/driver/opae/vlsim/vortex_afu_shim.sv b/driver/opae/vlsim/vortex_afu_shim.sv index 0bcabd03..9255bfaa 100644 --- a/driver/opae/vlsim/vortex_afu_shim.sv +++ b/driver/opae/vlsim/vortex_afu_shim.sv @@ -1,13 +1,16 @@ -`include "VX_define.vh" +`include "VX_platform.vh" +`IGNORE_WARNINGS_BEGIN `include "vortex_afu.vh" +`IGNORE_WARNINGS_END + /* verilator lint_off IMPORTSTAR */ import ccip_if_pkg::*; import local_mem_cfg_pkg::*; -/* verilator lint_on IMPORTSTAR */ +/* verilator lint_on IMPORTSTAR */ -module vortex_afu_shim #( - parameter NUM_LOCAL_MEM_BANKS = 2 -) ( +`include "VX_define.vh" + +module vortex_afu_shim ( // global signals input clk, input reset, @@ -69,24 +72,22 @@ module vortex_afu_shim #( output t_ccip_mmioData af2cp_sTxPort_c2_data, // Avalon signals for local memory access - output t_local_mem_data avs_writedata, - input t_local_mem_data avs_readdata, - output t_local_mem_addr avs_address, - input logic avs_waitrequest, - output logic avs_write, - output logic avs_read, - output t_local_mem_byte_mask avs_byteenable, - output t_local_mem_burst_cnt avs_burstcount, - input avs_readdatavalid, - - output logic 
[$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select + output t_local_mem_data avs_writedata [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], + input t_local_mem_data avs_readdata [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], + output t_local_mem_addr avs_address [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], + input logic avs_waitrequest [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], + output logic avs_write [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], + output logic avs_read [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], + output t_local_mem_byte_mask avs_byteenable [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], + output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], + input avs_readdatavalid [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS] ); t_if_ccip_Rx cp2af_sRxPort; t_if_ccip_Tx af2cp_sTxPort; vortex_afu #( - .NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS) + .NUM_LOCAL_MEM_BANKS(`PLATFORM_PARAM_LOCAL_MEMORY_BANKS) ) afu ( .clk(clk), .reset(reset), @@ -100,8 +101,7 @@ vortex_afu #( .avs_read(avs_read), .avs_byteenable(avs_byteenable), .avs_burstcount(avs_burstcount), - .avs_readdatavalid(avs_readdatavalid), - .mem_bank_select(mem_bank_select) + .avs_readdatavalid(avs_readdatavalid) ); t_if_ccip_c0_RxHdr c0_RxHdr; diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 78f1741c..de96a83c 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -1,33 +1,29 @@ CFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors -CFLAGS += -fPIC -Wno-aligned-new -Wno-maybe-uninitialized +CFLAGS += -DUSE_RTLSIM -fPIC -Wno-maybe-uninitialized CFLAGS += -I../../include -I../../../hw/simulate -I../../../hw # control RTL debug print states DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE -#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK -#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR -#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG -#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA -#DBG_PRINT_FLAGS += 
-DDBG_PRINT_DRAM -#DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE -#DBG_PRINT_FLAGS += -DDBG_PRINT_AVS -#DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE +DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK +DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR +DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG +DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA +DBG_PRINT_FLAGS += -DDBG_PRINT_MEM +DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE +DBG_PRINT_FLAGS += -DDBG_PRINT_AVS +DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE DBG_PRINT_FLAGS += -DDBG_PRINT_TEX DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CACHE_REQ_INFO -#CONFIGS ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 -#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 -#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1 CFLAGS += $(CONFIGS) - CFLAGS += -DDUMP_PERF_STATS LDFLAGS += -shared -pthread @@ -45,10 +41,11 @@ FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src TEX_INCLUDE = -I$(RTL_DIR)/tex_unit RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE) -VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS) -VL_FLAGS += -Wno-DECLFILENAME +VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic +VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO VL_FLAGS += --x-initial unique --x-assign unique VL_FLAGS += verilator.vlt +VL_FLAGS += $(CONFIGS) # Enable Verilator multithreaded simulation #THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') @@ -69,8 +66,9 @@ ifdef PERF CFLAGS += -DPERF_ENABLE endif -# use DPI FPU -VL_FLAGS += -DFPU_DPI +# FPU backend +FPU_CORE ?= FPU_DPI +VL_FLAGS += -D$(FPU_CORE) PROJECT = libvortex.so # PROJECT = libvortex.dylib diff --git a/driver/simx/Makefile b/driver/simx/Makefile index fb3a23e2..154404d0 100644 --- a/driver/simx/Makefile +++ b/driver/simx/Makefile @@ -6,16 +6,13 @@ SIMX_DIR = ../../simX CXXFLAGS += -std=c++11 -O2 -Wall -Wextra 
-Wfatal-errors #CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors -CXXFLAGS += -fPIC -Wno-aligned-new -Wno-maybe-uninitialized +CXXFLAGS += -DUSE_SIMX -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I../include -I../../hw -I$(SIMX_DIR) -CXXFLAGS += -DDUMP_PERF_STATS -#CONFIGS ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 -#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 -#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1 CXXFLAGS += $(CONFIGS) +CXXFLAGS += -DDUMP_PERF_STATS LDFLAGS += -shared -pthread #LDFLAGS += -dynamiclib -pthread diff --git a/driver/tests/Makefile b/driver/tests/Makefile index 5e4b3382..40394bc9 100644 --- a/driver/tests/Makefile +++ b/driver/tests/Makefile @@ -2,19 +2,23 @@ all: $(MAKE) -C basic $(MAKE) -C demo $(MAKE) -C dogfood + $(MAKE) -C stress run: $(MAKE) -C basic run-vlsim $(MAKE) -C demo run-vlsim $(MAKE) -C dogfood run-vlsim + $(MAKE) -C stress run-vlsim clean: $(MAKE) -C basic clean $(MAKE) -C demo clean $(MAKE) -C dogfood clean + $(MAKE) -C stress clean clean-all: $(MAKE) -C basic clean-all $(MAKE) -C demo clean-all $(MAKE) -C dogfood clean-all + $(MAKE) -C stress clean-all diff --git a/driver/tests/basic/main.cpp b/driver/tests/basic/main.cpp index fb232c7c..586173eb 100755 --- a/driver/tests/basic/main.cpp +++ b/driver/tests/basic/main.cpp @@ -23,7 +23,7 @@ int test = -1; uint32_t count = 0; vx_device_h device = nullptr; -vx_buffer_h buffer = nullptr; +vx_buffer_h staging_buf = nullptr; static void show_usage() { std::cout << "Vortex Driver Test." 
<< std::endl; @@ -56,8 +56,8 @@ static void parse_args(int argc, char **argv) { } void cleanup() { - if (buffer) { - vx_buf_release(buffer); + if (staging_buf) { + vx_buf_release(staging_buf); } if (device) { vx_dev_close(device); @@ -77,38 +77,38 @@ int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) { // update source buffer for (int i = 0; i < num_blocks_8; ++i) { - ((uint64_t*)vx_host_ptr(buffer))[i] = shuffle(i, value); + ((uint64_t*)vx_host_ptr(staging_buf))[i] = shuffle(i, value); } /*for (int i = 0; i < num_blocks; ++i) { std::cout << "data[" << i << "]=0x"; for (int j = 7; j >= 0; --j) { - std::cout << std::hex << ((uint64_t*)vx_host_ptr(buffer))[i * 8 +j]; + std::cout << std::hex << ((uint64_t*)vx_host_ptr(staging_buf))[i * 8 +j]; } std::cout << std::endl; }*/ - // write buffer to local memory - std::cout << "write buffer to local memory" << std::endl; + // write source buffer to local memory + std::cout << "write source buffer to local memory" << std::endl; auto t0 = std::chrono::high_resolution_clock::now(); - RT_CHECK(vx_copy_to_dev(buffer, dev_addr, 64 * num_blocks, 0)); + RT_CHECK(vx_copy_to_dev(staging_buf, dev_addr, 64 * num_blocks, 0)); auto t1 = std::chrono::high_resolution_clock::now(); // clear destination buffer for (int i = 0; i < num_blocks_8; ++i) { - ((uint64_t*)vx_host_ptr(buffer))[i] = 0; + ((uint64_t*)vx_host_ptr(staging_buf))[i] = 0; } - // read buffer from local memory - std::cout << "read buffer from local memory" << std::endl; + // read destination buffer from local memory + std::cout << "read destination buffer from local memory" << std::endl; auto t2 = std::chrono::high_resolution_clock::now(); - RT_CHECK(vx_copy_from_dev(buffer, dev_addr, 64 * num_blocks, 0)); + RT_CHECK(vx_copy_from_dev(staging_buf, dev_addr, 64 * num_blocks, 0)); auto t3 = std::chrono::high_resolution_clock::now(); // verify result std::cout << "verify result" << std::endl; for (int i = 0; i < num_blocks_8; ++i) { - auto curr = 
((uint64_t*)vx_host_ptr(buffer))[i]; + auto curr = ((uint64_t*)vx_host_ptr(staging_buf))[i]; auto ref = shuffle(i, value); if (curr != ref) { std::cout << "error at 0x" << std::hex << (dev_addr + 8 * i) @@ -145,25 +145,25 @@ int run_kernel_test(const kernel_arg_t& kernel_arg, // update source buffer { - auto buf_ptr = (int32_t*)vx_host_ptr(buffer); + auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf); for (uint32_t i = 0; i < num_points; ++i) { buf_ptr[i] = i; } } std::cout << "upload source buffer" << std::endl; auto t0 = std::chrono::high_resolution_clock::now(); - RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, buf_size, 0)); + RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_ptr, buf_size, 0)); auto t1 = std::chrono::high_resolution_clock::now(); // clear destination buffer { - auto buf_ptr = (int32_t*)vx_host_ptr(buffer); + auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf); for (uint32_t i = 0; i < num_points; ++i) { buf_ptr[i] = 0xdeadbeef; } } std::cout << "clear destination buffer" << std::endl; - RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, buf_size, 0)); + RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0)); // start device std::cout << "start execution" << std::endl; @@ -172,17 +172,17 @@ int run_kernel_test(const kernel_arg_t& kernel_arg, RT_CHECK(vx_ready_wait(device, -1)); auto t3 = std::chrono::high_resolution_clock::now(); - // read buffer from local memory - std::cout << "read buffer from local memory" << std::endl; + // read destination buffer from local memory + std::cout << "read destination buffer from local memory" << std::endl; auto t4 = std::chrono::high_resolution_clock::now(); - RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0)); + RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0)); auto t5 = std::chrono::high_resolution_clock::now(); // verify result std::cout << "verify result" << std::endl; for (uint32_t i = 0; i < num_points; ++i) { - int32_t curr = 
((int32_t*)vx_host_ptr(buffer))[i]; + int32_t curr = ((int32_t*)vx_host_ptr(staging_buf))[i]; int32_t ref = i; if (curr != ref) { std::cout << "error at result #" << i @@ -233,8 +233,8 @@ int main(int argc, char *argv[]) { unsigned max_cores; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); uint32_t num_points = 1 * count; - uint32_t num_blocks = (num_points * sizeof(uint32_t) + 63) / 64; - uint32_t buf_size = num_blocks * 64; + uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64; + uint32_t buf_size = num_blocks * 64; std::cout << "number of points: " << num_points << std::endl; std::cout << "buffer size: " << buf_size << " bytes" << std::endl; @@ -253,7 +253,7 @@ int main(int argc, char *argv[]) { // allocate shared memory std::cout << "allocate shared memory" << std::endl; uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); - RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer)); + RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &staging_buf)); // run tests if (0 == test || -1 == test) { @@ -269,9 +269,9 @@ int main(int argc, char *argv[]) { // upload kernel argument std::cout << "upload kernel argument" << std::endl; { - auto buf_ptr = (void*)vx_host_ptr(buffer); + auto buf_ptr = (void*)vx_host_ptr(staging_buf); memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); + RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); } std::cout << "run kernel test" << std::endl; diff --git a/driver/tests/demo/main.cpp b/driver/tests/demo/main.cpp index 41483458..7261869c 100644 --- a/driver/tests/demo/main.cpp +++ b/driver/tests/demo/main.cpp @@ -20,7 +20,7 @@ const char* kernel_file = "kernel.bin"; uint32_t count = 0; vx_device_h device = nullptr; -vx_buffer_h buffer = nullptr; +vx_buffer_h staging_buf = nullptr; static void show_usage() { std::cout << "Vortex Driver Test." 
<< std::endl; @@ -50,8 +50,8 @@ static void parse_args(int argc, char **argv) { } void cleanup() { - if (buffer) { - vx_buf_release(buffer); + if (staging_buf) { + vx_buf_release(staging_buf); } if (device) { vx_dev_close(device); @@ -71,13 +71,13 @@ int run_test(const kernel_arg_t& kernel_arg, // download destination buffer std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0)); + RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0)); // verify result std::cout << "verify result" << std::endl; { int errors = 0; - auto buf_ptr = (int32_t*)vx_host_ptr(buffer); + auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf); for (uint32_t i = 0; i < num_points; ++i) { int ref = i + i; int cur = buf_ptr[i]; @@ -119,7 +119,7 @@ int main(int argc, char *argv[]) { uint32_t num_tasks = max_cores * max_warps * max_threads; uint32_t num_points = count * num_tasks; - uint32_t buf_size = num_points * sizeof(uint32_t); + uint32_t buf_size = num_points * sizeof(int32_t); std::cout << "number of points: " << num_points << std::endl; std::cout << "buffer size: " << buf_size << " bytes" << std::endl; @@ -148,45 +148,45 @@ int main(int argc, char *argv[]) { // allocate shared memory std::cout << "allocate shared memory" << std::endl; uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); - RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer)); + RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &staging_buf)); // upload kernel argument std::cout << "upload kernel argument" << std::endl; { - auto buf_ptr = (int*)vx_host_ptr(buffer); + auto buf_ptr = (int*)vx_host_ptr(staging_buf); memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); + RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); } // upload source buffer0 { - auto buf_ptr = (int32_t*)vx_host_ptr(buffer); + auto 
buf_ptr = (int32_t*)vx_host_ptr(staging_buf); for (uint32_t i = 0; i < num_points; ++i) { buf_ptr[i] = i-1; } } std::cout << "upload source buffer0" << std::endl; - RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src0_ptr, buf_size, 0)); + RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_ptr, buf_size, 0)); // upload source buffer1 { - auto buf_ptr = (int32_t*)vx_host_ptr(buffer); + auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf); for (uint32_t i = 0; i < num_points; ++i) { buf_ptr[i] = i+1; } } std::cout << "upload source buffer1" << std::endl; - RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src1_ptr, buf_size, 0)); + RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_ptr, buf_size, 0)); // clear destination buffer { - auto buf_ptr = (int32_t*)vx_host_ptr(buffer); + auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf); for (uint32_t i = 0; i < num_points; ++i) { buf_ptr[i] = 0xdeadbeef; } } std::cout << "clear destination buffer" << std::endl; - RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, buf_size, 0)); + RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0)); // run tests std::cout << "run tests" << std::endl; diff --git a/driver/tests/stress/Makefile b/driver/tests/stress/Makefile new file mode 100644 index 00000000..59fd181b --- /dev/null +++ b/driver/tests/stress/Makefile @@ -0,0 +1,67 @@ +RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain +VORTEX_RT_PATH ?= $(wildcard ../../../runtime) + +OPTS ?= -n64 + +VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc +VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ +VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump +VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy + +VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections +VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw + +VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections 
$(VORTEX_RT_PATH)/libvortexrt.a + +VX_SRCS = kernel.c + +#CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors +CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors + +CXXFLAGS += -I../../include + +PROJECT = stress + +SRCS = main.cpp + +all: $(PROJECT) kernel.bin kernel.dump + +kernel.dump: kernel.elf + $(VX_DP) -D kernel.elf > kernel.dump + +kernel.bin: kernel.elf + $(VX_CP) -O binary kernel.elf kernel.bin + +kernel.elf: $(VX_SRCS) + $(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf + +$(PROJECT): $(SRCS) + $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../stub -lvortex -o $@ + +run-fpga: $(PROJECT) + LD_LIBRARY_PATH=../../opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-asesim: $(PROJECT) + ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-vlsim: $(PROJECT) + LD_LIBRARY_PATH=../../opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-rtlsim: $(PROJECT) + LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-simx: $(PROJECT) + LD_LIBRARY_PATH=../../simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +.depend: $(SRCS) + $(CXX) $(CXXFLAGS) -MM $^ > .depend; + +clean: + rm -rf $(PROJECT) *.o .depend + +clean-all: clean + rm -rf *.elf *.bin *.dump + +ifneq ($(MAKECMDGOALS),clean) + -include .depend +endif \ No newline at end of file diff --git a/driver/tests/stress/common.h b/driver/tests/stress/common.h new file mode 100644 index 00000000..843a4a4c --- /dev/null +++ b/driver/tests/stress/common.h @@ -0,0 +1,17 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 + +#define NUM_LOADS 8 + +struct kernel_arg_t { + uint32_t num_tasks; + uint32_t size; + uint32_t stride; + uint32_t addr_ptr; + uint32_t src_ptr; + uint32_t dst_ptr; +}; + +#endif \ No newline at end of file diff --git a/driver/tests/stress/kernel.bin b/driver/tests/stress/kernel.bin new file mode 100755 index 00000000..9aec1063 Binary files /dev/null and 
b/driver/tests/stress/kernel.bin differ diff --git a/driver/tests/stress/kernel.c b/driver/tests/stress/kernel.c new file mode 100644 index 00000000..c40cb11c --- /dev/null +++ b/driver/tests/stress/kernel.c @@ -0,0 +1,29 @@ +#include +#include +#include +#include "common.h" + +void kernel_body(int task_id, void* arg) { + struct kernel_arg_t* _arg = (struct kernel_arg_t*)(arg); + uint32_t stride = _arg->stride; + uint32_t* addr_ptr = (uint32_t*)_arg->addr_ptr; + float* src_ptr = (float*)_arg->src_ptr; + float* dst_ptr = (float*)_arg->dst_ptr; + + uint32_t offset = task_id * stride; + + for (uint32_t i = 0; i < stride; ++i) { + float value = 0.0f; + for (uint32_t j = 0; j < NUM_LOADS; ++j) { + uint32_t addr = offset + i + j; + uint32_t index = addr_ptr[addr]; + value *= src_ptr[index]; + } + dst_ptr[offset+i] = value; + } +} + +void main() { + struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_tasks, kernel_body, arg); +} \ No newline at end of file diff --git a/driver/tests/stress/kernel.dump b/driver/tests/stress/kernel.dump new file mode 100644 index 00000000..21e810bf --- /dev/null +++ b/driver/tests/stress/kernel.dump @@ -0,0 +1,596 @@ + +kernel.elf: file format elf32-littleriscv + + +Disassembly of section .init: + +80000000 <_start>: +80000000: 00000597 auipc a1,0x0 +80000004: 17058593 addi a1,a1,368 # 80000170 +80000008: fc102573 csrr a0,0xfc1 +8000000c: 00b5106b 0xb5106b +80000010: 160000ef jal ra,80000170 +80000014: 00100513 li a0,1 +80000018: 0005006b 0x5006b +8000001c: 00002517 auipc a0,0x2 +80000020: ba850513 addi a0,a0,-1112 # 80001bc4 +80000024: 00002617 auipc a2,0x2 +80000028: c2060613 addi a2,a2,-992 # 80001c44 <__BSS_END__> +8000002c: 40a60633 sub a2,a2,a0 +80000030: 00000593 li a1,0 +80000034: 4c0000ef jal ra,800004f4 +80000038: 00000517 auipc a0,0x0 +8000003c: 3c450513 addi a0,a0,964 # 800003fc <__libc_fini_array> +80000040: 374000ef jal ra,800003b4 +80000044: 414000ef jal ra,80000458 
<__libc_init_array> +80000048: 008000ef jal ra,80000050
+8000004c: 37c0006f j 800003c8 + +Disassembly of section .text: + +80000050
: +80000050: 7ffff7b7 lui a5,0x7ffff +80000054: 0007a503 lw a0,0(a5) # 7ffff000 <__stack_size+0x7fffec00> +80000058: 800005b7 lui a1,0x80000 +8000005c: 7ffff637 lui a2,0x7ffff +80000060: 08058593 addi a1,a1,128 # 80000080 <__stack_top+0x81000080> +80000064: 2080006f j 8000026c + +80000068 : +80000068: 00000793 li a5,0 +8000006c: 00078863 beqz a5,8000007c +80000070: 80000537 lui a0,0x80000 +80000074: 3fc50513 addi a0,a0,1020 # 800003fc <__stack_top+0x810003fc> +80000078: 33c0006f j 800003b4 +8000007c: 00008067 ret + +80000080 : +80000080: 0085a783 lw a5,8(a1) +80000084: 00c5a603 lw a2,12(a1) +80000088: 0105a703 lw a4,16(a1) +8000008c: 02f506b3 mul a3,a0,a5 +80000090: 0145a883 lw a7,20(a1) +80000094: 0c078863 beqz a5,80000164 +80000098: 00d78833 add a6,a5,a3 +8000009c: f0000653 fmv.w.x fa2,zero +800000a0: 00269693 slli a3,a3,0x2 +800000a4: 00281813 slli a6,a6,0x2 +800000a8: 00c686b3 add a3,a3,a2 +800000ac: 00c80833 add a6,a6,a2 +800000b0: 40c888b3 sub a7,a7,a2 +800000b4: 0006a583 lw a1,0(a3) +800000b8: 0086a603 lw a2,8(a3) +800000bc: 00c6a503 lw a0,12(a3) +800000c0: 00259593 slli a1,a1,0x2 +800000c4: 00b705b3 add a1,a4,a1 +800000c8: 0005a787 flw fa5,0(a1) +800000cc: 0046a583 lw a1,4(a3) +800000d0: 00261613 slli a2,a2,0x2 +800000d4: 10f677d3 fmul.s fa5,fa2,fa5 +800000d8: 00259593 slli a1,a1,0x2 +800000dc: 00b705b3 add a1,a4,a1 +800000e0: 0005a687 flw fa3,0(a1) +800000e4: 00c70633 add a2,a4,a2 +800000e8: 00062707 flw fa4,0(a2) # 7ffff000 <__stack_size+0x7fffec00> +800000ec: 10d7f7d3 fmul.s fa5,fa5,fa3 +800000f0: 00251513 slli a0,a0,0x2 +800000f4: 00a70533 add a0,a4,a0 +800000f8: 0106a583 lw a1,16(a3) +800000fc: 0146a603 lw a2,20(a3) +80000100: 10e7f7d3 fmul.s fa5,fa5,fa4 +80000104: 00052707 flw fa4,0(a0) +80000108: 00259593 slli a1,a1,0x2 +8000010c: 00b705b3 add a1,a4,a1 +80000110: 0005a687 flw fa3,0(a1) +80000114: 10e7f7d3 fmul.s fa5,fa5,fa4 +80000118: 00261613 slli a2,a2,0x2 +8000011c: 00c70633 add a2,a4,a2 +80000120: 00062707 flw fa4,0(a2) +80000124: 0186a583 lw 
a1,24(a3) +80000128: 10d7f7d3 fmul.s fa5,fa5,fa3 +8000012c: 01c6a603 lw a2,28(a3) +80000130: 00259593 slli a1,a1,0x2 +80000134: 00b705b3 add a1,a4,a1 +80000138: 00261613 slli a2,a2,0x2 +8000013c: 10e7f7d3 fmul.s fa5,fa5,fa4 +80000140: 0005a707 flw fa4,0(a1) +80000144: 00c70633 add a2,a4,a2 +80000148: 00d887b3 add a5,a7,a3 +8000014c: 00468693 addi a3,a3,4 +80000150: 10e7f7d3 fmul.s fa5,fa5,fa4 +80000154: 00062707 flw fa4,0(a2) +80000158: 10f777d3 fmul.s fa5,fa4,fa5 +8000015c: 00f7a027 fsw fa5,0(a5) +80000160: f4d81ae3 bne a6,a3,800000b4 +80000164: 00008067 ret + +80000168 <_exit>: +80000168: 00000513 li a0,0 +8000016c: 0005006b 0x5006b + +80000170 : +80000170: fc002573 csrr a0,0xfc0 +80000174: 0005006b 0x5006b +80000178: 00002197 auipc gp,0x2 +8000017c: e2018193 addi gp,gp,-480 # 80001f98 <__global_pointer> +80000180: 7f000117 auipc sp,0x7f000 +80000184: e8010113 addi sp,sp,-384 # ff000000 <__stack_top> +80000188: 40000593 li a1,1024 +8000018c: cc102673 csrr a2,0xcc1 +80000190: 02c585b3 mul a1,a1,a2 +80000194: 40b10133 sub sp,sp,a1 +80000198: cc3026f3 csrr a3,0xcc3 +8000019c: 00068663 beqz a3,800001a8 +800001a0: 00000513 li a0,0 +800001a4: 0005006b 0x5006b + +800001a8 : +800001a8: 00008067 ret + +800001ac : +800001ac: fe010113 addi sp,sp,-32 +800001b0: 00112e23 sw ra,28(sp) +800001b4: 00812c23 sw s0,24(sp) +800001b8: 00912a23 sw s1,20(sp) +800001bc: 01212823 sw s2,16(sp) +800001c0: 01312623 sw s3,12(sp) +800001c4: fc0027f3 csrr a5,0xfc0 +800001c8: 0007806b 0x7806b +800001cc: cc5027f3 csrr a5,0xcc5 +800001d0: cc3029f3 csrr s3,0xcc3 +800001d4: cc002773 csrr a4,0xcc0 +800001d8: fc002673 csrr a2,0xfc0 +800001dc: 00279693 slli a3,a5,0x2 +800001e0: 800027b7 lui a5,0x80002 +800001e4: bc478793 addi a5,a5,-1084 # 80001bc4 <__stack_top+0x81001bc4> +800001e8: 00d787b3 add a5,a5,a3 +800001ec: 0007a483 lw s1,0(a5) +800001f0: 0104a403 lw s0,16(s1) +800001f4: 00c4a683 lw a3,12(s1) +800001f8: 0089a933 slt s2,s3,s0 +800001fc: 00040793 mv a5,s0 +80000200: 00d90933 add s2,s2,a3 
+80000204: 03368433 mul s0,a3,s3 +80000208: 00f9d463 bge s3,a5,80000210 +8000020c: 00098793 mv a5,s3 +80000210: 00f40433 add s0,s0,a5 +80000214: 0084a683 lw a3,8(s1) +80000218: 02c40433 mul s0,s0,a2 +8000021c: 02e907b3 mul a5,s2,a4 +80000220: 00d40433 add s0,s0,a3 +80000224: 00f40433 add s0,s0,a5 +80000228: 00890933 add s2,s2,s0 +8000022c: 01245e63 bge s0,s2,80000248 +80000230: 0004a783 lw a5,0(s1) +80000234: 0044a583 lw a1,4(s1) +80000238: 00040513 mv a0,s0 +8000023c: 00140413 addi s0,s0,1 +80000240: 000780e7 jalr a5 +80000244: fe8916e3 bne s2,s0,80000230 +80000248: 0019b993 seqz s3,s3 +8000024c: 0009806b 0x9806b +80000250: 01c12083 lw ra,28(sp) +80000254: 01812403 lw s0,24(sp) +80000258: 01412483 lw s1,20(sp) +8000025c: 01012903 lw s2,16(sp) +80000260: 00c12983 lw s3,12(sp) +80000264: 02010113 addi sp,sp,32 +80000268: 00008067 ret + +8000026c : +8000026c: fc010113 addi sp,sp,-64 +80000270: 02112e23 sw ra,60(sp) +80000274: 02812c23 sw s0,56(sp) +80000278: 02912a23 sw s1,52(sp) +8000027c: 03212823 sw s2,48(sp) +80000280: 03312623 sw s3,44(sp) +80000284: fc2026f3 csrr a3,0xfc2 +80000288: fc102873 csrr a6,0xfc1 +8000028c: fc002473 csrr s0,0xfc0 +80000290: cc5027f3 csrr a5,0xcc5 +80000294: 01f00713 li a4,31 +80000298: 0cf74463 blt a4,a5,80000360 +8000029c: 030408b3 mul a7,s0,a6 +800002a0: 00100713 li a4,1 +800002a4: 00a8d463 bge a7,a0,800002ac +800002a8: 03154733 div a4,a0,a7 +800002ac: 0ce6c863 blt a3,a4,8000037c +800002b0: 0ae7d863 bge a5,a4,80000360 +800002b4: fff68693 addi a3,a3,-1 +800002b8: 02e54333 div t1,a0,a4 +800002bc: 00030893 mv a7,t1 +800002c0: 00f69663 bne a3,a5,800002cc +800002c4: 02e56533 rem a0,a0,a4 +800002c8: 006508b3 add a7,a0,t1 +800002cc: 0288c4b3 div s1,a7,s0 +800002d0: 0288e933 rem s2,a7,s0 +800002d4: 0b04ca63 blt s1,a6,80000388 +800002d8: 00100693 li a3,1 +800002dc: 0304c733 div a4,s1,a6 +800002e0: 00070663 beqz a4,800002ec +800002e4: 00070693 mv a3,a4 +800002e8: 0304e733 rem a4,s1,a6 +800002ec: 800029b7 lui s3,0x80002 +800002f0: bc498993 addi 
s3,s3,-1084 # 80001bc4 <__stack_top+0x81001bc4> +800002f4: 00e12e23 sw a4,28(sp) +800002f8: 00c10713 addi a4,sp,12 +800002fc: 00b12623 sw a1,12(sp) +80000300: 00c12823 sw a2,16(sp) +80000304: 00d12c23 sw a3,24(sp) +80000308: 02f30333 mul t1,t1,a5 +8000030c: 00279793 slli a5,a5,0x2 +80000310: 00f987b3 add a5,s3,a5 +80000314: 00e7a023 sw a4,0(a5) +80000318: 00612a23 sw t1,20(sp) +8000031c: 06904c63 bgtz s1,80000394 +80000320: 04090063 beqz s2,80000360 +80000324: 02848433 mul s0,s1,s0 +80000328: 00812a23 sw s0,20(sp) +8000032c: 0009006b 0x9006b +80000330: cc5027f3 csrr a5,0xcc5 +80000334: cc202573 csrr a0,0xcc2 +80000338: 00279793 slli a5,a5,0x2 +8000033c: 00f989b3 add s3,s3,a5 +80000340: 0009a783 lw a5,0(s3) +80000344: 0087a683 lw a3,8(a5) +80000348: 0007a703 lw a4,0(a5) +8000034c: 0047a583 lw a1,4(a5) +80000350: 00d50533 add a0,a0,a3 +80000354: 000700e7 jalr a4 +80000358: 00100793 li a5,1 +8000035c: 0007806b 0x7806b +80000360: 03c12083 lw ra,60(sp) +80000364: 03812403 lw s0,56(sp) +80000368: 03412483 lw s1,52(sp) +8000036c: 03012903 lw s2,48(sp) +80000370: 02c12983 lw s3,44(sp) +80000374: 04010113 addi sp,sp,64 +80000378: 00008067 ret +8000037c: 00068713 mv a4,a3 +80000380: f2e7cae3 blt a5,a4,800002b4 +80000384: fddff06f j 80000360 +80000388: 00000713 li a4,0 +8000038c: 00100693 li a3,1 +80000390: f5dff06f j 800002ec +80000394: 00048713 mv a4,s1 +80000398: 00985463 bge a6,s1,800003a0 +8000039c: 00080713 mv a4,a6 +800003a0: 800007b7 lui a5,0x80000 +800003a4: 1ac78793 addi a5,a5,428 # 800001ac <__stack_top+0x810001ac> +800003a8: 00f7106b 0xf7106b +800003ac: e01ff0ef jal ra,800001ac +800003b0: f71ff06f j 80000320 + +800003b4 : +800003b4: 00050593 mv a1,a0 +800003b8: 00000693 li a3,0 +800003bc: 00000613 li a2,0 +800003c0: 00000513 li a0,0 +800003c4: 20c0006f j 800005d0 <__register_exitproc> + +800003c8 : +800003c8: ff010113 addi sp,sp,-16 +800003cc: 00000593 li a1,0 +800003d0: 00812423 sw s0,8(sp) +800003d4: 00112623 sw ra,12(sp) +800003d8: 00050413 mv s0,a0 +800003dc: 
290000ef jal ra,8000066c <__call_exitprocs> +800003e0: 800027b7 lui a5,0x80002 +800003e4: bc07a503 lw a0,-1088(a5) # 80001bc0 <__stack_top+0x81001bc0> +800003e8: 03c52783 lw a5,60(a0) +800003ec: 00078463 beqz a5,800003f4 +800003f0: 000780e7 jalr a5 +800003f4: 00040513 mv a0,s0 +800003f8: d71ff0ef jal ra,80000168 <_exit> + +800003fc <__libc_fini_array>: +800003fc: ff010113 addi sp,sp,-16 +80000400: 00812423 sw s0,8(sp) +80000404: 800017b7 lui a5,0x80001 +80000408: 80001437 lui s0,0x80001 +8000040c: 79440413 addi s0,s0,1940 # 80001794 <__stack_top+0x81001794> +80000410: 79478793 addi a5,a5,1940 # 80001794 <__stack_top+0x81001794> +80000414: 408787b3 sub a5,a5,s0 +80000418: 00912223 sw s1,4(sp) +8000041c: 00112623 sw ra,12(sp) +80000420: 4027d493 srai s1,a5,0x2 +80000424: 02048063 beqz s1,80000444 <__libc_fini_array+0x48> +80000428: ffc78793 addi a5,a5,-4 +8000042c: 00878433 add s0,a5,s0 +80000430: 00042783 lw a5,0(s0) +80000434: fff48493 addi s1,s1,-1 +80000438: ffc40413 addi s0,s0,-4 +8000043c: 000780e7 jalr a5 +80000440: fe0498e3 bnez s1,80000430 <__libc_fini_array+0x34> +80000444: 00c12083 lw ra,12(sp) +80000448: 00812403 lw s0,8(sp) +8000044c: 00412483 lw s1,4(sp) +80000450: 01010113 addi sp,sp,16 +80000454: 00008067 ret + +80000458 <__libc_init_array>: +80000458: ff010113 addi sp,sp,-16 +8000045c: 00812423 sw s0,8(sp) +80000460: 01212023 sw s2,0(sp) +80000464: 80001437 lui s0,0x80001 +80000468: 80001937 lui s2,0x80001 +8000046c: 79040793 addi a5,s0,1936 # 80001790 <__stack_top+0x81001790> +80000470: 79090913 addi s2,s2,1936 # 80001790 <__stack_top+0x81001790> +80000474: 40f90933 sub s2,s2,a5 +80000478: 00112623 sw ra,12(sp) +8000047c: 00912223 sw s1,4(sp) +80000480: 40295913 srai s2,s2,0x2 +80000484: 02090063 beqz s2,800004a4 <__libc_init_array+0x4c> +80000488: 79040413 addi s0,s0,1936 +8000048c: 00000493 li s1,0 +80000490: 00042783 lw a5,0(s0) +80000494: 00148493 addi s1,s1,1 +80000498: 00440413 addi s0,s0,4 +8000049c: 000780e7 jalr a5 +800004a0: fe9918e3 bne 
s2,s1,80000490 <__libc_init_array+0x38> +800004a4: 80001437 lui s0,0x80001 +800004a8: 80001937 lui s2,0x80001 +800004ac: 79040793 addi a5,s0,1936 # 80001790 <__stack_top+0x81001790> +800004b0: 79490913 addi s2,s2,1940 # 80001794 <__stack_top+0x81001794> +800004b4: 40f90933 sub s2,s2,a5 +800004b8: 40295913 srai s2,s2,0x2 +800004bc: 02090063 beqz s2,800004dc <__libc_init_array+0x84> +800004c0: 79040413 addi s0,s0,1936 +800004c4: 00000493 li s1,0 +800004c8: 00042783 lw a5,0(s0) +800004cc: 00148493 addi s1,s1,1 +800004d0: 00440413 addi s0,s0,4 +800004d4: 000780e7 jalr a5 +800004d8: fe9918e3 bne s2,s1,800004c8 <__libc_init_array+0x70> +800004dc: 00c12083 lw ra,12(sp) +800004e0: 00812403 lw s0,8(sp) +800004e4: 00412483 lw s1,4(sp) +800004e8: 00012903 lw s2,0(sp) +800004ec: 01010113 addi sp,sp,16 +800004f0: 00008067 ret + +800004f4 : +800004f4: 00f00313 li t1,15 +800004f8: 00050713 mv a4,a0 +800004fc: 02c37e63 bgeu t1,a2,80000538 +80000500: 00f77793 andi a5,a4,15 +80000504: 0a079063 bnez a5,800005a4 +80000508: 08059263 bnez a1,8000058c +8000050c: ff067693 andi a3,a2,-16 +80000510: 00f67613 andi a2,a2,15 +80000514: 00e686b3 add a3,a3,a4 +80000518: 00b72023 sw a1,0(a4) +8000051c: 00b72223 sw a1,4(a4) +80000520: 00b72423 sw a1,8(a4) +80000524: 00b72623 sw a1,12(a4) +80000528: 01070713 addi a4,a4,16 +8000052c: fed766e3 bltu a4,a3,80000518 +80000530: 00061463 bnez a2,80000538 +80000534: 00008067 ret +80000538: 40c306b3 sub a3,t1,a2 +8000053c: 00269693 slli a3,a3,0x2 +80000540: 00000297 auipc t0,0x0 +80000544: 005686b3 add a3,a3,t0 +80000548: 00c68067 jr 12(a3) +8000054c: 00b70723 sb a1,14(a4) +80000550: 00b706a3 sb a1,13(a4) +80000554: 00b70623 sb a1,12(a4) +80000558: 00b705a3 sb a1,11(a4) +8000055c: 00b70523 sb a1,10(a4) +80000560: 00b704a3 sb a1,9(a4) +80000564: 00b70423 sb a1,8(a4) +80000568: 00b703a3 sb a1,7(a4) +8000056c: 00b70323 sb a1,6(a4) +80000570: 00b702a3 sb a1,5(a4) +80000574: 00b70223 sb a1,4(a4) +80000578: 00b701a3 sb a1,3(a4) +8000057c: 00b70123 sb a1,2(a4) 
+80000580: 00b700a3 sb a1,1(a4) +80000584: 00b70023 sb a1,0(a4) +80000588: 00008067 ret +8000058c: 0ff5f593 andi a1,a1,255 +80000590: 00859693 slli a3,a1,0x8 +80000594: 00d5e5b3 or a1,a1,a3 +80000598: 01059693 slli a3,a1,0x10 +8000059c: 00d5e5b3 or a1,a1,a3 +800005a0: f6dff06f j 8000050c +800005a4: 00279693 slli a3,a5,0x2 +800005a8: 00000297 auipc t0,0x0 +800005ac: 005686b3 add a3,a3,t0 +800005b0: 00008293 mv t0,ra +800005b4: fa0680e7 jalr -96(a3) +800005b8: 00028093 mv ra,t0 +800005bc: ff078793 addi a5,a5,-16 +800005c0: 40f70733 sub a4,a4,a5 +800005c4: 00f60633 add a2,a2,a5 +800005c8: f6c378e3 bgeu t1,a2,80000538 +800005cc: f3dff06f j 80000508 + +800005d0 <__register_exitproc>: +800005d0: 800027b7 lui a5,0x80002 +800005d4: bc07a703 lw a4,-1088(a5) # 80001bc0 <__stack_top+0x81001bc0> +800005d8: 14872783 lw a5,328(a4) +800005dc: 04078c63 beqz a5,80000634 <__register_exitproc+0x64> +800005e0: 0047a703 lw a4,4(a5) +800005e4: 01f00813 li a6,31 +800005e8: 06e84e63 blt a6,a4,80000664 <__register_exitproc+0x94> +800005ec: 00271813 slli a6,a4,0x2 +800005f0: 02050663 beqz a0,8000061c <__register_exitproc+0x4c> +800005f4: 01078333 add t1,a5,a6 +800005f8: 08c32423 sw a2,136(t1) +800005fc: 1887a883 lw a7,392(a5) +80000600: 00100613 li a2,1 +80000604: 00e61633 sll a2,a2,a4 +80000608: 00c8e8b3 or a7,a7,a2 +8000060c: 1917a423 sw a7,392(a5) +80000610: 10d32423 sw a3,264(t1) +80000614: 00200693 li a3,2 +80000618: 02d50463 beq a0,a3,80000640 <__register_exitproc+0x70> +8000061c: 00170713 addi a4,a4,1 +80000620: 00e7a223 sw a4,4(a5) +80000624: 010787b3 add a5,a5,a6 +80000628: 00b7a423 sw a1,8(a5) +8000062c: 00000513 li a0,0 +80000630: 00008067 ret +80000634: 14c70793 addi a5,a4,332 +80000638: 14f72423 sw a5,328(a4) +8000063c: fa5ff06f j 800005e0 <__register_exitproc+0x10> +80000640: 18c7a683 lw a3,396(a5) +80000644: 00170713 addi a4,a4,1 +80000648: 00e7a223 sw a4,4(a5) +8000064c: 00c6e633 or a2,a3,a2 +80000650: 18c7a623 sw a2,396(a5) +80000654: 010787b3 add a5,a5,a6 +80000658: 
00b7a423 sw a1,8(a5) +8000065c: 00000513 li a0,0 +80000660: 00008067 ret +80000664: fff00513 li a0,-1 +80000668: 00008067 ret + +8000066c <__call_exitprocs>: +8000066c: fd010113 addi sp,sp,-48 +80000670: 800027b7 lui a5,0x80002 +80000674: 01412c23 sw s4,24(sp) +80000678: bc07aa03 lw s4,-1088(a5) # 80001bc0 <__stack_top+0x81001bc0> +8000067c: 03212023 sw s2,32(sp) +80000680: 02112623 sw ra,44(sp) +80000684: 148a2903 lw s2,328(s4) +80000688: 02812423 sw s0,40(sp) +8000068c: 02912223 sw s1,36(sp) +80000690: 01312e23 sw s3,28(sp) +80000694: 01512a23 sw s5,20(sp) +80000698: 01612823 sw s6,16(sp) +8000069c: 01712623 sw s7,12(sp) +800006a0: 01812423 sw s8,8(sp) +800006a4: 04090063 beqz s2,800006e4 <__call_exitprocs+0x78> +800006a8: 00050b13 mv s6,a0 +800006ac: 00058b93 mv s7,a1 +800006b0: 00100a93 li s5,1 +800006b4: fff00993 li s3,-1 +800006b8: 00492483 lw s1,4(s2) +800006bc: fff48413 addi s0,s1,-1 +800006c0: 02044263 bltz s0,800006e4 <__call_exitprocs+0x78> +800006c4: 00249493 slli s1,s1,0x2 +800006c8: 009904b3 add s1,s2,s1 +800006cc: 040b8463 beqz s7,80000714 <__call_exitprocs+0xa8> +800006d0: 1044a783 lw a5,260(s1) +800006d4: 05778063 beq a5,s7,80000714 <__call_exitprocs+0xa8> +800006d8: fff40413 addi s0,s0,-1 +800006dc: ffc48493 addi s1,s1,-4 +800006e0: ff3416e3 bne s0,s3,800006cc <__call_exitprocs+0x60> +800006e4: 02c12083 lw ra,44(sp) +800006e8: 02812403 lw s0,40(sp) +800006ec: 02412483 lw s1,36(sp) +800006f0: 02012903 lw s2,32(sp) +800006f4: 01c12983 lw s3,28(sp) +800006f8: 01812a03 lw s4,24(sp) +800006fc: 01412a83 lw s5,20(sp) +80000700: 01012b03 lw s6,16(sp) +80000704: 00c12b83 lw s7,12(sp) +80000708: 00812c03 lw s8,8(sp) +8000070c: 03010113 addi sp,sp,48 +80000710: 00008067 ret +80000714: 00492783 lw a5,4(s2) +80000718: 0044a683 lw a3,4(s1) +8000071c: fff78793 addi a5,a5,-1 +80000720: 04878e63 beq a5,s0,8000077c <__call_exitprocs+0x110> +80000724: 0004a223 sw zero,4(s1) +80000728: fa0688e3 beqz a3,800006d8 <__call_exitprocs+0x6c> +8000072c: 18892783 lw 
a5,392(s2) +80000730: 008a9733 sll a4,s5,s0 +80000734: 00492c03 lw s8,4(s2) +80000738: 00f777b3 and a5,a4,a5 +8000073c: 02079263 bnez a5,80000760 <__call_exitprocs+0xf4> +80000740: 000680e7 jalr a3 +80000744: 00492703 lw a4,4(s2) +80000748: 148a2783 lw a5,328(s4) +8000074c: 01871463 bne a4,s8,80000754 <__call_exitprocs+0xe8> +80000750: f92784e3 beq a5,s2,800006d8 <__call_exitprocs+0x6c> +80000754: f80788e3 beqz a5,800006e4 <__call_exitprocs+0x78> +80000758: 00078913 mv s2,a5 +8000075c: f5dff06f j 800006b8 <__call_exitprocs+0x4c> +80000760: 18c92783 lw a5,396(s2) +80000764: 0844a583 lw a1,132(s1) +80000768: 00f77733 and a4,a4,a5 +8000076c: 00071c63 bnez a4,80000784 <__call_exitprocs+0x118> +80000770: 000b0513 mv a0,s6 +80000774: 000680e7 jalr a3 +80000778: fcdff06f j 80000744 <__call_exitprocs+0xd8> +8000077c: 00892223 sw s0,4(s2) +80000780: fa9ff06f j 80000728 <__call_exitprocs+0xbc> +80000784: 00058513 mv a0,a1 +80000788: 000680e7 jalr a3 +8000078c: fb9ff06f j 80000744 <__call_exitprocs+0xd8> + +Disassembly of section .init_array: + +80001790 <__init_array_start>: +80001790: 0068 addi a0,sp,12 +80001792: 8000 0x8000 + +Disassembly of section .data: + +80001798 : +80001798: 0000 unimp +8000179a: 0000 unimp +8000179c: 1a84 addi s1,sp,368 +8000179e: 8000 0x8000 +800017a0: 1aec addi a1,sp,380 +800017a2: 8000 0x8000 +800017a4: 1b54 addi a3,sp,436 +800017a6: 8000 0x8000 + ... +80001840: 0001 nop +80001842: 0000 unimp +80001844: 0000 unimp +80001846: 0000 unimp +80001848: 330e fld ft6,224(sp) +8000184a: abcd j 80001e3c <__BSS_END__+0x1f8> +8000184c: 1234 addi a3,sp,296 +8000184e: e66d bnez a2,80001938 +80001850: deec sw a1,124(a3) +80001852: 0005 c.nop 1 +80001854: 0000000b 0xb + ... + +Disassembly of section .sdata: + +80001bc0 <_global_impure_ptr>: +80001bc0: 1798 addi a4,sp,992 +80001bc2: 8000 0x8000 + +Disassembly of section .bss: + +80001bc4 : + ... 
+ +Disassembly of section .comment: + +00000000 <.comment>: + 0: 3a434347 fmsub.d ft6,ft6,ft4,ft7,rmm + 4: 2820 fld fs0,80(s0) + 6: 29554e47 fmsub.s ft8,fa0,fs5,ft5,rmm + a: 3120 fld fs0,96(a0) + c: 2e30 fld fa2,88(a2) + e: 2e32 fld ft8,264(sp) + 10: 0030 addi a2,sp,8 + +Disassembly of section .riscv.attributes: + +00000000 <.riscv.attributes>: + 0: 2941 jal 490 <__stack_size+0x90> + 2: 0000 unimp + 4: 7200 flw fs0,32(a2) + 6: 7369 lui t1,0xffffa + 8: 01007663 bgeu zero,a6,14 <__stack_usage+0x14> + c: 001f 0000 1004 0x10040000001f + 12: 7205 lui tp,0xfffe1 + 14: 3376 fld ft6,376(sp) + 16: 6932 flw fs2,12(sp) + 18: 7032 flw ft0,44(sp) + 1a: 5f30 lw a2,120(a4) + 1c: 326d jal fffff9c6 <__stack_top+0xfff9c6> + 1e: 3070 fld fa2,224(s0) + 20: 665f 7032 0030 0x307032665f + 26: 0108 addi a0,sp,128 + 28: 0b0a slli s6,s6,0x2 diff --git a/driver/tests/stress/kernel.elf b/driver/tests/stress/kernel.elf new file mode 100755 index 00000000..b107e920 Binary files /dev/null and b/driver/tests/stress/kernel.elf differ diff --git a/driver/tests/stress/main.cpp b/driver/tests/stress/main.cpp new file mode 100644 index 00000000..1329d5ba --- /dev/null +++ b/driver/tests/stress/main.cpp @@ -0,0 +1,293 @@ +#include +#include +#include +#include +#include "common.h" +#include +#include +#include +#include + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +union Float_t { + float f; + int i; + struct { + uint32_t man : 23; + uint32_t exp : 8; + uint32_t sign : 1; + } parts; +}; + +inline float fround(float x, int32_t precision = 8) { + auto power_of_10 = std::pow(10, precision); + return std::round(x * power_of_10) / power_of_10; +} + +inline bool almost_equal_eps(float a, float b, int ulp = 128) { + auto eps = std::numeric_limits::epsilon() * 
(std::max(fabs(a), fabs(b)) * ulp); + auto d = fabs(a - b); + if (d > eps) { + std::cout << "*** almost_equal_eps: d=" << d << ", eps=" << eps << std::endl; + return false; + } + return true; +} + +inline bool almost_equal_ulp(float a, float b, int32_t ulp = 6) { + Float_t fa{a}, fb{b}; + auto d = std::abs(fa.i - fb.i); + if (d > ulp) { + std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl; + return false; + } + return true; +} + +inline bool almost_equal(float a, float b) { + if (a == b) + return true; + /*if (almost_equal_eps(a, b)) + return true;*/ + return almost_equal_ulp(a, b); +} + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.bin"; +uint32_t count = 0; + +std::vector test_data; +std::vector addr_table; + +vx_device_h device = nullptr; +vx_buffer_h staging_buf = nullptr; + +static void show_usage() { + std::cout << "Vortex Driver Test." 
<< std::endl; + std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h?")) != -1) { + switch (c) { + case 'n': + count = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (staging_buf) { + vx_buf_release(staging_buf); + } + if (device) { + vx_dev_close(device); + } +} + +void gen_input_data(uint32_t num_points) { + test_data.resize(num_points); + addr_table.resize(num_points + NUM_LOADS - 1); + + for (uint32_t i = 0; i < test_data.size(); ++i) { + float r = static_cast(std::rand()) / RAND_MAX; + test_data[i] = r; + } + + for (uint32_t i = 0; i < addr_table.size(); ++i) { + float r = static_cast(std::rand()) / RAND_MAX; + uint32_t index = static_cast(r * num_points); + assert(index < num_points); + addr_table[i] = index; + } +} + +int run_test(const kernel_arg_t& kernel_arg, + uint32_t dst_buf_size, + uint32_t num_points) { + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, -1)); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, dst_buf_size, 0)); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (float*)vx_host_ptr(staging_buf); + + for (uint32_t i = 0; i < num_points; ++i) { + + float ref = 0.0f; + for (uint32_t j = 0; j < NUM_LOADS; ++j) { + uint32_t addr = i + j; + uint32_t index = addr_table.at(addr); + float value = test_data.at(index); + //printf("*** [%d] addr=%d, index=%d, value=%f\n", i, addr, index, value); + ref *= value; + } + + float cur = buf_ptr[i]; + if (!almost_equal(cur, 
ref)) { + std::cout << "error at result #" << std::dec << i + << ": actual " << cur << ", expected " << ref << std::endl; + ++errors; + } + } + + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } + + return 0; +} + +int main(int argc, char *argv[]) { + size_t value; + kernel_arg_t kernel_arg; + + // parse command arguments + parse_args(argc, argv); + + if (count == 0) { + count = 1; + } + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + unsigned max_cores, max_warps, max_threads; + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); + + uint32_t num_tasks = max_cores * max_warps * max_threads; + uint32_t num_points = count * num_tasks; + + // generate input data + gen_input_data(num_points); + + uint32_t addr_buf_size = addr_table.size() * sizeof(int32_t); + uint32_t src_buf_size = test_data.size() * sizeof(int32_t); + uint32_t dst_buf_size = test_data.size() * sizeof(int32_t); + + std::cout << "number of points: " << num_points << std::endl; + std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + + RT_CHECK(vx_alloc_dev_mem(device, addr_buf_size, &value)); + kernel_arg.addr_ptr = value; + RT_CHECK(vx_alloc_dev_mem(device, src_buf_size, &value)); + kernel_arg.src_ptr = value; + RT_CHECK(vx_alloc_dev_mem(device, dst_buf_size, &value)); + kernel_arg.dst_ptr = value; + + kernel_arg.num_tasks = num_tasks; + kernel_arg.stride = count; + + std::cout << "dev_addr=" << std::hex << kernel_arg.addr_ptr << std::endl; + 
std::cout << "dev_src=" << std::hex << kernel_arg.src_ptr << std::endl; + std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl; + + // allocate shared memory + std::cout << "allocate shared memory" << std::endl; + uint32_t staging_buf_size = std::max(src_buf_size, + std::max(addr_buf_size, + std::max(dst_buf_size, + sizeof(kernel_arg_t)))); + RT_CHECK(vx_alloc_shared_mem(device, staging_buf_size, &staging_buf)); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + { + auto buf_ptr = (int*)vx_host_ptr(staging_buf); + memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); + } + + // upload source buffer0 + { + auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf); + for (uint32_t i = 0; i < addr_table.size(); ++i) { + buf_ptr[i] = addr_table.at(i); + } + } + std::cout << "upload address buffer" << std::endl; + RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.addr_ptr, addr_buf_size, 0)); + + // upload source buffer1 + { + auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf); + for (uint32_t i = 0; i < test_data.size(); ++i) { + buf_ptr[i] = test_data.at(i); + } + } + std::cout << "upload source buffer" << std::endl; + RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_ptr, src_buf_size, 0)); + + // clear destination buffer + { + auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf); + for (uint32_t i = 0; i < test_data.size(); ++i) { + buf_ptr[i] = 0xdeadbeef; + } + } + std::cout << "clear destination buffer" << std::endl; + RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, dst_buf_size, 0)); + + // run tests + std::cout << "run tests" << std::endl; + RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points)); + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + std::cout << "PASSED!" 
<< std::endl; + + return 0; +} \ No newline at end of file diff --git a/evaluation/scripts/README.txt b/evaluation/scripts/README.txt index 908b0147..dbf20831 100644 --- a/evaluation/scripts/README.txt +++ b/evaluation/scripts/README.txt @@ -5,19 +5,16 @@ Description: Makes the build in the opae directory with the specified core exists, a make clean command is ran before the build. Script waits until the inteldev script or quartus program is finished running. -Usage: ./build.sh -c [1|2|4|8|16] [-p perf] [-w wait] +Usage: ./build.sh -c [1|2|4|8|16] [-p [y|n]] Options: -c Core count (1, 2, 4, 8, or 16). -p - Performance profiling enable. Changes the source file in the + Performance profiling enable (y or n). Changes the source file in the opae directory to include/exclude "+define+PERF_ENABLE". - -w - Wait for the build to complete - _______________________________________________________________________________ @@ -27,6 +24,7 @@ Description: Runs build.sh with performance profiling enabled for all valid core configurations. _______________________________________________________________________________ +_______________________________________________________________________________ -program_fpga.sh- @@ -41,6 +39,7 @@ Options: Core count (1, 2, 4, 8, or 16). _______________________________________________________________________________ +_______________________________________________________________________________ -gather_perf_results.sh- @@ -65,3 +64,53 @@ _______________________________________________________________________________ Description: Programs fpga and runs gather_perf_results.sh for all valid core configurations. All builds should already be made before running this. + +_______________________________________________________________________________ +_______________________________________________________________________________ + + +-export_csv.sh- + +Description: Creates specified .csv output file from an input directory, file, +and parameter. 
The .csv file contains two columns: cores, and the input +parameter. The output file is located within the directory specified with -d. + +Usage: ./export_csv.sh -c [cores] -d [directory] -i [input filename] -o + [output filename] -p '[parameter]' + +Example: ./export_csv.sh -c 16 -d perf_2021_03_07 -i sgemm.result -o output.csv + -p 'PERF: scoreboard stalls' + +Options: + -c + Upper limit of cores to be read in. Core directories should exist in + the directory specified by -d e.g. 1c, 2c, 4c for -c 4. + + -d + The directory of the form perf_{date} located in the evaluation + directory. + + -i + The input filename located in each core directory within the + directory specified by -d. + + -o + The output filename to be created within the directory specified + by -d. + + -p + The parameter corresponding to the core count in the .csv file. The + full name of the parameter from the start of the line should be + inputted to avoid the parameter name being matched multiple times. + +_______________________________________________________________________________ + + +-export_ipc_csv.sh- + +Description: Runs export_csv.sh for the parameter IPC. + +Usage: ./export_ipc_csv.sh -c [cores] -d [directory] -i [input filename] -o + [output filename] + +Example: ./export_ipc_csv.sh -c 16 -d perf_2021_03_07 -i sgemm.result -o output.csv diff --git a/evaluation/scripts/build.sh b/evaluation/scripts/build.sh index 21b9f345..94a6a736 100755 --- a/evaluation/scripts/build.sh +++ b/evaluation/scripts/build.sh @@ -28,26 +28,15 @@ fi cd ${BUILD_DIR} -sources_file="./sources_${cores}c.txt" - -if [ ${perf} = 1 ]; then - if grep -Fxq '#+define+PERF_ENABLE' ${sources_file}; then - sed -i 's/+define+PERF_ENABLE/#+define+PERF_ENABLE/' ${sources_file} - elif ! 
grep -Fxq '+define+PERF_ENABLE' ${sources_file}; then - sed -i '1s/^/+define+PERF_ENABLE\n/' ${sources_file} - fi -else - if grep -v '^ *#' ${sources_file} | grep -Fxq '+define+SYNTHESIS'; then - sed -i 's/+define+PERF_ENABLE/#+define+PERF_ENABLE/' ${sources_file} - elif ! grep -Fxq '#+define+PERF_ENABLE' ${sources_file}; then - sed -i '1s/^/#+define+PERF_ENABLE\n/' ${sources_file} - fi -fi - if [ -d "./build_fpga_{$cores}c" ]; then make "clean-fpga-${cores}c" fi -make "fpga-${cores}c" + +if [ ${perf} = 1 ]; then + PERF=1 make "fpga-${cores}c" +else + make "fpga-${cores}c" +fi if [ ${wait} = 1 ]; then sleep 30 diff --git a/evaluation/scripts/export_csv.sh b/evaluation/scripts/export_csv.sh new file mode 100755 index 00000000..8f95a71b --- /dev/null +++ b/evaluation/scripts/export_csv.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +while getopts c:d:i:o:p: flag +do + case "${flag}" in + c) cores=${OPTARG};; #1, 2, 4, 8, 16 + d) dir=${OPTARG};; #directory name (e.g. perf_2021_03_07) + i) ifile=${OPTARG};; #input filename + o) ofile=${OPTARG};; #output filename + p) param=${OPTARG};; #parameter to be made into csv + esac +done + +if [[ ! 
"$cores" =~ ^(1|2|4|8|16)$ ]]; then + echo 'Invalid parameter for argument -c (1, 2, 4, 8, or 16 expected)' + exit 1 +fi + +if [ -z "$ifile" ]; then + echo 'No input filename given for argument -i' + exit 1 +fi + +if [ -z "$dir" ]; then + echo 'No directory given for argument -d' + exit 1 +fi + +printf 'cores,%s\n' "${param}" > "../${dir}/${ofile}" +for ((i=1; i<=$cores; i=i*2)); do + printf "${i}," >> "../${dir}/${ofile}" + (sed -n "s/${param}=\(.*\)/\1/p" < "../${dir}/${i}c/${ifile}") >> "../${dir}/${ofile}" +done diff --git a/evaluation/scripts/export_ipc_csv.sh b/evaluation/scripts/export_ipc_csv.sh new file mode 100755 index 00000000..f698525b --- /dev/null +++ b/evaluation/scripts/export_ipc_csv.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +while getopts c:d:i:o: flag +do + case "${flag}" in + c) cores=${OPTARG};; #1, 2, 4, 8, 16 + d) dir=${OPTARG};; #directory name (e.g. perf_2021_03_07) + i) ifile=${OPTARG};; #input filename + o) ofile=${OPTARG};; #output filename + esac +done + +if [[ ! "$cores" =~ ^(1|2|4|8|16)$ ]]; then + echo 'Invalid parameter for argument -c (1, 2, 4, 8, or 16 expected)' + exit 1 +fi + +if [ -z "$ifile" ]; then + echo 'No input filename given for argument -i' + exit 1 +fi + +if [ -z "$dir" ]; then + echo 'No directory given for argument -d' + exit 1 +fi + +printf "cores,IPC\n" > "../${dir}/${ofile}" +for ((i=1; i<=$cores; i=i*2)); do + printf "${i}," >> "../${dir}/${ofile}" + (sed -n "s/IPC=\(.*\)/\1/p" < "../${dir}/${i}c/${ifile}" | awk 'END {print $NF}') >> "../${dir}/${ofile}" +done diff --git a/hw/Makefile b/hw/Makefile index 635b4d15..425e58d9 100644 --- a/hw/Makefile +++ b/hw/Makefile @@ -1,9 +1,9 @@ .PHONY: build_config -build_config: - ./scripts/gen_config.py --outv ./rtl/VX_user_config.vh --outc ./VX_config.h +build_config: ./rtl/VX_config.vh + ./scripts/gen_config.py -i ./rtl/VX_config.vh -o ./VX_config.h $(MAKE) -C simulate clean: - rm -f ./rtl/VX_user_config.vh ./VX_config.h + rm -f ./VX_config.h $(MAKE) -C simulate clean \ No newline at 
end of file diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index ea9a766b..d9521b8e 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -9,20 +9,20 @@ module VX_cluster #( input wire clk, input wire reset, - // DRAM request - output wire dram_req_valid, - output wire dram_req_rw, - output wire [`L2DRAM_BYTEEN_WIDTH-1:0] dram_req_byteen, - output wire [`L2DRAM_ADDR_WIDTH-1:0] dram_req_addr, - output wire [`L2DRAM_LINE_WIDTH-1:0] dram_req_data, - output wire [`L2DRAM_TAG_WIDTH-1:0] dram_req_tag, - input wire dram_req_ready, + // Memory request + output wire mem_req_valid, + output wire mem_req_rw, + output wire [`L2MEM_BYTEEN_WIDTH-1:0] mem_req_byteen, + output wire [`L2MEM_ADDR_WIDTH-1:0] mem_req_addr, + output wire [`L2MEM_LINE_WIDTH-1:0] mem_req_data, + output wire [`L2MEM_TAG_WIDTH-1:0] mem_req_tag, + input wire mem_req_ready, - // DRAM response - input wire dram_rsp_valid, - input wire [`L2DRAM_LINE_WIDTH-1:0] dram_rsp_data, - input wire [`L2DRAM_TAG_WIDTH-1:0] dram_rsp_tag, - output wire dram_rsp_ready, + // Memory response + input wire mem_rsp_valid, + input wire [`L2MEM_LINE_WIDTH-1:0] mem_rsp_data, + input wire [`L2MEM_TAG_WIDTH-1:0] mem_rsp_tag, + output wire mem_rsp_ready, // CSR Request input wire csr_req_valid, @@ -42,31 +42,31 @@ module VX_cluster #( output wire ebreak ); - wire [`NUM_CORES-1:0] per_core_dram_req_valid; - wire [`NUM_CORES-1:0] per_core_dram_req_rw; - wire [`NUM_CORES-1:0][`DDRAM_BYTEEN_WIDTH-1:0] per_core_dram_req_byteen; - wire [`NUM_CORES-1:0][`DDRAM_ADDR_WIDTH-1:0] per_core_dram_req_addr; - wire [`NUM_CORES-1:0][`DDRAM_LINE_WIDTH-1:0] per_core_dram_req_data; - wire [`NUM_CORES-1:0][`XDRAM_TAG_WIDTH-1:0] per_core_dram_req_tag; - wire [`NUM_CORES-1:0] per_core_dram_req_ready; + wire [`NUM_CORES-1:0] per_core_mem_req_valid; + wire [`NUM_CORES-1:0] per_core_mem_req_rw; + wire [`NUM_CORES-1:0][`DMEM_BYTEEN_WIDTH-1:0] per_core_mem_req_byteen; + wire [`NUM_CORES-1:0][`DMEM_ADDR_WIDTH-1:0] per_core_mem_req_addr; + wire 
[`NUM_CORES-1:0][`DMEM_LINE_WIDTH-1:0] per_core_mem_req_data; + wire [`NUM_CORES-1:0][`XMEM_TAG_WIDTH-1:0] per_core_mem_req_tag; + wire [`NUM_CORES-1:0] per_core_mem_req_ready; - wire [`NUM_CORES-1:0] per_core_dram_rsp_valid; - wire [`NUM_CORES-1:0][`DDRAM_LINE_WIDTH-1:0] per_core_dram_rsp_data; - wire [`NUM_CORES-1:0][`XDRAM_TAG_WIDTH-1:0] per_core_dram_rsp_tag; - wire [`NUM_CORES-1:0] per_core_dram_rsp_ready; + wire [`NUM_CORES-1:0] per_core_mem_rsp_valid; + wire [`NUM_CORES-1:0][`DMEM_LINE_WIDTH-1:0] per_core_mem_rsp_data; + wire [`NUM_CORES-1:0][`XMEM_TAG_WIDTH-1:0] per_core_mem_rsp_tag; + wire [`NUM_CORES-1:0] per_core_mem_rsp_ready; - wire [`NUM_CORES-1:0] per_core_csr_req_valid; - wire [`NUM_CORES-1:0][11:0] per_core_csr_req_addr; - wire [`NUM_CORES-1:0] per_core_csr_req_rw; - wire [`NUM_CORES-1:0][31:0] per_core_csr_req_data; - wire [`NUM_CORES-1:0] per_core_csr_req_ready; + wire [`NUM_CORES-1:0] per_core_csr_req_valid; + wire [`NUM_CORES-1:0][11:0] per_core_csr_req_addr; + wire [`NUM_CORES-1:0] per_core_csr_req_rw; + wire [`NUM_CORES-1:0][31:0] per_core_csr_req_data; + wire [`NUM_CORES-1:0] per_core_csr_req_ready; - wire [`NUM_CORES-1:0] per_core_csr_rsp_valid; - wire [`NUM_CORES-1:0][31:0] per_core_csr_rsp_data; - wire [`NUM_CORES-1:0] per_core_csr_rsp_ready; + wire [`NUM_CORES-1:0] per_core_csr_rsp_valid; + wire [`NUM_CORES-1:0][31:0] per_core_csr_rsp_data; + wire [`NUM_CORES-1:0] per_core_csr_rsp_ready; - wire [`NUM_CORES-1:0] per_core_busy; - wire [`NUM_CORES-1:0] per_core_ebreak; + wire [`NUM_CORES-1:0] per_core_busy; + wire [`NUM_CORES-1:0] per_core_ebreak; for (genvar i = 0; i < `NUM_CORES; i++) begin @@ -87,18 +87,18 @@ module VX_cluster #( .clk (clk), .reset (core_reset), - .dram_req_valid (per_core_dram_req_valid[i]), - .dram_req_rw (per_core_dram_req_rw [i]), - .dram_req_byteen(per_core_dram_req_byteen[i]), - .dram_req_addr (per_core_dram_req_addr [i]), - .dram_req_data (per_core_dram_req_data [i]), - .dram_req_tag (per_core_dram_req_tag [i]), - 
.dram_req_ready (per_core_dram_req_ready[i]), + .mem_req_valid (per_core_mem_req_valid[i]), + .mem_req_rw (per_core_mem_req_rw [i]), + .mem_req_byteen (per_core_mem_req_byteen[i]), + .mem_req_addr (per_core_mem_req_addr [i]), + .mem_req_data (per_core_mem_req_data [i]), + .mem_req_tag (per_core_mem_req_tag [i]), + .mem_req_ready (per_core_mem_req_ready[i]), - .dram_rsp_valid (per_core_dram_rsp_valid[i]), - .dram_rsp_data (per_core_dram_rsp_data [i]), - .dram_rsp_tag (per_core_dram_rsp_tag [i]), - .dram_rsp_ready (per_core_dram_rsp_ready[i]), + .mem_rsp_valid (per_core_mem_rsp_valid[i]), + .mem_rsp_data (per_core_mem_rsp_data [i]), + .mem_rsp_tag (per_core_mem_rsp_tag [i]), + .mem_rsp_ready (per_core_mem_rsp_ready[i]), .csr_req_valid (per_core_csr_req_valid [i]), .csr_req_rw (per_core_csr_req_rw [i]), @@ -169,12 +169,12 @@ module VX_cluster #( .NUM_REQS (`NUM_CORES), .CREQ_SIZE (`L2CREQ_SIZE), .MSHR_SIZE (`L2MSHR_SIZE), - .DRSQ_SIZE (`L2DRSQ_SIZE), - .DREQ_SIZE (`L2DREQ_SIZE), + .MRSQ_SIZE (`L2MRSQ_SIZE), + .MREQ_SIZE (`L2MREQ_SIZE), .WRITE_ENABLE (1), - .CORE_TAG_WIDTH (`XDRAM_TAG_WIDTH), + .CORE_TAG_WIDTH (`XMEM_TAG_WIDTH), .CORE_TAG_ID_BITS (0), - .DRAM_TAG_WIDTH (`L2DRAM_TAG_WIDTH) + .MEM_TAG_WIDTH (`L2MEM_TAG_WIDTH) ) l2cache ( `SCOPE_BIND_VX_cluster_l2cache @@ -188,78 +188,78 @@ module VX_cluster #( `endif // Core request - .core_req_valid (per_core_dram_req_valid), - .core_req_rw (per_core_dram_req_rw), - .core_req_byteen (per_core_dram_req_byteen), - .core_req_addr (per_core_dram_req_addr), - .core_req_data (per_core_dram_req_data), - .core_req_tag (per_core_dram_req_tag), - .core_req_ready (per_core_dram_req_ready), + .core_req_valid (per_core_mem_req_valid), + .core_req_rw (per_core_mem_req_rw), + .core_req_byteen (per_core_mem_req_byteen), + .core_req_addr (per_core_mem_req_addr), + .core_req_data (per_core_mem_req_data), + .core_req_tag (per_core_mem_req_tag), + .core_req_ready (per_core_mem_req_ready), // Core response - .core_rsp_valid 
(per_core_dram_rsp_valid), - .core_rsp_data (per_core_dram_rsp_data), - .core_rsp_tag (per_core_dram_rsp_tag), - .core_rsp_ready (per_core_dram_rsp_ready), + .core_rsp_valid (per_core_mem_rsp_valid), + .core_rsp_data (per_core_mem_rsp_data), + .core_rsp_tag (per_core_mem_rsp_tag), + .core_rsp_ready (per_core_mem_rsp_ready), - // DRAM request - .dram_req_valid (dram_req_valid), - .dram_req_rw (dram_req_rw), - .dram_req_byteen (dram_req_byteen), - .dram_req_addr (dram_req_addr), - .dram_req_data (dram_req_data), - .dram_req_tag (dram_req_tag), - .dram_req_ready (dram_req_ready), + // Memory request + .mem_req_valid (mem_req_valid), + .mem_req_rw (mem_req_rw), + .mem_req_byteen (mem_req_byteen), + .mem_req_addr (mem_req_addr), + .mem_req_data (mem_req_data), + .mem_req_tag (mem_req_tag), + .mem_req_ready (mem_req_ready), - // DRAM response - .dram_rsp_valid (dram_rsp_valid), - .dram_rsp_tag (dram_rsp_tag), - .dram_rsp_data (dram_rsp_data), - .dram_rsp_ready (dram_rsp_ready) + // Memory response + .mem_rsp_valid (mem_rsp_valid), + .mem_rsp_tag (mem_rsp_tag), + .mem_rsp_data (mem_rsp_data), + .mem_rsp_ready (mem_rsp_ready) ); end else begin VX_mem_arb #( - .NUM_REQS (`NUM_CORES), - .DATA_WIDTH (`L2DRAM_LINE_WIDTH), - .TAG_IN_WIDTH (`XDRAM_TAG_WIDTH), - .TAG_OUT_WIDTH (`L2DRAM_TAG_WIDTH), - .BUFFERED_REQ (1), - .BUFFERED_RSP (1) - ) dram_arb ( + .NUM_REQS (`NUM_CORES), + .DATA_WIDTH (`L2MEM_LINE_WIDTH), + .TAG_IN_WIDTH (`XMEM_TAG_WIDTH), + .TAG_OUT_WIDTH (`L2MEM_TAG_WIDTH), + .BUFFERED_REQ (1), + .BUFFERED_RSP (1) + ) mem_arb ( .clk (clk), .reset (reset), // Core request - .req_valid_in (per_core_dram_req_valid), - .req_rw_in (per_core_dram_req_rw), - .req_byteen_in (per_core_dram_req_byteen), - .req_addr_in (per_core_dram_req_addr), - .req_data_in (per_core_dram_req_data), - .req_tag_in (per_core_dram_req_tag), - .req_ready_in (per_core_dram_req_ready), + .req_valid_in (per_core_mem_req_valid), + .req_rw_in (per_core_mem_req_rw), + .req_byteen_in 
(per_core_mem_req_byteen), + .req_addr_in (per_core_mem_req_addr), + .req_data_in (per_core_mem_req_data), + .req_tag_in (per_core_mem_req_tag), + .req_ready_in (per_core_mem_req_ready), - // DRAM request - .req_valid_out (dram_req_valid), - .req_rw_out (dram_req_rw), - .req_byteen_out (dram_req_byteen), - .req_addr_out (dram_req_addr), - .req_data_out (dram_req_data), - .req_tag_out (dram_req_tag), - .req_ready_out (dram_req_ready), + // Memory request + .req_valid_out (mem_req_valid), + .req_rw_out (mem_req_rw), + .req_byteen_out (mem_req_byteen), + .req_addr_out (mem_req_addr), + .req_data_out (mem_req_data), + .req_tag_out (mem_req_tag), + .req_ready_out (mem_req_ready), // Core response - .rsp_valid_out (per_core_dram_rsp_valid), - .rsp_data_out (per_core_dram_rsp_data), - .rsp_tag_out (per_core_dram_rsp_tag), - .rsp_ready_out (per_core_dram_rsp_ready), + .rsp_valid_out (per_core_mem_rsp_valid), + .rsp_data_out (per_core_mem_rsp_data), + .rsp_tag_out (per_core_mem_rsp_tag), + .rsp_ready_out (per_core_mem_rsp_ready), - // DRAM response - .rsp_valid_in (dram_rsp_valid), - .rsp_tag_in (dram_rsp_tag), - .rsp_data_in (dram_rsp_data), - .rsp_ready_in (dram_rsp_ready) + // Memory response + .rsp_valid_in (mem_rsp_valid), + .rsp_tag_in (mem_rsp_tag), + .rsp_data_in (mem_rsp_data), + .rsp_ready_in (mem_rsp_ready) ); end diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 87d2cc94..ce6a1c96 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -1,8 +1,6 @@ `ifndef VX_CONFIG `define VX_CONFIG -`include "VX_user_config.vh" - `ifndef NUM_CLUSTERS `define NUM_CLUSTERS 1 `endif @@ -35,8 +33,8 @@ `define SM_ENABLE 1 `endif -`ifndef GLOBAL_BLOCK_SIZE -`define GLOBAL_BLOCK_SIZE 64 +`ifndef MEM_BLOCK_SIZE +`define MEM_BLOCK_SIZE 64 `endif `ifndef L1_BLOCK_SIZE @@ -209,14 +207,14 @@ `define CSR_MPM_SMEM_BANK_ST 12'hB18 // bank conflicts stalls `define CSR_MPM_SMEM_BANK_ST_H 12'hB98 // PERF: memory -`define CSR_MPM_DRAM_READS 12'hB19 // dram reads -`define 
CSR_MPM_DRAM_READS_H 12'hB99 -`define CSR_MPM_DRAM_WRITES 12'hB1A // dram writes -`define CSR_MPM_DRAM_WRITES_H 12'hB9A -`define CSR_MPM_DRAM_ST 12'hB1B // dram request stalls -`define CSR_MPM_DRAM_ST_H 12'hB9B -`define CSR_MPM_DRAM_LAT 12'hB1C // dram latency (total) -`define CSR_MPM_DRAM_LAT_H 12'hB9C +`define CSR_MPM_MEM_READS 12'hB19 // memory reads +`define CSR_MPM_MEM_READS_H 12'hB99 +`define CSR_MPM_MEM_WRITES 12'hB1A // memory writes +`define CSR_MPM_MEM_WRITES_H 12'hB9A +`define CSR_MPM_MEM_ST 12'hB1B // memory request stalls +`define CSR_MPM_MEM_ST_H 12'hB9B +`define CSR_MPM_MEM_LAT 12'hB1C // memory latency (total) +`define CSR_MPM_MEM_LAT_H 12'hB9C // Machine Information Registers `define CSR_MVENDORID 12'hF11 @@ -281,14 +279,14 @@ `define IMSHR_SIZE `NUM_WARPS `endif -// DRAM Request Queue Size -`ifndef IDREQ_SIZE -`define IDREQ_SIZE 4 +// Memory Request Queue Size +`ifndef IMREQ_SIZE +`define IMREQ_SIZE 4 `endif -// DRAM Response Queue Size -`ifndef IDRSQ_SIZE -`define IDRSQ_SIZE 4 +// Memory Response Queue Size +`ifndef IMRSQ_SIZE +`define IMRSQ_SIZE 4 `endif // Dcache Configurable Knobs ////////////////////////////////////////////////// @@ -318,14 +316,14 @@ `define DMSHR_SIZE `LSUQ_SIZE `endif -// DRAM Request Queue Size -`ifndef DDREQ_SIZE -`define DDREQ_SIZE 4 +// Memory Request Queue Size +`ifndef DMREQ_SIZE +`define DMREQ_SIZE 4 `endif -// DRAM Response Queue Size -`ifndef DDRSQ_SIZE -`define DDRSQ_SIZE `MAX(4, (`DNUM_BANKS * 2)) +// Memory Response Queue Size +`ifndef DMRSQ_SIZE +`define DMRSQ_SIZE `MAX(4, (`DNUM_BANKS * 2)) `endif // SM Configurable Knobs ////////////////////////////////////////////////////// @@ -372,14 +370,14 @@ `define L2MSHR_SIZE 16 `endif -// DRAM Request Queue Size -`ifndef L2DREQ_SIZE -`define L2DREQ_SIZE 4 +// L2 Request Queue Size +`ifndef L2MREQ_SIZE +`define L2MREQ_SIZE 4 `endif -// DRAM Response Queue Size -`ifndef L2DRSQ_SIZE -`define L2DRSQ_SIZE `MAX(4, (`L2NUM_BANKS * 2)) +// L2 Response Queue Size +`ifndef 
L2MRSQ_SIZE +`define L2MRSQ_SIZE `MAX(4, (`L2NUM_BANKS * 2)) `endif // L3cache Configurable Knobs ///////////////////////////////////////////////// @@ -404,14 +402,14 @@ `define L3MSHR_SIZE 16 `endif -// DRAM Request Queue Size -`ifndef L3DREQ_SIZE -`define L3DREQ_SIZE 4 +// L3 Request Queue Size +`ifndef L3MREQ_SIZE +`define L3MREQ_SIZE 4 `endif -// DRAM Response Queue Size -`ifndef L3DRSQ_SIZE -`define L3DRSQ_SIZE `MAX(4, (`L3NUM_BANKS * 2)) +// L3 Response Queue Size +`ifndef L3MRSQ_SIZE +`define L3MRSQ_SIZE `MAX(4, (`L3NUM_BANKS * 2)) `endif `endif diff --git a/hw/rtl/VX_core.v b/hw/rtl/VX_core.v index 50aad784..47ac1377 100644 --- a/hw/rtl/VX_core.v +++ b/hw/rtl/VX_core.v @@ -9,20 +9,20 @@ module VX_core #( input wire clk, input wire reset, - // DRAM request - output wire dram_req_valid, - output wire dram_req_rw, - output wire [`DDRAM_BYTEEN_WIDTH-1:0] dram_req_byteen, - output wire [`DDRAM_ADDR_WIDTH-1:0] dram_req_addr, - output wire [`DDRAM_LINE_WIDTH-1:0] dram_req_data, - output wire [`XDRAM_TAG_WIDTH-1:0] dram_req_tag, - input wire dram_req_ready, + // Memory request + output wire mem_req_valid, + output wire mem_req_rw, + output wire [`DMEM_BYTEEN_WIDTH-1:0] mem_req_byteen, + output wire [`DMEM_ADDR_WIDTH-1:0] mem_req_addr, + output wire [`DMEM_LINE_WIDTH-1:0] mem_req_data, + output wire [`XMEM_TAG_WIDTH-1:0] mem_req_tag, + input wire mem_req_ready, - // DRAM reponse - input wire dram_rsp_valid, - input wire [`DDRAM_LINE_WIDTH-1:0] dram_rsp_data, - input wire [`XDRAM_TAG_WIDTH-1:0] dram_rsp_tag, - output wire dram_rsp_ready, + // Memory response + input wire mem_rsp_valid, + input wire [`DMEM_LINE_WIDTH-1:0] mem_rsp_data, + input wire [`XMEM_TAG_WIDTH-1:0] mem_rsp_tag, + output wire mem_rsp_ready, // CSR request input wire csr_req_valid, @@ -44,29 +44,29 @@ module VX_core #( VX_perf_memsys_if perf_memsys_if(); `endif - VX_cache_dram_req_if #( - .DRAM_LINE_WIDTH(`DDRAM_LINE_WIDTH), - .DRAM_ADDR_WIDTH(`DDRAM_ADDR_WIDTH), - .DRAM_TAG_WIDTH(`XDRAM_TAG_WIDTH)
- ) dram_req_if(); + VX_cache_mem_req_if #( + .MEM_LINE_WIDTH(`DMEM_LINE_WIDTH), + .MEM_ADDR_WIDTH(`DMEM_ADDR_WIDTH), + .MEM_TAG_WIDTH(`XMEM_TAG_WIDTH) + ) mem_req_if(); - VX_cache_dram_rsp_if #( - .DRAM_LINE_WIDTH(`DDRAM_LINE_WIDTH), - .DRAM_TAG_WIDTH(`XDRAM_TAG_WIDTH) - ) dram_rsp_if(); + VX_cache_mem_rsp_if #( + .MEM_LINE_WIDTH(`DMEM_LINE_WIDTH), + .MEM_TAG_WIDTH(`XMEM_TAG_WIDTH) + ) mem_rsp_if(); - assign dram_req_valid = dram_req_if.valid; - assign dram_req_rw = dram_req_if.rw; - assign dram_req_byteen= dram_req_if.byteen; - assign dram_req_addr = dram_req_if.addr; - assign dram_req_data = dram_req_if.data; - assign dram_req_tag = dram_req_if.tag; - assign dram_req_if.ready = dram_req_ready; + assign mem_req_valid = mem_req_if.valid; + assign mem_req_rw = mem_req_if.rw; + assign mem_req_byteen= mem_req_if.byteen; + assign mem_req_addr = mem_req_if.addr; + assign mem_req_data = mem_req_if.data; + assign mem_req_tag = mem_req_if.tag; + assign mem_req_if.ready = mem_req_ready; - assign dram_rsp_if.valid = dram_rsp_valid; - assign dram_rsp_if.data = dram_rsp_data; - assign dram_rsp_if.tag = dram_rsp_tag; - assign dram_rsp_ready = dram_rsp_if.ready; + assign mem_rsp_if.valid = mem_rsp_valid; + assign mem_rsp_if.data = mem_rsp_data; + assign mem_rsp_if.tag = mem_rsp_tag; + assign mem_rsp_ready = mem_rsp_if.ready; //-- @@ -168,9 +168,9 @@ module VX_core #( .icache_core_req_if (icache_core_req_if), .icache_core_rsp_if (icache_core_rsp_if), - // DRAM - .dram_req_if (dram_req_if), - .dram_rsp_if (dram_rsp_if) + // Memory + .mem_req_if (mem_req_if), + .mem_rsp_if (mem_rsp_if) ); endmodule diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index dc9dc7e8..d0007c7a 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -123,61 +123,61 @@ module VX_csr_data #( `ifdef PERF_ENABLE // PERF: pipeline `CSR_MPM_IBUF_ST : read_data_r = perf_pipeline_if.ibf_stalls[31:0]; - `CSR_MPM_IBUF_ST_H : read_data_r = 32'(perf_pipeline_if.ibf_stalls[43:32]); + 
`CSR_MPM_IBUF_ST_H : read_data_r = 32'(perf_pipeline_if.ibf_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_SCRB_ST : read_data_r = perf_pipeline_if.scb_stalls[31:0]; - `CSR_MPM_SCRB_ST_H : read_data_r = 32'(perf_pipeline_if.scb_stalls[43:32]); + `CSR_MPM_SCRB_ST_H : read_data_r = 32'(perf_pipeline_if.scb_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_ALU_ST : read_data_r = perf_pipeline_if.alu_stalls[31:0]; - `CSR_MPM_ALU_ST_H : read_data_r = 32'(perf_pipeline_if.alu_stalls[43:32]); + `CSR_MPM_ALU_ST_H : read_data_r = 32'(perf_pipeline_if.alu_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_LSU_ST : read_data_r = perf_pipeline_if.lsu_stalls[31:0]; - `CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[43:32]); + `CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0]; - `CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[43:32]); + `CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0]; - `CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[43:32]); + `CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0]; - `CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[43:32]); + `CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]); // PERF: icache `CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0]; - `CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[43:32]); + `CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0]; - `CSR_MPM_ICACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.icache_read_misses[43:32]); + `CSR_MPM_ICACHE_MISS_R_H : read_data_r = 
32'(perf_memsys_if.icache_read_misses[`PERF_CTR_BITS-1:32]); `CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0]; - `CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[43:32]); + `CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0]; - `CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[43:32]); + `CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[`PERF_CTR_BITS-1:32]); // PERF: dcache `CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0]; - `CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[43:32]); + `CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0]; - `CSR_MPM_DCACHE_WRITES_H : read_data_r = 32'(perf_memsys_if.dcache_writes[43:32]); + `CSR_MPM_DCACHE_WRITES_H : read_data_r = 32'(perf_memsys_if.dcache_writes[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_MISS_R : read_data_r = perf_memsys_if.dcache_read_misses[31:0]; - `CSR_MPM_DCACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.dcache_read_misses[43:32]); + `CSR_MPM_DCACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.dcache_read_misses[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_MISS_W : read_data_r = perf_memsys_if.dcache_write_misses[31:0]; - `CSR_MPM_DCACHE_MISS_W_H : read_data_r = 32'(perf_memsys_if.dcache_write_misses[43:32]); + `CSR_MPM_DCACHE_MISS_W_H : read_data_r = 32'(perf_memsys_if.dcache_write_misses[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_BANK_ST : read_data_r = perf_memsys_if.dcache_bank_stalls[31:0]; - `CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[43:32]); + `CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_MSHR_ST : 
read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0]; - `CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[43:32]); + `CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0]; - `CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[43:32]); + `CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0]; - `CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[43:32]); + `CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[`PERF_CTR_BITS-1:32]); // PERF: smem `CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0]; - `CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[43:32]); + `CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0]; - `CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[43:32]); + `CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]); `CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0]; - `CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[43:32]); - // PERF: DRAM - `CSR_MPM_DRAM_READS : read_data_r = perf_memsys_if.dram_reads[31:0]; - `CSR_MPM_DRAM_READS_H : read_data_r = 32'(perf_memsys_if.dram_reads[43:32]); - `CSR_MPM_DRAM_WRITES : read_data_r = perf_memsys_if.dram_writes[31:0]; - `CSR_MPM_DRAM_WRITES_H : read_data_r = 32'(perf_memsys_if.dram_writes[43:32]); - `CSR_MPM_DRAM_ST : read_data_r = perf_memsys_if.dram_stalls[31:0]; - `CSR_MPM_DRAM_ST_H : read_data_r = 32'(perf_memsys_if.dram_stalls[43:32]); - `CSR_MPM_DRAM_LAT : read_data_r = 
perf_memsys_if.dram_latency[31:0]; - `CSR_MPM_DRAM_LAT_H : read_data_r = 32'(perf_memsys_if.dram_latency[43:32]); + `CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]); + // PERF: MEM + `CSR_MPM_MEM_READS : read_data_r = perf_memsys_if.mem_reads[31:0]; + `CSR_MPM_MEM_READS_H : read_data_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]); + `CSR_MPM_MEM_WRITES : read_data_r = perf_memsys_if.mem_writes[31:0]; + `CSR_MPM_MEM_WRITES_H : read_data_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]); + `CSR_MPM_MEM_ST : read_data_r = perf_memsys_if.mem_stalls[31:0]; + `CSR_MPM_MEM_ST_H : read_data_r = 32'(perf_memsys_if.mem_stalls[`PERF_CTR_BITS-1:32]); + `CSR_MPM_MEM_LAT : read_data_r = perf_memsys_if.mem_latency[31:0]; + `CSR_MPM_MEM_LAT_H : read_data_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]); `endif `CSR_SATP : read_data_r = 32'(csr_satp); @@ -195,9 +195,9 @@ module VX_csr_data #( `CSR_PMPADDR0 : read_data_r = 32'(csr_pmpaddr[0]); `CSR_CYCLE : read_data_r = csr_cycle[31:0]; - `CSR_CYCLE_H : read_data_r = 32'(csr_cycle[43:32]); + `CSR_CYCLE_H : read_data_r = 32'(csr_cycle[`PERF_CTR_BITS-1:32]); `CSR_INSTRET : read_data_r = csr_instret[31:0]; - `CSR_INSTRET_H : read_data_r = 32'(csr_instret[43:32]); + `CSR_INSTRET_H : read_data_r = 32'(csr_instret[`PERF_CTR_BITS-1:32]); `CSR_MVENDORID : read_data_r = `VENDOR_ID; `CSR_MARCHID : read_data_r = `ARCHITECTURE_ID; diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index a8043923..149d8ad3 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -30,6 +30,8 @@ `define CSR_WIDTH 12 +`define PERF_CTR_BITS 44 + /////////////////////////////////////////////////////////////////////////////// `define INST_LUI 7'b0110111 @@ -244,7 +246,7 @@ `define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0) // Block size in bytes -`define ICACHE_LINE_SIZE (`L2_ENABLE ? 
`L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE) +`define ICACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `MEM_BLOCK_SIZE) // Word size in bytes `define IWORD_SIZE 4 @@ -264,11 +266,11 @@ // Core request tag bits `define ICORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICORE_TAG_ID_BITS) -// DRAM request data bits -`define IDRAM_LINE_WIDTH (`ICACHE_LINE_SIZE * 8) +// Memory request data bits +`define IMEM_LINE_WIDTH (`ICACHE_LINE_SIZE * 8) -// DRAM byte enable bits -`define IDRAM_BYTEEN_WIDTH `ICACHE_LINE_SIZE +// Memory byte enable bits +`define IMEM_BYTEEN_WIDTH `ICACHE_LINE_SIZE ////////////////////////// Dcache Configurable Knobs ////////////////////////// @@ -276,7 +278,7 @@ `define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1) // Block size in bytes -`define DCACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE) +`define DCACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `MEM_BLOCK_SIZE) // Word size in bytes `define DWORD_SIZE 4 @@ -299,14 +301,14 @@ -// DRAM request data bits -`define DDRAM_LINE_WIDTH (`DCACHE_LINE_SIZE * 8) +// Memory request data bits +`define DMEM_LINE_WIDTH (`DCACHE_LINE_SIZE * 8) -// DRAM request address bits -`define DDRAM_ADDR_WIDTH (32 - `CLOG2(`DCACHE_LINE_SIZE)) +// Memory request address bits +`define DMEM_ADDR_WIDTH (32 - `CLOG2(`DCACHE_LINE_SIZE)) -// DRAM byte enable bits -`define DDRAM_BYTEEN_WIDTH `DCACHE_LINE_SIZE +// Memory byte enable bits +`define DMEM_BYTEEN_WIDTH `DCACHE_LINE_SIZE -// DRAM request tag bits -`define DDRAM_TAG_WIDTH `DDRAM_ADDR_WIDTH +// Memory request tag bits +`define DMEM_TAG_WIDTH `DMEM_ADDR_WIDTH // Core request size `define DNUM_REQUESTS `NUM_THREADS @@ -334,7 +336,7 @@ `define L2CACHE_ID (32'(`L3_ENABLE) + CLUSTER_ID) // Block size in bytes -`define L2CACHE_LINE_SIZE `GLOBAL_BLOCK_SIZE +`define L2CACHE_LINE_SIZE `MEM_BLOCK_SIZE // Word size in bytes `define L2WORD_SIZE `DCACHE_LINE_SIZE @@ -342,17 +344,17 @@ // Core request tag bits `define L2CORE_TAG_WIDTH (`DCORE_TAG_WIDTH + `CLOG2(`NUM_CORES)) -// DRAM request data bits -`define L2DRAM_LINE_WIDTH
(`L2CACHE_LINE_SIZE * 8) +// Memory request data bits +`define L2MEM_LINE_WIDTH (`L2CACHE_LINE_SIZE * 8) -// DRAM request address bits -`define L2DRAM_ADDR_WIDTH (32 - `CLOG2(`L2CACHE_LINE_SIZE)) +// Memory request address bits +`define L2MEM_ADDR_WIDTH (32 - `CLOG2(`L2CACHE_LINE_SIZE)) -// DRAM byte enable bits -`define L2DRAM_BYTEEN_WIDTH `L2CACHE_LINE_SIZE +// Memory byte enable bits +`define L2MEM_BYTEEN_WIDTH `L2CACHE_LINE_SIZE -// DRAM request tag bits -`define L2DRAM_TAG_WIDTH (`L2_ENABLE ? `L2DRAM_ADDR_WIDTH : (`XDRAM_TAG_WIDTH+`CLOG2(`NUM_CORES))) +// Memory request tag bits +`define L2MEM_TAG_WIDTH (`L2_ENABLE ? `L2MEM_ADDR_WIDTH : (`XMEM_TAG_WIDTH+`CLOG2(`NUM_CORES))) ////////////////////////// L3cache Configurable Knobs ///////////////////////// @@ -360,7 +362,7 @@ `define L3CACHE_ID 0 // Block size in bytes -`define L3CACHE_LINE_SIZE `GLOBAL_BLOCK_SIZE +`define L3CACHE_LINE_SIZE `MEM_BLOCK_SIZE // Word size in bytes `define L3WORD_SIZE `L2CACHE_LINE_SIZE @@ -368,30 +370,30 @@ // Core request tag bits `define L3CORE_TAG_WIDTH (`L2CORE_TAG_WIDTH + `CLOG2(`NUM_CLUSTERS)) -// DRAM request data bits -`define L3DRAM_LINE_WIDTH (`L3CACHE_LINE_SIZE * 8) +// Memory request data bits +`define L3MEM_LINE_WIDTH (`L3CACHE_LINE_SIZE * 8) -// DRAM request address bits -`define L3DRAM_ADDR_WIDTH (32 - `CLOG2(`L3CACHE_LINE_SIZE)) +// Memory request address bits +`define L3MEM_ADDR_WIDTH (32 - `CLOG2(`L3CACHE_LINE_SIZE)) -// DRAM byte enable bits -`define L3DRAM_BYTEEN_WIDTH `L3CACHE_LINE_SIZE +// Memory byte enable bits +`define L3MEM_BYTEEN_WIDTH `L3CACHE_LINE_SIZE -// DRAM request tag bits -`define L3DRAM_TAG_WIDTH (`L3_ENABLE ? `L3DRAM_ADDR_WIDTH : (`L2DRAM_TAG_WIDTH+`CLOG2(`NUM_CLUSTERS))) +// Memory request tag bits +`define L3MEM_TAG_WIDTH (`L3_ENABLE ? 
`L3MEM_ADDR_WIDTH : (`L2MEM_TAG_WIDTH+`CLOG2(`NUM_CLUSTERS))) /////////////////////////////////////////////////////////////////////////////// -`define VX_DRAM_BYTEEN_WIDTH `L3DRAM_BYTEEN_WIDTH -`define VX_DRAM_ADDR_WIDTH `L3DRAM_ADDR_WIDTH -`define VX_DRAM_LINE_WIDTH `L3DRAM_LINE_WIDTH -`define VX_DRAM_TAG_WIDTH `L3DRAM_TAG_WIDTH +`define VX_MEM_BYTEEN_WIDTH `L3MEM_BYTEEN_WIDTH +`define VX_MEM_ADDR_WIDTH `L3MEM_ADDR_WIDTH +`define VX_MEM_LINE_WIDTH `L3MEM_LINE_WIDTH +`define VX_MEM_TAG_WIDTH `L3MEM_TAG_WIDTH `define VX_CORE_TAG_WIDTH `L3CORE_TAG_WIDTH `define VX_CSR_ID_WIDTH `LOG2UP(`NUM_CLUSTERS * `NUM_CORES) `define TO_FULL_ADDR(x) {x, (32-$bits(x))'(0)} -`define XDRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH+`CLOG2(2)) +`define XMEM_TAG_WIDTH (`DMEM_TAG_WIDTH+`CLOG2(2)) /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index 9de55ab2..8a7bb4aa 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -7,7 +7,6 @@ module VX_ibuffer #( input wire reset, // inputs - input wire freeze, // keep current warp VX_decode_if ibuf_enq_if, // outputs @@ -117,18 +116,9 @@ module VX_ibuffer #( deq_valid_n = 0; deq_wid_n = 'x; deq_instr_n = 'x; - schedule_table_n = 'x; - - if ((0 == num_warps) - || (1 == num_warps && deq_fire && q_alm_empty[deq_wid])) begin - deq_valid_n = enq_fire; - deq_wid_n = ibuf_enq_if.wid; - deq_instr_n = q_data_in; - end else if ((1 == num_warps) || freeze) begin - deq_valid_n = 1; - deq_wid_n = deq_wid; - deq_instr_n = deq_fire ? q_data_prev[deq_wid] : q_data_out[deq_wid]; - end else begin + schedule_table_n = 'x; + + if (num_warps > 1) begin deq_valid_n = (| schedule_table); schedule_table_n = schedule_table; for (integer i = 0; i < `NUM_WARPS; i++) begin @@ -139,6 +129,14 @@ module VX_ibuffer #( break; end end + end else if (1 == num_warps && !(deq_fire && q_alm_empty[deq_wid])) begin + deq_valid_n = 1; + deq_wid_n = deq_wid; + deq_instr_n = deq_fire ? 
q_data_prev[deq_wid] : q_data_out[deq_wid]; + end else begin + deq_valid_n = enq_fire; + deq_wid_n = ibuf_enq_if.wid; + deq_instr_n = q_data_in; end end diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 5f50e106..f1d6a09b 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -33,7 +33,6 @@ module VX_issue #( ) ibuffer ( .clk (clk), .reset (reset), - .freeze (1'b0), .ibuf_enq_if (decode_if), .ibuf_deq_if (ibuf_deq_if) ); @@ -121,14 +120,14 @@ module VX_issue #( `SCOPE_ASSIGN (writeback_eop, writeback_if.eop); `ifdef PERF_ENABLE - reg [43:0] perf_ibf_stalls; - reg [43:0] perf_scb_stalls; - reg [43:0] perf_alu_stalls; - reg [43:0] perf_lsu_stalls; - reg [43:0] perf_csr_stalls; - reg [43:0] perf_gpu_stalls; + reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls; + reg [`PERF_CTR_BITS-1:0] perf_scb_stalls; + reg [`PERF_CTR_BITS-1:0] perf_alu_stalls; + reg [`PERF_CTR_BITS-1:0] perf_lsu_stalls; + reg [`PERF_CTR_BITS-1:0] perf_csr_stalls; + reg [`PERF_CTR_BITS-1:0] perf_gpu_stalls; `ifdef EXT_F_ENABLE - reg [43:0] perf_fpu_stalls; + reg [`PERF_CTR_BITS-1:0] perf_fpu_stalls; `endif always @(posedge clk) begin @@ -144,26 +143,26 @@ module VX_issue #( `endif end else begin if (decode_if.valid & !decode_if.ready) begin - perf_ibf_stalls <= perf_ibf_stalls + 44'd1; + perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1; end if (ibuf_deq_if.valid & scoreboard_delay) begin - perf_scb_stalls <= perf_scb_stalls + 44'd1; + perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1; end if (alu_req_if.valid & !alu_req_if.ready) begin - perf_alu_stalls <= perf_alu_stalls + 44'd1; + perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1; end if (lsu_req_if.valid & !lsu_req_if.ready) begin - perf_lsu_stalls <= perf_lsu_stalls + 44'd1; + perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1; end if (csr_req_if.valid & !csr_req_if.ready) begin - perf_csr_stalls <= perf_csr_stalls + 44'd1; + perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1; end if (gpu_req_if.valid & !gpu_req_if.ready) 
begin - perf_gpu_stalls <= perf_gpu_stalls + 44'd1; + perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1; end `ifdef EXT_F_ENABLE if (fpu_req_if.valid & !fpu_req_if.ready) begin - perf_fpu_stalls <= perf_fpu_stalls + 44'd1; + perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1; end `endif end diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index bf5f309c..594d58a7 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -44,10 +44,6 @@ module VX_lsu_unit #( end wire is_dup_load = lsu_req_if.wb && lsu_req_if.tmask[0] && (& addr_matches); -`IGNORE_WARNINGS_BEGIN - reg [`LSUQ_SIZE-1:0][`LSUQ_ADDR_BITS-1:0] pending_tags; -`IGNORE_WARNINGS_END - wire ready_in; wire stall_in = ~ready_in && req_valid; @@ -79,7 +75,7 @@ module VX_lsu_unit #( wire [`NUM_THREADS-1:0] rsp_tmask; reg [`NUM_THREADS-1:0] req_sent_mask; - wire sent_all_ready; + wire req_ready_all; wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr; wire mbuf_full; @@ -118,13 +114,7 @@ module VX_lsu_unit #( `UNUSED_PIN (empty) ); - always @(posedge clk) begin - if (mbuf_push) begin - pending_tags[mbuf_waddr] <= req_tag; - end - end - - assign sent_all_ready = &(dcache_req_if.ready | req_sent_mask); + assign req_ready_all = &(dcache_req_if.ready | req_sent_mask | ~req_tmask); wire [`NUM_THREADS-1:0] req_sent_dup = {{(`NUM_THREADS-1){dcache_req_fire[0] && req_is_dup}}, 1'b0}; @@ -132,19 +122,22 @@ module VX_lsu_unit #( if (reset) begin req_sent_mask <= 0; end else begin - if (sent_all_ready) + if (req_ready_all) req_sent_mask <= 0; else req_sent_mask <= req_sent_mask | dcache_req_fire | req_sent_dup; end end + wire is_req_start = (0 == req_sent_mask); + // need to hold the acquired tag index until the full request is submitted - reg [`LSUQ_ADDR_BITS-1:0] req_tag_hold; - wire [`LSUQ_ADDR_BITS-1:0] req_tag = (0 == req_sent_mask) ? mbuf_waddr : req_tag_hold; + reg [`DCORE_TAG_ID_BITS-1:0] req_tag_hold; + wire [`DCORE_TAG_ID_BITS-1:0] req_tag = is_req_start ? 
mbuf_waddr : req_tag_hold; always @(posedge clk) begin - if (mbuf_push) + if (mbuf_push) begin req_tag_hold <= mbuf_waddr; + end end wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1}; @@ -160,7 +153,8 @@ module VX_lsu_unit #( end end - wire req_ready_dep = (req_wb && ~mbuf_full) + // ensure all dependencies for the requests are resolved + wire req_dep_ready = (req_wb && (~mbuf_full || ~is_req_start)) || (~req_wb && st_commit_if.ready); // DCache Request @@ -193,7 +187,7 @@ module VX_lsu_unit #( end end - assign dcache_req_if.valid = {`NUM_THREADS{req_valid && req_ready_dep}} & req_tmask_dup & ~req_sent_mask; + assign dcache_req_if.valid = {`NUM_THREADS{req_valid && req_dep_ready}} & req_tmask_dup & ~req_sent_mask; assign dcache_req_if.rw = {`NUM_THREADS{~req_wb}}; assign dcache_req_if.addr = mem_req_addr; assign dcache_req_if.byteen = mem_req_byteen; @@ -205,11 +199,11 @@ module VX_lsu_unit #( assign dcache_req_if.tag = {`NUM_THREADS{req_tag}}; `endif - assign ready_in = req_ready_dep && sent_all_ready; + assign ready_in = req_dep_ready && req_ready_all; // send store commit - wire is_store_rsp = req_valid && ~req_wb && sent_all_ready; + wire is_store_rsp = req_valid && ~req_wb && req_ready_all; assign st_commit_if.valid = is_store_rsp; assign st_commit_if.wid = req_wid; @@ -280,23 +274,46 @@ module VX_lsu_unit #( `SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr); `ifdef DBG_PRINT_CORE_DCACHE +`IGNORE_WARNINGS_BEGIN + reg [`LSUQ_SIZE-1:0][`DCORE_TAG_WIDTH:0] pending_reqs; +`IGNORE_WARNINGS_END + + always @(posedge clk) begin + if (reset) begin + pending_reqs <= '0; + end else if (mbuf_push) begin + pending_reqs[mbuf_waddr] <= {dcache_req_if.tag[0], 1'b1}; + end else if (mbuf_pop) begin + pending_reqs[mbuf_raddr] <= '0; + end + end + always @(posedge clk) begin if ((| dcache_req_fire)) begin - if ((| dcache_req_if.rw)) - $display("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, data=%0h", - $time, 
CORE_ID, req_wid, req_pc, dcache_req_fire, req_addr, dcache_req_if.tag, dcache_req_if.byteen, dcache_req_if.data); - else - $display("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, rd=%0d, is_dup=%b", - $time, CORE_ID, req_wid, req_pc, dcache_req_fire, req_addr, dcache_req_if.tag, dcache_req_if.byteen, req_rd, req_is_dup); + if (dcache_req_if.rw[0]) begin + $write("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire); + `PRINT_ARRAY1D(req_addr, `NUM_THREADS); + $write(", tag=%0h, byteen=%0h, data=", dcache_req_if.tag[0], dcache_req_if.byteen); + `PRINT_ARRAY1D(dcache_req_if.data, `NUM_THREADS); + $write("\n"); + end else begin + $write("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire); + `PRINT_ARRAY1D(req_addr, `NUM_THREADS); + $write(", tag=%0h, byteen=%0h, rd=%0d, is_dup=%b\n", dcache_req_if.tag[0], dcache_req_if.byteen, req_rd, req_is_dup); + end end if (dcache_rsp_fire) begin - $display("%t: D$%0d Rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=%0h, is_dup=%b", - $time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_pc, dcache_rsp_if.tag, rsp_rd, dcache_rsp_if.data, rsp_is_dup); + $write("%t: D$%0d Rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=", + $time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_pc, dcache_rsp_if.tag, rsp_rd); + `PRINT_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); + $write(", is_dup=%b\n", rsp_is_dup); end if (mbuf_full) begin - $write("%t: D$%0d queue-full:", $time, CORE_ID); + $write("%t: *** D$%0d queue-full:", $time, CORE_ID); for (integer j = 0; j < `LSUQ_SIZE; j++) begin - $write(" tag%0d=%0h", j, pending_tags[j]); + if (pending_reqs[j][0]) begin + $write(" %0d->%0h", j, pending_reqs[j][1 +: `DCORE_TAG_WIDTH]); + end end $write("\n"); end diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 8b5c3f2d..7fa51c82 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -20,25 +20,25 @@ 
module VX_mem_unit # ( VX_icache_core_req_if icache_core_req_if, VX_icache_core_rsp_if icache_core_rsp_if, - // DRAM - VX_cache_dram_req_if dram_req_if, - VX_cache_dram_rsp_if dram_rsp_if + // Memory + VX_cache_mem_req_if mem_req_if, + VX_cache_mem_rsp_if mem_rsp_if ); `ifdef PERF_ENABLE VX_perf_cache_if perf_icache_if(), perf_dcache_if(), perf_smem_if(); `endif - VX_cache_dram_req_if #( - .DRAM_LINE_WIDTH (`DDRAM_LINE_WIDTH), - .DRAM_ADDR_WIDTH (`DDRAM_ADDR_WIDTH), - .DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH) - ) dcache_dram_req_if(), icache_dram_req_if(); + VX_cache_mem_req_if #( + .MEM_LINE_WIDTH (`DMEM_LINE_WIDTH), + .MEM_ADDR_WIDTH (`DMEM_ADDR_WIDTH), + .MEM_TAG_WIDTH (`DMEM_TAG_WIDTH) + ) dcache_mem_req_if(), icache_mem_req_if(); - VX_cache_dram_rsp_if #( - .DRAM_LINE_WIDTH (`DDRAM_LINE_WIDTH), - .DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH) - ) dcache_dram_rsp_if(), icache_dram_rsp_if(); + VX_cache_mem_rsp_if #( + .MEM_LINE_WIDTH (`DMEM_LINE_WIDTH), + .MEM_TAG_WIDTH (`DMEM_TAG_WIDTH) + ) dcache_mem_rsp_if(), icache_mem_rsp_if(); VX_dcache_core_req_if #( .LANES (`DNUM_REQUESTS), @@ -96,12 +96,12 @@ module VX_mem_unit # ( .NUM_REQS (1), .CREQ_SIZE (`ICREQ_SIZE), .MSHR_SIZE (`IMSHR_SIZE), - .DRSQ_SIZE (`IDRSQ_SIZE), - .DREQ_SIZE (`IDREQ_SIZE), + .MRSQ_SIZE (`IMRSQ_SIZE), + .MREQ_SIZE (`IMREQ_SIZE), .WRITE_ENABLE (0), .CORE_TAG_WIDTH (`ICORE_TAG_WIDTH), .CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS), - .DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH) + .MEM_TAG_WIDTH (`DMEM_TAG_WIDTH) ) icache ( `SCOPE_BIND_VX_mem_unit_icache @@ -129,20 +129,20 @@ module VX_mem_unit # ( .perf_cache_if (perf_icache_if), `endif - // DRAM Req - .dram_req_valid (icache_dram_req_if.valid), - .dram_req_rw (icache_dram_req_if.rw), - .dram_req_byteen (icache_dram_req_if.byteen), - .dram_req_addr (icache_dram_req_if.addr), - .dram_req_data (icache_dram_req_if.data), - .dram_req_tag (icache_dram_req_if.tag), - .dram_req_ready (icache_dram_req_if.ready), + // Memory Request + .mem_req_valid (icache_mem_req_if.valid), + 
.mem_req_rw (icache_mem_req_if.rw), + .mem_req_byteen (icache_mem_req_if.byteen), + .mem_req_addr (icache_mem_req_if.addr), + .mem_req_data (icache_mem_req_if.data), + .mem_req_tag (icache_mem_req_if.tag), + .mem_req_ready (icache_mem_req_if.ready), - // DRAM response - .dram_rsp_valid (icache_dram_rsp_if.valid), - .dram_rsp_data (icache_dram_rsp_if.data), - .dram_rsp_tag (icache_dram_rsp_if.tag), - .dram_rsp_ready (icache_dram_rsp_if.ready) + // Memory response + .mem_rsp_valid (icache_mem_rsp_if.valid), + .mem_rsp_data (icache_mem_rsp_if.data), + .mem_rsp_tag (icache_mem_rsp_if.tag), + .mem_rsp_ready (icache_mem_rsp_if.ready) ); VX_cache #( @@ -155,12 +155,12 @@ module VX_mem_unit # ( .NUM_REQS (`DNUM_REQUESTS), .CREQ_SIZE (`DCREQ_SIZE), .MSHR_SIZE (`DMSHR_SIZE), - .DRSQ_SIZE (`DDRSQ_SIZE), - .DREQ_SIZE (`DDREQ_SIZE), + .MRSQ_SIZE (`DMRSQ_SIZE), + .MREQ_SIZE (`DMREQ_SIZE), .WRITE_ENABLE (1), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH), .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), - .DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH) + .MEM_TAG_WIDTH (`DMEM_TAG_WIDTH) ) dcache ( `SCOPE_BIND_VX_mem_unit_dcache @@ -188,20 +188,20 @@ module VX_mem_unit # ( .perf_cache_if (perf_dcache_if), `endif - // DRAM request - .dram_req_valid (dcache_dram_req_if.valid), - .dram_req_rw (dcache_dram_req_if.rw), - .dram_req_byteen (dcache_dram_req_if.byteen), - .dram_req_addr (dcache_dram_req_if.addr), - .dram_req_data (dcache_dram_req_if.data), - .dram_req_tag (dcache_dram_req_if.tag), - .dram_req_ready (dcache_dram_req_if.ready), + // Memory request + .mem_req_valid (dcache_mem_req_if.valid), + .mem_req_rw (dcache_mem_req_if.rw), + .mem_req_byteen (dcache_mem_req_if.byteen), + .mem_req_addr (dcache_mem_req_if.addr), + .mem_req_data (dcache_mem_req_if.data), + .mem_req_tag (dcache_mem_req_if.tag), + .mem_req_ready (dcache_mem_req_if.ready), - // DRAM response - .dram_rsp_valid (dcache_dram_rsp_if.valid), - .dram_rsp_data (dcache_dram_rsp_if.data), - .dram_rsp_tag (dcache_dram_rsp_if.tag), - .dram_rsp_ready 
(dcache_dram_rsp_if.ready) + // Memory response + .mem_rsp_valid (dcache_mem_rsp_if.valid), + .mem_rsp_data (dcache_mem_rsp_if.data), + .mem_rsp_tag (dcache_mem_rsp_if.tag), + .mem_rsp_ready (dcache_mem_rsp_if.ready) ); if (`SM_ENABLE) begin @@ -252,45 +252,45 @@ module VX_mem_unit # ( VX_mem_arb #( .NUM_REQS (2), - .DATA_WIDTH (`DDRAM_LINE_WIDTH), - .ADDR_WIDTH (`DDRAM_ADDR_WIDTH), - .TAG_IN_WIDTH (`DDRAM_TAG_WIDTH), - .TAG_OUT_WIDTH (`XDRAM_TAG_WIDTH), + .DATA_WIDTH (`DMEM_LINE_WIDTH), + .ADDR_WIDTH (`DMEM_ADDR_WIDTH), + .TAG_IN_WIDTH (`DMEM_TAG_WIDTH), + .TAG_OUT_WIDTH (`XMEM_TAG_WIDTH), .BUFFERED_REQ (1), .BUFFERED_RSP (0) - ) dram_arb ( + ) mem_arb ( .clk (clk), .reset (reset), // Source request - .req_valid_in ({dcache_dram_req_if.valid, icache_dram_req_if.valid}), - .req_rw_in ({dcache_dram_req_if.rw, icache_dram_req_if.rw}), - .req_byteen_in ({dcache_dram_req_if.byteen, icache_dram_req_if.byteen}), - .req_addr_in ({dcache_dram_req_if.addr, icache_dram_req_if.addr}), - .req_data_in ({dcache_dram_req_if.data, icache_dram_req_if.data}), - .req_tag_in ({dcache_dram_req_if.tag, icache_dram_req_if.tag}), - .req_ready_in ({dcache_dram_req_if.ready, icache_dram_req_if.ready}), + .req_valid_in ({dcache_mem_req_if.valid, icache_mem_req_if.valid}), + .req_rw_in ({dcache_mem_req_if.rw, icache_mem_req_if.rw}), + .req_byteen_in ({dcache_mem_req_if.byteen, icache_mem_req_if.byteen}), + .req_addr_in ({dcache_mem_req_if.addr, icache_mem_req_if.addr}), + .req_data_in ({dcache_mem_req_if.data, icache_mem_req_if.data}), + .req_tag_in ({dcache_mem_req_if.tag, icache_mem_req_if.tag}), + .req_ready_in ({dcache_mem_req_if.ready, icache_mem_req_if.ready}), - // DRAM request - .req_valid_out (dram_req_if.valid), - .req_rw_out (dram_req_if.rw), - .req_byteen_out (dram_req_if.byteen), - .req_addr_out (dram_req_if.addr), - .req_data_out (dram_req_if.data), - .req_tag_out (dram_req_if.tag), - .req_ready_out (dram_req_if.ready), + // Memory request + .req_valid_out (mem_req_if.valid), + 
.req_rw_out (mem_req_if.rw), + .req_byteen_out (mem_req_if.byteen), + .req_addr_out (mem_req_if.addr), + .req_data_out (mem_req_if.data), + .req_tag_out (mem_req_if.tag), + .req_ready_out (mem_req_if.ready), // Source response - .rsp_valid_out ({dcache_dram_rsp_if.valid, icache_dram_rsp_if.valid}), - .rsp_data_out ({dcache_dram_rsp_if.data, icache_dram_rsp_if.data}), - .rsp_tag_out ({dcache_dram_rsp_if.tag, icache_dram_rsp_if.tag}), - .rsp_ready_out ({dcache_dram_rsp_if.ready, icache_dram_rsp_if.ready}), + .rsp_valid_out ({dcache_mem_rsp_if.valid, icache_mem_rsp_if.valid}), + .rsp_data_out ({dcache_mem_rsp_if.data, icache_mem_rsp_if.data}), + .rsp_tag_out ({dcache_mem_rsp_if.tag, icache_mem_rsp_if.tag}), + .rsp_ready_out ({dcache_mem_rsp_if.ready, icache_mem_rsp_if.ready}), - // DRAM response - .rsp_valid_in (dram_rsp_if.valid), - .rsp_tag_in (dram_rsp_if.tag), - .rsp_data_in (dram_rsp_if.data), - .rsp_ready_in (dram_rsp_if.ready) + // Memory response + .rsp_valid_in (mem_rsp_if.valid), + .rsp_tag_in (mem_rsp_if.tag), + .rsp_data_in (mem_rsp_if.data), + .rsp_ready_in (mem_rsp_if.ready) ); `ifdef PERF_ENABLE @@ -319,47 +319,47 @@ end else begin assign perf_memsys_if.smem_bank_stalls = 0; end - reg [43:0] perf_dram_lat_per_cycle; + reg [`PERF_CTR_BITS-1:0] perf_mem_lat_per_cycle; always @(posedge clk) begin if (reset) begin - perf_dram_lat_per_cycle <= 0; + perf_mem_lat_per_cycle <= 0; end else begin - perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle + - 44'($signed(2'((dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready) && !(dram_rsp_if.valid && dram_rsp_if.ready)) - - 2'((dram_rsp_if.valid && dram_rsp_if.ready) && !(dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready)))); + perf_mem_lat_per_cycle <= perf_mem_lat_per_cycle + + `PERF_CTR_BITS'($signed(2'((mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready) && !(mem_rsp_if.valid && mem_rsp_if.ready)) - + 2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && !mem_req_if.rw && 
mem_req_if.ready)))); end end - reg [43:0] perf_dram_reads; - reg [43:0] perf_dram_writes; - reg [43:0] perf_dram_lat; - reg [43:0] perf_dram_stalls; + reg [`PERF_CTR_BITS-1:0] perf_mem_reads; + reg [`PERF_CTR_BITS-1:0] perf_mem_writes; + reg [`PERF_CTR_BITS-1:0] perf_mem_lat; + reg [`PERF_CTR_BITS-1:0] perf_mem_stalls; always @(posedge clk) begin if (reset) begin - perf_dram_reads <= 0; - perf_dram_writes <= 0; - perf_dram_lat <= 0; - perf_dram_stalls <= 0; + perf_mem_reads <= 0; + perf_mem_writes <= 0; + perf_mem_lat <= 0; + perf_mem_stalls <= 0; end else begin - if (dram_req_if.valid && dram_req_if.ready && !dram_req_if.rw) begin - perf_dram_reads <= perf_dram_reads + 44'd1; + if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin + perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'd1; end - if (dram_req_if.valid && dram_req_if.ready && dram_req_if.rw) begin - perf_dram_writes <= perf_dram_writes + 44'd1; + if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin + perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'd1; end - if (dram_req_if.valid && !dram_req_if.ready) begin - perf_dram_stalls <= perf_dram_stalls + 44'd1; + if (mem_req_if.valid && !mem_req_if.ready) begin + perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'd1; end - perf_dram_lat <= perf_dram_lat + perf_dram_lat_per_cycle; + perf_mem_lat <= perf_mem_lat + perf_mem_lat_per_cycle; end end - assign perf_memsys_if.dram_reads = perf_dram_reads; - assign perf_memsys_if.dram_writes = perf_dram_writes; - assign perf_memsys_if.dram_latency = perf_dram_lat; - assign perf_memsys_if.dram_stalls = perf_dram_stalls; + assign perf_memsys_if.mem_reads = perf_mem_reads; + assign perf_memsys_if.mem_writes = perf_mem_writes; + assign perf_memsys_if.mem_latency = perf_mem_lat; + assign perf_memsys_if.mem_stalls = perf_mem_stalls; `endif endmodule diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index d768d9dd..2e0512c3 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ 
-70,6 +70,8 @@ `define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1) `define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1)))) +/* absolute value as a pure expression: argument fully parenthesized, no trailing ';' (the caller terminates the statement) */ `define ABS(x) (($signed(x) < 0) ? (-$signed(x)) : (x)) + `define MIN(x, y) ((x < y) ? (x) : (y)) `define MAX(x, y) ((x > y) ? (x) : (y)) diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v index a1cc2078..139f0aa1 100644 --- a/hw/rtl/VX_scoreboard.v +++ b/hw/rtl/VX_scoreboard.v @@ -31,7 +31,7 @@ module VX_scoreboard #( if (release_reg) begin inuse_regs[writeback_if.wid][writeback_if.rd] <= 0; assert(inuse_regs[writeback_if.wid][writeback_if.rd] != 0) - else $error("*** %t: core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d", + else $error("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d", $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd); end end @@ -40,7 +40,7 @@ module VX_scoreboard #( `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin - $display("%t: core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", + $display("%t: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", $time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.PC, ibuf_deq_if.rd, ibuf_deq_if.wb, deq_inuse_regs[ibuf_deq_if.rd], deq_inuse_regs[ibuf_deq_if.rs1], deq_inuse_regs[ibuf_deq_if.rs2], deq_inuse_regs[ibuf_deq_if.rs3]); end @@ -54,7 +54,7 @@ module VX_scoreboard #( deadlock_ctr <= 0; end else if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin deadlock_ctr <= deadlock_ctr + 1; - assert(deadlock_ctr < deadlock_timeout) else $error("*** %t: core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", + assert(deadlock_ctr < deadlock_timeout) else $error("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", $time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.PC, ibuf_deq_if.rd, ibuf_deq_if.wb, deq_inuse_regs[ibuf_deq_if.rd], deq_inuse_regs[ibuf_deq_if.rs1], deq_inuse_regs[ibuf_deq_if.rs2], deq_inuse_regs[ibuf_deq_if.rs3]); end
else if (ibuf_deq_if.valid && ibuf_deq_if.ready) begin diff --git a/hw/rtl/VX_smem_arb.v b/hw/rtl/VX_smem_arb.v index 89316a51..bfe8ec43 100644 --- a/hw/rtl/VX_smem_arb.v +++ b/hw/rtl/VX_smem_arb.v @@ -34,7 +34,7 @@ module VX_smem_arb ( wire is_smem_addr_in, is_smem_addr_out; // select shared memory bus - assign is_smem_addr_in = core_req_if.valid[i] && `SM_ENABLE + assign is_smem_addr_in = `SM_ENABLE && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] >= (32-SMEM_ASHIFT)'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> SMEM_ASHIFT)) && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] < (32-SMEM_ASHIFT)'(`SHARED_MEM_BASE_ADDR >> SMEM_ASHIFT)); @@ -51,13 +51,13 @@ module VX_smem_arb ( .ready_out (cache_req_ready_out) ); - if (`SM_ENABLE ) begin + if (`SM_ENABLE) begin assign cache_req_if.valid[i] = cache_req_valid_out && ~is_smem_addr_out; assign smem_req_if.valid[i] = cache_req_valid_out && is_smem_addr_out; assign cache_req_ready_out = is_smem_addr_out ? smem_req_if.ready[i] : cache_req_if.ready[i]; assign smem_req_if.addr[i] = cache_req_if.addr[i]; - assign smem_req_if.rw[i] = cache_req_if.rw[i]; + assign smem_req_if.rw[i] = cache_req_if.rw[i]; assign smem_req_if.byteen[i] = cache_req_if.byteen[i]; assign smem_req_if.data[i] = cache_req_if.data[i]; assign smem_req_if.tag[i] = cache_req_if.tag[i]; diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index 2f2d34e9..15a1def4 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -7,20 +7,20 @@ module Vortex ( input wire clk, input wire reset, - // DRAM request - output wire dram_req_valid, - output wire dram_req_rw, - output wire [`VX_DRAM_BYTEEN_WIDTH-1:0] dram_req_byteen, - output wire [`VX_DRAM_ADDR_WIDTH-1:0] dram_req_addr, - output wire [`VX_DRAM_LINE_WIDTH-1:0] dram_req_data, - output wire [`VX_DRAM_TAG_WIDTH-1:0] dram_req_tag, - input wire dram_req_ready, + // Memory request + output wire mem_req_valid, + output wire mem_req_rw, + output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen, + output wire 
[`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr, + output wire [`VX_MEM_LINE_WIDTH-1:0] mem_req_data, + output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag, + input wire mem_req_ready, - // DRAM response - input wire dram_rsp_valid, - input wire [`VX_DRAM_LINE_WIDTH-1:0] dram_rsp_data, - input wire [`VX_DRAM_TAG_WIDTH-1:0] dram_rsp_tag, - output wire dram_rsp_ready, + // Memory response + input wire mem_rsp_valid, + input wire [`VX_MEM_LINE_WIDTH-1:0] mem_rsp_data, + input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag, + output wire mem_rsp_ready, // CSR Request input wire csr_req_valid, @@ -40,18 +40,18 @@ module Vortex ( output wire ebreak ); - wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_valid; - wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_rw; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_BYTEEN_WIDTH-1:0] per_cluster_dram_req_byteen; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_ADDR_WIDTH-1:0] per_cluster_dram_req_addr; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_req_data; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_req_tag; - wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_ready; + wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_valid; + wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_rw; + wire [`NUM_CLUSTERS-1:0][`L2MEM_BYTEEN_WIDTH-1:0] per_cluster_mem_req_byteen; + wire [`NUM_CLUSTERS-1:0][`L2MEM_ADDR_WIDTH-1:0] per_cluster_mem_req_addr; + wire [`NUM_CLUSTERS-1:0][`L2MEM_LINE_WIDTH-1:0] per_cluster_mem_req_data; + wire [`NUM_CLUSTERS-1:0][`L2MEM_TAG_WIDTH-1:0] per_cluster_mem_req_tag; + wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_ready; - wire [`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_valid; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_rsp_data; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_rsp_tag; - wire [`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_ready; + wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_valid; + wire [`NUM_CLUSTERS-1:0][`L2MEM_LINE_WIDTH-1:0] per_cluster_mem_rsp_data; + wire 
[`NUM_CLUSTERS-1:0][`L2MEM_TAG_WIDTH-1:0] per_cluster_mem_rsp_tag; + wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_ready; wire [`NUM_CLUSTERS-1:0] per_cluster_csr_req_valid; wire [`NUM_CLUSTERS-1:0][11:0] per_cluster_csr_req_addr; @@ -88,18 +88,18 @@ module Vortex ( .clk (clk), .reset (cluster_reset), - .dram_req_valid (per_cluster_dram_req_valid [i]), - .dram_req_rw (per_cluster_dram_req_rw [i]), - .dram_req_byteen(per_cluster_dram_req_byteen[i]), - .dram_req_addr (per_cluster_dram_req_addr [i]), - .dram_req_data (per_cluster_dram_req_data [i]), - .dram_req_tag (per_cluster_dram_req_tag [i]), - .dram_req_ready (per_cluster_dram_req_ready [i]), + .mem_req_valid (per_cluster_mem_req_valid [i]), + .mem_req_rw (per_cluster_mem_req_rw [i]), + .mem_req_byteen (per_cluster_mem_req_byteen[i]), + .mem_req_addr (per_cluster_mem_req_addr [i]), + .mem_req_data (per_cluster_mem_req_data [i]), + .mem_req_tag (per_cluster_mem_req_tag [i]), + .mem_req_ready (per_cluster_mem_req_ready [i]), - .dram_rsp_valid (per_cluster_dram_rsp_valid [i]), - .dram_rsp_data (per_cluster_dram_rsp_data [i]), - .dram_rsp_tag (per_cluster_dram_rsp_tag [i]), - .dram_rsp_ready (per_cluster_dram_rsp_ready [i]), + .mem_rsp_valid (per_cluster_mem_rsp_valid [i]), + .mem_rsp_data (per_cluster_mem_rsp_data [i]), + .mem_rsp_tag (per_cluster_mem_rsp_tag [i]), + .mem_rsp_ready (per_cluster_mem_rsp_ready [i]), .csr_req_valid (per_cluster_csr_req_valid [i]), .csr_req_coreid (csr_core_id), @@ -171,12 +171,12 @@ module Vortex ( .NUM_REQS (`NUM_CLUSTERS), .CREQ_SIZE (`L3CREQ_SIZE), .MSHR_SIZE (`L3MSHR_SIZE), - .DRSQ_SIZE (`L3DRSQ_SIZE), - .DREQ_SIZE (`L3DREQ_SIZE), + .MRSQ_SIZE (`L3MRSQ_SIZE), + .MREQ_SIZE (`L3MREQ_SIZE), .WRITE_ENABLE (1), - .CORE_TAG_WIDTH (`L2DRAM_TAG_WIDTH), + .CORE_TAG_WIDTH (`L2MEM_TAG_WIDTH), .CORE_TAG_ID_BITS (0), - .DRAM_TAG_WIDTH (`L3DRAM_TAG_WIDTH) + .MEM_TAG_WIDTH (`L3MEM_TAG_WIDTH) ) l3cache ( `SCOPE_BIND_Vortex_l3cache @@ -190,105 +190,105 @@ module Vortex ( `endif // Core request - 
.core_req_valid (per_cluster_dram_req_valid), - .core_req_rw (per_cluster_dram_req_rw), - .core_req_byteen (per_cluster_dram_req_byteen), - .core_req_addr (per_cluster_dram_req_addr), - .core_req_data (per_cluster_dram_req_data), - .core_req_tag (per_cluster_dram_req_tag), - .core_req_ready (per_cluster_dram_req_ready), + .core_req_valid (per_cluster_mem_req_valid), + .core_req_rw (per_cluster_mem_req_rw), + .core_req_byteen (per_cluster_mem_req_byteen), + .core_req_addr (per_cluster_mem_req_addr), + .core_req_data (per_cluster_mem_req_data), + .core_req_tag (per_cluster_mem_req_tag), + .core_req_ready (per_cluster_mem_req_ready), // Core response - .core_rsp_valid (per_cluster_dram_rsp_valid), - .core_rsp_data (per_cluster_dram_rsp_data), - .core_rsp_tag (per_cluster_dram_rsp_tag), - .core_rsp_ready (per_cluster_dram_rsp_ready), + .core_rsp_valid (per_cluster_mem_rsp_valid), + .core_rsp_data (per_cluster_mem_rsp_data), + .core_rsp_tag (per_cluster_mem_rsp_tag), + .core_rsp_ready (per_cluster_mem_rsp_ready), - // DRAM request - .dram_req_valid (dram_req_valid), - .dram_req_rw (dram_req_rw), - .dram_req_byteen (dram_req_byteen), - .dram_req_addr (dram_req_addr), - .dram_req_data (dram_req_data), - .dram_req_tag (dram_req_tag), - .dram_req_ready (dram_req_ready), + // Memory request + .mem_req_valid (mem_req_valid), + .mem_req_rw (mem_req_rw), + .mem_req_byteen (mem_req_byteen), + .mem_req_addr (mem_req_addr), + .mem_req_data (mem_req_data), + .mem_req_tag (mem_req_tag), + .mem_req_ready (mem_req_ready), - // DRAM response - .dram_rsp_valid (dram_rsp_valid), - .dram_rsp_data (dram_rsp_data), - .dram_rsp_tag (dram_rsp_tag), - .dram_rsp_ready (dram_rsp_ready) + // Memory response + .mem_rsp_valid (mem_rsp_valid), + .mem_rsp_data (mem_rsp_data), + .mem_rsp_tag (mem_rsp_tag), + .mem_rsp_ready (mem_rsp_ready) ); end else begin VX_mem_arb #( - .NUM_REQS (`NUM_CLUSTERS), - .DATA_WIDTH (`L3DRAM_LINE_WIDTH), - .TAG_IN_WIDTH (`L2DRAM_TAG_WIDTH), - .TAG_OUT_WIDTH 
(`L3DRAM_TAG_WIDTH), - .BUFFERED_REQ (1), - .BUFFERED_RSP (1) - ) dram_arb ( + .NUM_REQS (`NUM_CLUSTERS), + .DATA_WIDTH (`L3MEM_LINE_WIDTH), + .TAG_IN_WIDTH (`L2MEM_TAG_WIDTH), + .TAG_OUT_WIDTH (`L3MEM_TAG_WIDTH), + .BUFFERED_REQ (1), + .BUFFERED_RSP (1) + ) mem_arb ( .clk (clk), .reset (reset), // Core request - .req_valid_in (per_cluster_dram_req_valid), - .req_rw_in (per_cluster_dram_req_rw), - .req_byteen_in (per_cluster_dram_req_byteen), - .req_addr_in (per_cluster_dram_req_addr), - .req_data_in (per_cluster_dram_req_data), - .req_tag_in (per_cluster_dram_req_tag), - .req_ready_in (per_cluster_dram_req_ready), + .req_valid_in (per_cluster_mem_req_valid), + .req_rw_in (per_cluster_mem_req_rw), + .req_byteen_in (per_cluster_mem_req_byteen), + .req_addr_in (per_cluster_mem_req_addr), + .req_data_in (per_cluster_mem_req_data), + .req_tag_in (per_cluster_mem_req_tag), + .req_ready_in (per_cluster_mem_req_ready), - // DRAM request - .req_valid_out (dram_req_valid), - .req_rw_out (dram_req_rw), - .req_byteen_out (dram_req_byteen), - .req_addr_out (dram_req_addr), - .req_data_out (dram_req_data), - .req_tag_out (dram_req_tag), - .req_ready_out (dram_req_ready), + // Memory request + .req_valid_out (mem_req_valid), + .req_rw_out (mem_req_rw), + .req_byteen_out (mem_req_byteen), + .req_addr_out (mem_req_addr), + .req_data_out (mem_req_data), + .req_tag_out (mem_req_tag), + .req_ready_out (mem_req_ready), // Core response - .rsp_valid_out (per_cluster_dram_rsp_valid), - .rsp_data_out (per_cluster_dram_rsp_data), - .rsp_tag_out (per_cluster_dram_rsp_tag), - .rsp_ready_out (per_cluster_dram_rsp_ready), + .rsp_valid_out (per_cluster_mem_rsp_valid), + .rsp_data_out (per_cluster_mem_rsp_data), + .rsp_tag_out (per_cluster_mem_rsp_tag), + .rsp_ready_out (per_cluster_mem_rsp_ready), - // DRAM response - .rsp_valid_in (dram_rsp_valid), - .rsp_tag_in (dram_rsp_tag), - .rsp_data_in (dram_rsp_data), - .rsp_ready_in (dram_rsp_ready) + // Memory response + .rsp_valid_in 
(mem_rsp_valid), + .rsp_tag_in (mem_rsp_tag), + .rsp_data_in (mem_rsp_data), + .rsp_ready_in (mem_rsp_ready) ); end `SCOPE_ASSIGN (reset, reset); - `SCOPE_ASSIGN (dram_req_fire, dram_req_valid && dram_req_ready); - `SCOPE_ASSIGN (dram_req_addr, `TO_FULL_ADDR(dram_req_addr)); - `SCOPE_ASSIGN (dram_req_rw, dram_req_rw); - `SCOPE_ASSIGN (dram_req_byteen, dram_req_byteen); - `SCOPE_ASSIGN (dram_req_data, dram_req_data); - `SCOPE_ASSIGN (dram_req_tag, dram_req_tag); - `SCOPE_ASSIGN (dram_rsp_fire, dram_rsp_valid && dram_rsp_ready); - `SCOPE_ASSIGN (dram_rsp_data, dram_rsp_data); - `SCOPE_ASSIGN (dram_rsp_tag, dram_rsp_tag); + `SCOPE_ASSIGN (mem_req_fire, mem_req_valid && mem_req_ready); + `SCOPE_ASSIGN (mem_req_addr, `TO_FULL_ADDR(mem_req_addr)); + `SCOPE_ASSIGN (mem_req_rw, mem_req_rw); + `SCOPE_ASSIGN (mem_req_byteen, mem_req_byteen); + `SCOPE_ASSIGN (mem_req_data, mem_req_data); + `SCOPE_ASSIGN (mem_req_tag, mem_req_tag); + `SCOPE_ASSIGN (mem_rsp_fire, mem_rsp_valid && mem_rsp_ready); + `SCOPE_ASSIGN (mem_rsp_data, mem_rsp_data); + `SCOPE_ASSIGN (mem_rsp_tag, mem_rsp_tag); `SCOPE_ASSIGN (busy, busy); -`ifdef DBG_PRINT_DRAM +`ifdef DBG_PRINT_MEM always @(posedge clk) begin - if (dram_req_valid && dram_req_ready) begin - if (dram_req_rw) - $display("%t: DRAM Wr Req: addr=%0h, tag=%0h, byteen=%0h data=%0h", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_tag, dram_req_byteen, dram_req_data); + if (mem_req_valid && mem_req_ready) begin + if (mem_req_rw) + $display("%t: MEM Wr Req: addr=%0h, tag=%0h, byteen=%0h data=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data); else - $display("%t: DRAM Rd Req: addr=%0h, tag=%0h, byteen=%0h", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_tag, dram_req_byteen); + $display("%t: MEM Rd Req: addr=%0h, tag=%0h, byteen=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen); end - if (dram_rsp_valid && dram_rsp_ready) begin - $display("%t: DRAM Rsp: tag=%0h, data=%0h", $time, dram_rsp_tag, 
dram_rsp_data); + if (mem_rsp_valid && mem_rsp_ready) begin + $display("%t: MEM Rsp: tag=%0h, data=%0h", $time, mem_rsp_tag, mem_rsp_data); end end `endif diff --git a/hw/rtl/afu/VX_avs_wrapper.v b/hw/rtl/afu/VX_avs_wrapper.v index 7c0860c0..b3d0a5d5 100644 --- a/hw/rtl/afu/VX_avs_wrapper.v +++ b/hw/rtl/afu/VX_avs_wrapper.v @@ -1,133 +1,166 @@ `include "VX_define.vh" module VX_avs_wrapper #( - parameter AVS_DATAW = 1, - parameter AVS_ADDRW = 1, - parameter AVS_BURSTW = 1, - parameter AVS_BANKS = 1, - parameter REQ_TAGW = 1, - parameter RD_QUEUE_SIZE = 1, + parameter NUM_BANKS = 1, + parameter AVS_DATA_WIDTH = 1, + parameter AVS_ADDR_WIDTH = 1, + parameter AVS_BURST_WIDTH = 1, + parameter AVS_BANKS = 1, + parameter REQ_TAG_WIDTH = 1, + parameter RD_QUEUE_SIZE = 1, - parameter AVS_BYTEENW = (AVS_DATAW / 8), - parameter RD_QUEUE_ADDRW= $clog2(RD_QUEUE_SIZE+1), - parameter AVS_BANKS_BITS= $clog2(AVS_BANKS) + parameter AVS_BYTEENW = (AVS_DATA_WIDTH / 8), + parameter RD_QUEUE_ADDR_WIDTH = $clog2(RD_QUEUE_SIZE+1), + parameter AVS_BANKS_BITS = $clog2(AVS_BANKS) ) ( - input wire clk, - input wire reset, + input wire clk, + input wire reset, + + // Memory request + input wire mem_req_valid, + input wire mem_req_rw, + input wire [AVS_BYTEENW-1:0] mem_req_byteen, + input wire [AVS_ADDR_WIDTH-1:0] mem_req_addr, + input wire [AVS_DATA_WIDTH-1:0] mem_req_data, + input wire [REQ_TAG_WIDTH-1:0] mem_req_tag, + output wire mem_req_ready, + + // Memory response + output wire mem_rsp_valid, + output wire [AVS_DATA_WIDTH-1:0] mem_rsp_data, + output wire [REQ_TAG_WIDTH-1:0] mem_rsp_tag, + input wire mem_rsp_ready, // AVS bus - output wire [AVS_DATAW-1:0] avs_writedata, - input wire [AVS_DATAW-1:0] avs_readdata, - output wire [AVS_ADDRW-1:0] avs_address, - input wire avs_waitrequest, - output wire avs_write, - output wire avs_read, - output wire [AVS_BYTEENW-1:0] avs_byteenable, - output wire [AVS_BURSTW-1:0] avs_burstcount, - input avs_readdatavalid, - output wire [AVS_BANKS_BITS-1:0] 
avs_bankselect, - - // DRAM request - input wire dram_req_valid, - input wire dram_req_rw, - input wire [AVS_BYTEENW-1:0] dram_req_byteen, - input wire [AVS_ADDRW-1:0] dram_req_addr, - input wire [AVS_DATAW-1:0] dram_req_data, - input wire [REQ_TAGW-1:0] dram_req_tag, - output wire dram_req_ready, - - // DRAM response - output wire dram_rsp_valid, - output wire [AVS_DATAW-1:0] dram_rsp_data, - output wire [REQ_TAGW-1:0] dram_rsp_tag, - input wire dram_rsp_ready + output wire [AVS_DATA_WIDTH-1:0] avs_writedata [NUM_BANKS], + input wire [AVS_DATA_WIDTH-1:0] avs_readdata [NUM_BANKS], + output wire [AVS_ADDR_WIDTH-1:0] avs_address [NUM_BANKS], + input wire avs_waitrequest [NUM_BANKS], + output wire avs_write [NUM_BANKS], + output wire avs_read [NUM_BANKS], + output wire [AVS_BYTEENW-1:0] avs_byteenable [NUM_BANKS], + output wire [AVS_BURST_WIDTH-1:0] avs_burstcount [NUM_BANKS], + input avs_readdatavalid [NUM_BANKS] ); - reg [AVS_BANKS_BITS-1:0] avs_bankselect_r; - reg [AVS_BURSTW-1:0] avs_burstcount_r; - wire avs_reqq_push = dram_req_valid && dram_req_ready && !dram_req_rw; - wire avs_reqq_pop = dram_rsp_valid && dram_rsp_ready; + localparam BANK_ADDRW = `LOG2UP(NUM_BANKS); - wire avs_rspq_push = avs_readdatavalid; - wire avs_rspq_pop = avs_reqq_pop; - wire avs_rspq_empty; - - wire rsp_queue_going_full; - wire [RD_QUEUE_ADDRW-1:0] rsp_queue_size; - VX_pending_size #( - .SIZE (RD_QUEUE_SIZE) - ) pending_size ( - .clk (clk), - .reset (reset), - .push (avs_reqq_push), - .pop (avs_rspq_pop), - `UNUSED_PIN (empty), - .full (rsp_queue_going_full), - .size (rsp_queue_size) - ); - `UNUSED_VAR (rsp_queue_size) - - always @(posedge clk) begin - avs_burstcount_r <= 1; - avs_bankselect_r <= 0; - end + // Requests handling - VX_fifo_queue #( - .DATAW (REQ_TAGW), - .SIZE (RD_QUEUE_SIZE) - ) rd_req_queue ( - .clk (clk), - .reset (reset), - .push (avs_reqq_push), - .pop (avs_reqq_pop), - .data_in (dram_req_tag), - .data_out (dram_rsp_tag), - `UNUSED_PIN (empty), - `UNUSED_PIN (full), 
- `UNUSED_PIN (alm_empty), - `UNUSED_PIN (alm_full), - `UNUSED_PIN (size) + wire [NUM_BANKS-1:0] avs_reqq_push, avs_reqq_pop, avs_reqq_ready; + wire [NUM_BANKS-1:0] req_queue_going_full; + wire [NUM_BANKS-1:0][RD_QUEUE_ADDR_WIDTH-1:0] req_queue_size; + wire [NUM_BANKS-1:0][REQ_TAG_WIDTH-1:0] avs_reqq_data_out; + + wire [BANK_ADDRW-1:0] req_bank_sel = (NUM_BANKS >= 2) ? mem_req_addr[BANK_ADDRW-1:0] : '0; + + for (genvar i = 0; i < NUM_BANKS; i++) begin + assign avs_reqq_ready[i] = !req_queue_going_full[i] && !avs_waitrequest[i]; + assign avs_reqq_push[i] = mem_req_valid && !mem_req_rw && avs_reqq_ready[i] && (req_bank_sel == i); + end + + for (genvar i = 0; i < NUM_BANKS; i++) begin + VX_pending_size #( + .SIZE (RD_QUEUE_SIZE) + ) pending_size ( + .clk (clk), + .reset (reset), + .push (avs_reqq_push[i]), + .pop (avs_reqq_pop[i]), + .full (req_queue_going_full[i]), + .size (req_queue_size[i]), + `UNUSED_PIN (empty) + ); + `UNUSED_VAR (req_queue_size) + + VX_fifo_queue #( + .DATAW (REQ_TAG_WIDTH), + .SIZE (RD_QUEUE_SIZE) + ) rd_req_queue ( + .clk (clk), + .reset (reset), + .push (avs_reqq_push[i]), + .pop (avs_reqq_pop[i]), + .data_in (mem_req_tag), + .data_out (avs_reqq_data_out[i]), + `UNUSED_PIN (empty), + `UNUSED_PIN (full), + `UNUSED_PIN (alm_empty), + `UNUSED_PIN (alm_full), + `UNUSED_PIN (size) + ); + end + + for (genvar i = 0; i < NUM_BANKS; i++) begin + assign avs_read[i] = mem_req_valid && !mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i); + assign avs_write[i] = mem_req_valid && mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i); + assign avs_address[i] = mem_req_addr; + assign avs_byteenable[i] = mem_req_byteen; + assign avs_writedata[i] = mem_req_data; + assign avs_burstcount[i] = AVS_BURST_WIDTH'(1); + end + + assign mem_req_ready = avs_reqq_ready[req_bank_sel]; + + // Responses handling + + wire [NUM_BANKS-1:0] rsp_arb_valid_in; + wire [NUM_BANKS-1:0][AVS_DATA_WIDTH+REQ_TAG_WIDTH-1:0] rsp_arb_data_in; + wire [NUM_BANKS-1:0] 
rsp_arb_ready_in; + + wire [NUM_BANKS-1:0][AVS_DATA_WIDTH-1:0] avs_rspq_data_out; + wire [NUM_BANKS-1:0] avs_rspq_empty; + + for (genvar i = 0; i < NUM_BANKS; i++) begin + VX_fifo_queue #( + .DATAW (AVS_DATA_WIDTH), + .SIZE (RD_QUEUE_SIZE) + ) rd_rsp_queue ( + .clk (clk), + .reset (reset), + .push (avs_readdatavalid[i]), + .pop (avs_reqq_pop[i]), + .data_in (avs_readdata[i]), + .data_out (avs_rspq_data_out[i]), + .empty (avs_rspq_empty[i]), + `UNUSED_PIN (full), + `UNUSED_PIN (alm_empty), + `UNUSED_PIN (alm_full), + `UNUSED_PIN (size) + ); + end + + for (genvar i = 0; i < NUM_BANKS; i++) begin + assign rsp_arb_valid_in[i] = !avs_rspq_empty[i]; + assign rsp_arb_data_in[i] = {avs_rspq_data_out[i], avs_reqq_data_out[i]}; + assign avs_reqq_pop[i] = rsp_arb_valid_in[i] && rsp_arb_ready_in[i]; + end + + VX_stream_arbiter #( + .NUM_REQS (NUM_BANKS), + .DATAW (AVS_DATA_WIDTH + REQ_TAG_WIDTH), + .BUFFERED (NUM_BANKS > 2) + ) rsp_arb ( + .clk (clk), + .reset (reset), + .valid_in (rsp_arb_valid_in), + .data_in (rsp_arb_data_in), + .ready_in (rsp_arb_ready_in), + .valid_out (mem_rsp_valid), + .data_out ({mem_rsp_data, mem_rsp_tag}), + .ready_out (mem_rsp_ready) ); - VX_fifo_queue #( - .DATAW (AVS_DATAW), - .SIZE (RD_QUEUE_SIZE) - ) rd_rsp_queue ( - .clk (clk), - .reset (reset), - .push (avs_rspq_push), - .pop (avs_rspq_pop), - .data_in (avs_readdata), - .data_out (dram_rsp_data), - .empty (avs_rspq_empty), - `UNUSED_PIN (full), - `UNUSED_PIN (alm_empty), - `UNUSED_PIN (alm_full), - `UNUSED_PIN (size) - ); - - assign avs_read = dram_req_valid && !dram_req_rw && !rsp_queue_going_full; - assign avs_write = dram_req_valid && dram_req_rw && !rsp_queue_going_full; - assign avs_address = dram_req_addr; - assign avs_byteenable = dram_req_byteen; - assign avs_writedata = dram_req_data; - assign avs_burstcount = avs_burstcount_r; - assign avs_bankselect = avs_bankselect_r; - - assign dram_req_ready = !avs_waitrequest && !rsp_queue_going_full; - - assign dram_rsp_valid = !avs_rspq_empty; 
- `ifdef DBG_PRINT_AVS always @(posedge clk) begin - if (dram_req_valid && dram_req_ready) begin - if (dram_req_rw) - $display("%t: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_byteen, dram_req_tag, dram_req_data); + if (mem_req_valid && mem_req_ready) begin + if (mem_req_rw) + $display("%t: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, mem_req_data); else - $display("%t: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_byteen, dram_req_tag, rsp_queue_size); + $display("%t: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, req_queue_size); end - if (dram_rsp_valid && dram_rsp_ready) begin - $display("%t: AVS Rd Rsp: tag=%0h, data=%0h, pending=%0d", $time, dram_rsp_tag, dram_rsp_data, rsp_queue_size); + if (mem_rsp_valid && mem_rsp_ready) begin + $display("%t: AVS Rd Rsp: tag=%0h, data=%0h, pending=%0d", $time, mem_rsp_tag, mem_rsp_data, req_queue_size); end end `endif diff --git a/hw/rtl/afu/VX_to_mem.v b/hw/rtl/afu/VX_to_mem.v new file mode 100644 index 00000000..c9c6287c --- /dev/null +++ b/hw/rtl/afu/VX_to_mem.v @@ -0,0 +1,178 @@ +`include "VX_define.vh" + /* VX_to_mem: width adapter between a source memory interface (SRC_* widths) and a destination memory interface (DST_* widths). Request ports named *_in face the source and *_out face the destination; response ports named *_in come from the destination and *_out go back to the source. SRC_DATA_SIZE and DST_DATA_SIZE default to the data width divided by 8 and size the byte-enable vectors. One of three generate branches is chosen by comparing $clog2 of the two data widths: destination wider, destination narrower, or equal. */ +module VX_to_mem #( + parameter SRC_DATA_WIDTH = 1, + parameter SRC_ADDR_WIDTH = 1, + parameter DST_DATA_WIDTH = 1, + parameter DST_ADDR_WIDTH = 1, + parameter SRC_TAG_WIDTH = 1, + parameter DST_TAG_WIDTH = 1, + parameter SRC_DATA_SIZE = (SRC_DATA_WIDTH / 8), + parameter DST_DATA_SIZE = (DST_DATA_WIDTH / 8) +) ( + input wire clk, + input wire reset, + + input wire mem_req_valid_in, + input wire [SRC_ADDR_WIDTH-1:0] mem_req_addr_in, + input wire mem_req_rw_in, + input wire [SRC_DATA_SIZE-1:0] mem_req_byteen_in, + input wire [SRC_DATA_WIDTH-1:0] mem_req_data_in, + input wire [SRC_TAG_WIDTH-1:0] mem_req_tag_in, + output wire mem_req_ready_in, + + output wire
mem_req_valid_out, + output wire [DST_ADDR_WIDTH-1:0] mem_req_addr_out, + output wire mem_req_rw_out, + output wire [DST_DATA_SIZE-1:0] mem_req_byteen_out, + output wire [DST_DATA_WIDTH-1:0] mem_req_data_out, + output wire [DST_TAG_WIDTH-1:0] mem_req_tag_out, + input wire mem_req_ready_out, + + input wire mem_rsp_valid_in, + input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_in, + input wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in, + output wire mem_rsp_ready_in, + + output wire mem_rsp_valid_out, + output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_out, + output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_out, + input wire mem_rsp_ready_out +); /* Compile-time check that the destination tag is at least as wide as the source tag. NOTE(review): the wider-destination branch also packs D index bits under the source tag, so it looks like it needs DST_TAG_WIDTH >= SRC_TAG_WIDTH + D; confirm against the instantiations. */ + `STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!")) + /* D is the absolute difference of the two $clog2 data widths; P = 2**D is the number of narrow words that fit in one wide word. */ + localparam DST_LDATAW = $clog2(DST_DATA_WIDTH); + localparam SRC_LDATAW = $clog2(SRC_DATA_WIDTH); + localparam D = `ABS(DST_LDATAW - SRC_LDATAW); + localparam P = 2**D; + + `UNUSED_VAR (mem_rsp_tag_in) + + if (DST_LDATAW > SRC_LDATAW) begin + /* Destination wider than source: no clocked state (clk and reset are marked unused). The low D bits of the source address (req_idx) select a source-width lane of the destination word: byte-enable and write data are shifted into that lane, the remaining address bits form the destination address, and req_idx is placed in the low bits of the destination tag so the response path can recover it (rsp_idx) and pick the same lane out of the wide read data. */ + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + + wire [D-1:0] req_idx = mem_req_addr_in[D-1:0]; + wire [D-1:0] rsp_idx = mem_rsp_tag_in[D-1:0]; + + wire [SRC_ADDR_WIDTH-D-1:0] mem_req_addr_in_qual = mem_req_addr_in[SRC_ADDR_WIDTH-1:D]; + + wire [P-1:0][SRC_DATA_WIDTH-1:0] mem_rsp_data_in_w = mem_rsp_data_in; + + if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH - D)) begin + `UNUSED_VAR (mem_req_addr_in_qual) + assign mem_req_addr_out = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0]; + end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH - D)) begin + assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in_qual); + end else begin + assign mem_req_addr_out = mem_req_addr_in_qual; + end + + assign mem_req_valid_out = mem_req_valid_in; + assign mem_req_rw_out = mem_req_rw_in; + assign mem_req_byteen_out = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3)); + assign mem_req_data_out = DST_DATA_WIDTH'(mem_req_data_in) << ((DST_LDATAW'(req_idx)) << SRC_LDATAW); + assign mem_req_tag_out = DST_TAG_WIDTH'({mem_req_tag_in, req_idx}); + assign
mem_req_ready_in = mem_req_ready_out; + + assign mem_rsp_valid_out = mem_rsp_valid_in; + assign mem_rsp_data_out = mem_rsp_data_in_w[rsp_idx]; + assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in[SRC_TAG_WIDTH+D-1:D]); + assign mem_rsp_ready_in = mem_rsp_ready_out; + + end else if (DST_LDATAW < SRC_LDATAW) begin + /* Destination narrower than source: one source request is issued as P destination beats. req_ctr selects the data and byte-enable slice for the current beat and supplies the low address bits, and the source request is only acknowledged (mem_req_ready_in) on beat P-1. On the response side rsp_ctr selects which slice of the assembly register the incoming beat fills, and mem_rsp_valid_out is raised only on beat P-1, when mem_rsp_data_out_n holds the full source-width word. */ + reg [D-1:0] req_ctr, rsp_ctr; + + reg [P-1:0][DST_DATA_WIDTH-1:0] mem_rsp_data_out_r, mem_rsp_data_out_n; + + wire mem_req_out_fire = mem_req_valid_out && mem_req_ready_out; + wire mem_rsp_in_fire = mem_rsp_valid_in && mem_rsp_ready_in; + + wire [P-1:0][DST_DATA_WIDTH-1:0] mem_req_data_in_w = mem_req_data_in; + wire [P-1:0][DST_DATA_SIZE-1:0] mem_req_byteen_in_w = mem_req_byteen_in; + + always @(*) begin + mem_rsp_data_out_n = mem_rsp_data_out_r; + mem_rsp_data_out_n[rsp_ctr] = mem_rsp_data_in; + end + + always @(posedge clk) begin + if (reset) begin + req_ctr <= 0; + rsp_ctr <= 0; + end else begin + if (mem_req_out_fire) begin + req_ctr <= req_ctr + 1; + end + if (mem_rsp_in_fire) begin + rsp_ctr <= rsp_ctr + 1; + mem_rsp_data_out_r <= mem_rsp_data_out_n; + end + end + end + /* Tag consistency check: the tag of every accepted response beat is latched; while rsp_ctr != 0 the assertion compares the incoming tag with the latched one, i.e. all beats of one response are expected to carry the same tag. */ + reg [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_r; + wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_w; + + always @(posedge clk) begin + if (mem_rsp_in_fire) begin + mem_rsp_tag_in_r <= mem_rsp_tag_in; + end + end + assign mem_rsp_tag_in_w = (rsp_ctr != 0) ?
mem_rsp_tag_in_r : mem_rsp_tag_in; + `RUNTIME_ASSERT((mem_rsp_tag_in_w == mem_rsp_tag_in), ("oops!")) + + wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr}; + + if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH + D)) begin + `UNUSED_VAR (mem_req_addr_in_qual) + assign mem_req_addr_out = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0]; + end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH + D)) begin + assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in_qual); + end else begin + assign mem_req_addr_out = mem_req_addr_in_qual; + end + + assign mem_req_valid_out = mem_req_valid_in; + assign mem_req_rw_out = mem_req_rw_in; + assign mem_req_byteen_out = mem_req_byteen_in_w[req_ctr]; + assign mem_req_data_out = mem_req_data_in_w[req_ctr]; + assign mem_req_tag_out = DST_TAG_WIDTH'(mem_req_tag_in); + assign mem_req_ready_in = mem_req_ready_out && (req_ctr == (P-1)); + + assign mem_rsp_valid_out = mem_rsp_valid_in && (rsp_ctr == (P-1)); + assign mem_rsp_data_out = mem_rsp_data_out_n; + assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in); + assign mem_rsp_ready_in = mem_rsp_ready_out; + + end else begin + /* Equal data widths: requests and responses pass straight through; only the address and the tags are truncated or width-cast. */ + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + + if (DST_ADDR_WIDTH < SRC_ADDR_WIDTH) begin + `UNUSED_VAR (mem_req_addr_in) + assign mem_req_addr_out = mem_req_addr_in[DST_ADDR_WIDTH-1:0]; + end else if (DST_ADDR_WIDTH > SRC_ADDR_WIDTH) begin + assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in); + end else begin + assign mem_req_addr_out = mem_req_addr_in; + end + + assign mem_req_valid_out = mem_req_valid_in; + assign mem_req_rw_out = mem_req_rw_in; + assign mem_req_byteen_out = mem_req_byteen_in; + assign mem_req_data_out = mem_req_data_in; + assign mem_req_tag_out = DST_TAG_WIDTH'(mem_req_tag_in); + assign mem_req_ready_in = mem_req_ready_out; + + assign mem_rsp_valid_out = mem_rsp_valid_in; + assign mem_rsp_data_out = mem_rsp_data_in; + assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in); + assign mem_rsp_ready_in = mem_rsp_ready_out; + + end +
+endmodule \ No newline at end of file diff --git a/hw/rtl/afu/ccip_std_afu.sv b/hw/rtl/afu/ccip_std_afu.sv index 4534d40c..2adea591 100644 --- a/hw/rtl/afu/ccip_std_afu.sv +++ b/hw/rtl/afu/ccip_std_afu.sv @@ -77,30 +77,28 @@ module ccip_std_afu #( // User AFU goes here // ==================================================================== - // - // vortex_afu depends on CCI-P and local memory being in the same - // clock domain. This is accomplished by choosing a common clock - // in the AFU's JSON description. The platform instantiates clock- - // crossing shims automatically, as needed. - // + t_local_mem_byte_mask avs_byteenable [NUM_LOCAL_MEM_BANKS]; + logic avs_waitrequest [NUM_LOCAL_MEM_BANKS]; + t_local_mem_data avs_readdata [NUM_LOCAL_MEM_BANKS]; + logic avs_readdatavalid [NUM_LOCAL_MEM_BANKS]; + t_local_mem_burst_cnt avs_burstcount [NUM_LOCAL_MEM_BANKS]; + t_local_mem_data avs_writedata [NUM_LOCAL_MEM_BANKS]; + t_local_mem_addr avs_address [NUM_LOCAL_MEM_BANKS]; + logic avs_write [NUM_LOCAL_MEM_BANKS]; + logic avs_read [NUM_LOCAL_MEM_BANKS]; - // - // Memory banks are used very simply here. Only bank is active at - // a time, selected by mem_bank_select. mem_bank_select is set - // by a CSR from the host.
- // - t_local_mem_byte_mask avs_byteenable; - logic avs_waitrequest; - t_local_mem_data avs_readdata; - logic avs_readdatavalid; - t_local_mem_burst_cnt avs_burstcount; - t_local_mem_data avs_writedata; - t_local_mem_addr avs_address; - logic avs_write; - logic avs_read; - - // choose which memory bank to test - logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select; + for (genvar b = 0; b < NUM_LOCAL_MEM_BANKS; b++) begin + assign local_mem[b].burstcount = avs_burstcount[b]; + assign local_mem[b].writedata = avs_writedata[b]; + assign local_mem[b].address = avs_address[b]; + assign local_mem[b].byteenable = avs_byteenable[b]; + assign local_mem[b].write = avs_write[b]; + assign local_mem[b].read = avs_read[b]; + + assign avs_waitrequest[b] = local_mem[b].waitrequest; + assign avs_readdata[b] = local_mem[b].readdata; + assign avs_readdatavalid[b] = local_mem[b].readdatavalid; + end vortex_afu #( .NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS) @@ -108,6 +106,9 @@ module ccip_std_afu #( .clk (clk), .reset (reset_T1), + .cp2af_sRxPort (cp2af_sRx_T1), + .af2cp_sTxPort (af2cp_sTx_T0), + .avs_writedata (avs_writedata), .avs_readdata (avs_readdata), .avs_address (avs_address), @@ -116,52 +117,7 @@ module ccip_std_afu #( .avs_read (avs_read), .avs_byteenable (avs_byteenable), .avs_burstcount (avs_burstcount), - .avs_readdatavalid (avs_readdatavalid), - .mem_bank_select (mem_bank_select), - - .cp2af_sRxPort (cp2af_sRx_T1), - .af2cp_sTxPort (af2cp_sTx_T0) - ); - - // - // Export the local memory interface signals as vectors so that bank - // selection can use array syntax. 
- // - logic avs_waitrequest_v[NUM_LOCAL_MEM_BANKS]; - t_local_mem_data avs_readdata_v[NUM_LOCAL_MEM_BANKS]; - logic avs_readdatavalid_v[NUM_LOCAL_MEM_BANKS]; - - genvar b; - generate - for (b = 0; b < NUM_LOCAL_MEM_BANKS; b = b + 1) - begin : lmb - always_comb - begin - // Local memory to AFU signals - avs_waitrequest_v[b] = local_mem[b].waitrequest; - avs_readdata_v[b] = local_mem[b].readdata; - avs_readdatavalid_v[b] = local_mem[b].readdatavalid; - - // Replicate address and write data to all banks. Only - // the request signals have to be bank-specific. - local_mem[b].burstcount = avs_burstcount; - local_mem[b].writedata = avs_writedata; - local_mem[b].address = avs_address; - local_mem[b].byteenable = avs_byteenable; - - // Request a write to this bank? - local_mem[b].write = avs_write && - ($bits(mem_bank_select)'(b) == mem_bank_select); - - // Request a read from this bank? - local_mem[b].read = avs_read && - ($bits(mem_bank_select)'(b) == mem_bank_select); - end - end - endgenerate - - assign avs_waitrequest = avs_waitrequest_v[mem_bank_select]; - assign avs_readdata = avs_readdata_v[mem_bank_select]; - assign avs_readdatavalid = avs_readdatavalid_v[mem_bank_select]; + .avs_readdatavalid (avs_readdatavalid) + ); endmodule diff --git a/hw/rtl/afu/vortex_afu.sv b/hw/rtl/afu/vortex_afu.sv index 4d6cae37..19e694ed 100644 --- a/hw/rtl/afu/vortex_afu.sv +++ b/hw/rtl/afu/vortex_afu.sv @@ -1,13 +1,18 @@ -`include "VX_define.vh" -`ifndef NOPAE -`include "afu_json_info.vh" -`else +`include "VX_platform.vh" +`ifdef NOPAE +`IGNORE_WARNINGS_BEGIN `include "vortex_afu.vh" +`IGNORE_WARNINGS_END +`else +`include "afu_json_info.vh" `endif + /* verilator lint_off IMPORTSTAR */ import ccip_if_pkg::*; import local_mem_cfg_pkg::*; -/* verilator lint_on IMPORTSTAR */ +/* verilator lint_on IMPORTSTAR */ + +`include "VX_define.vh" module vortex_afu #( parameter NUM_LOCAL_MEM_BANKS = 2 @@ -21,30 +26,32 @@ module vortex_afu #( output t_if_ccip_Tx af2cp_sTxPort, // Avalon signals for 
local memory access - output t_local_mem_data avs_writedata, - input t_local_mem_data avs_readdata, - output t_local_mem_addr avs_address, - input logic avs_waitrequest, - output logic avs_write, - output logic avs_read, - output t_local_mem_byte_mask avs_byteenable, - output t_local_mem_burst_cnt avs_burstcount, - input avs_readdatavalid, - - output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select + output t_local_mem_data avs_writedata [NUM_LOCAL_MEM_BANKS], + input t_local_mem_data avs_readdata [NUM_LOCAL_MEM_BANKS], + output t_local_mem_addr avs_address [NUM_LOCAL_MEM_BANKS], + input logic avs_waitrequest [NUM_LOCAL_MEM_BANKS], + output logic avs_write [NUM_LOCAL_MEM_BANKS], + output logic avs_read [NUM_LOCAL_MEM_BANKS], + output t_local_mem_byte_mask avs_byteenable [NUM_LOCAL_MEM_BANKS], + output t_local_mem_burst_cnt avs_burstcount [NUM_LOCAL_MEM_BANKS], + input avs_readdatavalid [NUM_LOCAL_MEM_BANKS] ); localparam RESET_DELAY = 3; -localparam DRAM_ADDR_WIDTH = $bits(t_local_mem_addr); -localparam DRAM_LINE_WIDTH = $bits(t_local_mem_data); -localparam DRAM_LINE_LW = $clog2(DRAM_LINE_WIDTH); +localparam LMEM_LINE_WIDTH = $bits(t_local_mem_data); +localparam LMEM_ADDR_WIDTH = $bits(t_local_mem_addr); +localparam LMEM_BURST_CTRW = $bits(t_local_mem_burst_cnt); -localparam VX_DRAM_LINE_LW = $clog2(`VX_DRAM_LINE_WIDTH); -localparam VX_DRAM_LINE_IDX = (DRAM_LINE_LW - VX_DRAM_LINE_LW); +localparam CCI_LINE_WIDTH = $bits(t_ccip_clData); +localparam CCI_LINE_SIZE = CCI_LINE_WIDTH / 8; +localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_LINE_WIDTH / 8); localparam AVS_RD_QUEUE_SIZE = 16; -localparam AVS_REQ_TAGW = `VX_DRAM_TAG_WIDTH + VX_DRAM_LINE_IDX; +localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, `VX_MEM_TAG_WIDTH + $clog2(LMEM_LINE_WIDTH) - $clog2(`VX_MEM_LINE_WIDTH)); +localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, CCI_ADDR_WIDTH + $clog2(LMEM_LINE_WIDTH) - $clog2(CCI_LINE_WIDTH)); +localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI); + 
localparam CCI_RD_WINDOW_SIZE = 8; localparam CCI_RD_QUEUE_SIZE = 2 * CCI_RD_WINDOW_SIZE; @@ -74,7 +81,7 @@ localparam MMIO_CSR_DATA = `AFU_IMAGE_MMIO_CSR_DATA; localparam MMIO_CSR_READ = `AFU_IMAGE_MMIO_CSR_READ; localparam CCI_RD_RQ_TAGW = $clog2(CCI_RD_WINDOW_SIZE); -localparam CCI_RD_RQ_DATAW = $bits(t_ccip_clData) + CCI_RD_RQ_TAGW; +localparam CCI_RD_RQ_DATAW = CCI_LINE_WIDTH + CCI_RD_RQ_TAGW; localparam STATE_IDLE = 0; localparam STATE_READ = 1; @@ -96,18 +103,18 @@ reg [STATE_WIDTH-1:0] state; // Vortex ports /////////////////////////////////////////////////////////////// -wire vx_dram_req_valid; -wire vx_dram_req_rw; -wire [`VX_DRAM_BYTEEN_WIDTH-1:0] vx_dram_req_byteen; -wire [`VX_DRAM_ADDR_WIDTH-1:0] vx_dram_req_addr; -wire [`VX_DRAM_LINE_WIDTH-1:0] vx_dram_req_data; -wire [`VX_DRAM_TAG_WIDTH-1:0] vx_dram_req_tag; -wire vx_dram_req_ready; +wire vx_mem_req_valid; +wire vx_mem_req_rw; +wire [`VX_MEM_BYTEEN_WIDTH-1:0] vx_mem_req_byteen; +wire [`VX_MEM_ADDR_WIDTH-1:0] vx_mem_req_addr; +wire [`VX_MEM_LINE_WIDTH-1:0] vx_mem_req_data; +wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_req_tag; +wire vx_mem_req_ready; -wire vx_dram_rsp_valid; -wire [`VX_DRAM_LINE_WIDTH-1:0] vx_dram_rsp_data; -wire [`VX_DRAM_TAG_WIDTH-1:0] vx_dram_rsp_tag; -wire vx_dram_rsp_ready; +wire vx_mem_rsp_valid; +wire [`VX_MEM_LINE_WIDTH-1:0] vx_mem_rsp_data; +wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_rsp_tag; +wire vx_mem_rsp_ready; wire vx_csr_io_req_valid; wire [`VX_CSR_ID_WIDTH-1:0] vx_csr_io_req_coreid; @@ -123,13 +130,13 @@ wire vx_csr_io_rsp_ready; wire vx_busy; reg vx_reset; -reg vx_dram_en; +reg vx_mem_en; // CMD variables ////////////////////////////////////////////////////////////// t_ccip_clAddr cmd_io_addr; -reg [DRAM_ADDR_WIDTH-1:0] cmd_mem_addr; -reg [DRAM_ADDR_WIDTH-1:0] cmd_data_size; +reg [CCI_ADDR_WIDTH-1:0] cmd_mem_addr; +reg [CCI_ADDR_WIDTH-1:0] cmd_data_size; `ifdef SCOPE wire [63:0] cmd_scope_rdata; @@ -216,9 +223,9 @@ always @(posedge clk) begin `endif end MMIO_MEM_ADDR: begin - 
cmd_mem_addr <= t_local_mem_addr'(cp2af_sRxPort.c0.data); + cmd_mem_addr <= $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data); `ifdef DBG_PRINT_OPAE - $display("%t: MMIO_MEM_ADDR: addr=%0h, data=0x%0h", $time, mmio_hdr.address, t_local_mem_addr'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_MEM_ADDR: addr=%0h, data=0x%0h", $time, mmio_hdr.address, $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data)); `endif end MMIO_DATA_SIZE: begin @@ -335,7 +342,7 @@ always @(posedge clk) begin if (reset) begin state <= STATE_IDLE; vx_reset <= 0; - vx_dram_en <= 0; + vx_mem_en <= 0; end else begin case (state) STATE_IDLE: begin @@ -399,14 +406,14 @@ always @(posedge clk) begin // vortex reset cycles if (vx_reset_ctr == $bits(vx_reset_ctr)'(RESET_DELAY)) begin vx_reset <= 0; - vx_dram_en <= 1; + vx_mem_en <= 1; state <= STATE_RUN; end end STATE_RUN: begin if (cmd_run_done) begin - vx_dram_en <= 0; + vx_mem_en <= 0; state <= STATE_IDLE; `ifdef DBG_PRINT_OPAE $display("%t: STATE IDLE", $time); @@ -442,187 +449,251 @@ end // AVS Controller ///////////////////////////////////////////////////////////// -wire dram_req_valid; -wire dram_req_rw; -t_local_mem_byte_mask dram_req_byteen; -t_local_mem_addr dram_req_addr; -t_local_mem_data dram_req_data; -wire [AVS_REQ_TAGW:0] dram_req_tag; -wire dram_req_ready; - -wire dram_rsp_valid; -t_local_mem_data dram_rsp_data; -wire [AVS_REQ_TAGW:0] dram_rsp_tag; -wire dram_rsp_ready; - -wire cci_dram_req_valid; -wire cci_dram_req_rw; -t_local_mem_byte_mask cci_dram_req_byteen; -t_local_mem_addr cci_dram_req_addr; -t_local_mem_data cci_dram_req_data; -wire [AVS_REQ_TAGW-1:0] cci_dram_req_tag; -wire cci_dram_req_ready; - -wire cci_dram_rsp_valid; -t_local_mem_data cci_dram_rsp_data; -wire [AVS_REQ_TAGW-1:0] cci_dram_rsp_tag; -wire cci_dram_rsp_ready; - -wire vx_dram_req_valid_qual; -t_local_mem_addr vx_dram_req_addr_qual; -t_local_mem_byte_mask vx_dram_req_byteen_qual; -t_local_mem_data vx_dram_req_data_qual; -wire [AVS_REQ_TAGW-1:0] vx_dram_req_tag_qual; - -wire [(1 
<< VX_DRAM_LINE_IDX)-1:0][`VX_DRAM_LINE_WIDTH-1:0] vx_dram_rsp_data_unqual; -wire [AVS_REQ_TAGW-1:0] vx_dram_rsp_tag_unqual; - -wire cci_dram_rd_req_valid, cci_dram_wr_req_valid; -wire [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_addr, cci_dram_wr_req_addr; +wire cci_mem_rd_req_valid; +wire cci_mem_wr_req_valid; wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_dout; -//-- +wire cci_mem_req_valid; +wire cci_mem_req_rw; +wire [CCI_ADDR_WIDTH-1:0] cci_mem_req_addr; +wire [CCI_ADDR_WIDTH-1:0] cci_mem_req_tag; +wire cci_mem_req_ready; -assign cci_dram_req_valid = (CMD_MEM_WRITE == state) ? cci_dram_wr_req_valid : cci_dram_rd_req_valid; -assign cci_dram_req_addr = (CMD_MEM_WRITE == state) ? cci_dram_wr_req_addr : cci_dram_rd_req_addr; -assign cci_dram_req_rw = (CMD_MEM_WRITE == state); -assign cci_dram_req_byteen = {64{1'b1}}; -assign cci_dram_req_data = cci_rdq_dout[CCI_RD_RQ_DATAW-1:CCI_RD_RQ_TAGW]; -assign cci_dram_req_tag = AVS_REQ_TAGW'(0); - -`UNUSED_VAR (cci_dram_rsp_tag) +wire cci_mem_rsp_valid; +wire [CCI_LINE_WIDTH-1:0] cci_mem_rsp_data; +wire [CCI_ADDR_WIDTH-1:0] cci_mem_rsp_tag; +wire cci_mem_rsp_ready; //-- -assign vx_dram_req_valid_qual = vx_dram_req_valid && vx_dram_en; +wire cci_mem_req_arb_valid; +wire cci_mem_req_arb_rw; +t_local_mem_byte_mask cci_mem_req_arb_byteen; +t_local_mem_addr cci_mem_req_arb_addr; +t_local_mem_data cci_mem_req_arb_data; +wire [AVS_REQ_TAGW-1:0] cci_mem_req_arb_tag; +wire cci_mem_req_arb_ready; -assign vx_dram_req_addr_qual = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH]; +wire cci_mem_rsp_arb_valid; +t_local_mem_data cci_mem_rsp_arb_data; +wire [AVS_REQ_TAGW-1:0] cci_mem_rsp_arb_tag; +wire cci_mem_rsp_arb_ready; -if (`VX_DRAM_LINE_WIDTH != DRAM_LINE_WIDTH) begin - wire [VX_DRAM_LINE_IDX-1:0] vx_dram_req_idx = vx_dram_req_addr[VX_DRAM_LINE_IDX-1:0]; - wire [VX_DRAM_LINE_IDX-1:0] vx_dram_rsp_idx = vx_dram_rsp_tag_unqual[VX_DRAM_LINE_IDX-1:0]; - assign vx_dram_req_byteen_qual = 64'(vx_dram_req_byteen) << 
(6'(vx_dram_req_addr[VX_DRAM_LINE_IDX-1:0]) << (VX_DRAM_LINE_LW-3)); - assign vx_dram_req_data_qual = DRAM_LINE_WIDTH'(vx_dram_req_data) << ((DRAM_LINE_LW'(vx_dram_req_idx)) << VX_DRAM_LINE_LW); - assign vx_dram_req_tag_qual = {vx_dram_req_tag, vx_dram_req_idx}; - assign vx_dram_rsp_data = vx_dram_rsp_data_unqual[vx_dram_rsp_idx]; -end else begin - assign vx_dram_req_byteen_qual = vx_dram_req_byteen; - assign vx_dram_req_tag_qual = vx_dram_req_tag; - assign vx_dram_req_data_qual = vx_dram_req_data; - assign vx_dram_rsp_data = vx_dram_rsp_data_unqual; -end +VX_to_mem #( + .SRC_DATA_WIDTH (CCI_LINE_WIDTH), + .DST_DATA_WIDTH (LMEM_LINE_WIDTH), + .SRC_ADDR_WIDTH (CCI_ADDR_WIDTH), + .DST_ADDR_WIDTH (LMEM_ADDR_WIDTH), + .SRC_TAG_WIDTH (CCI_ADDR_WIDTH), + .DST_TAG_WIDTH (AVS_REQ_TAGW) +) cci_to_mem ( + .clk (clk), + .reset (reset), -assign vx_dram_rsp_tag = vx_dram_rsp_tag_unqual[`VX_DRAM_TAG_WIDTH+VX_DRAM_LINE_IDX-1:VX_DRAM_LINE_IDX]; + .mem_req_valid_in (cci_mem_req_valid), + .mem_req_addr_in (cci_mem_req_addr), + .mem_req_rw_in (cci_mem_req_rw), + .mem_req_byteen_in ({CCI_LINE_SIZE{1'b1}}), + .mem_req_data_in (cci_rdq_dout[CCI_RD_RQ_DATAW-1:CCI_RD_RQ_TAGW]), + .mem_req_tag_in (cci_mem_req_tag), + .mem_req_ready_in (cci_mem_req_ready), + + .mem_req_valid_out (cci_mem_req_arb_valid), + .mem_req_addr_out (cci_mem_req_arb_addr), + .mem_req_rw_out (cci_mem_req_arb_rw), + .mem_req_byteen_out (cci_mem_req_arb_byteen), + .mem_req_data_out (cci_mem_req_arb_data), + .mem_req_tag_out (cci_mem_req_arb_tag), + .mem_req_ready_out (cci_mem_req_arb_ready), + + .mem_rsp_valid_in (cci_mem_rsp_arb_valid), + .mem_rsp_data_in (cci_mem_rsp_arb_data), + .mem_rsp_tag_in (cci_mem_rsp_arb_tag), + .mem_rsp_ready_in (cci_mem_rsp_arb_ready), + + .mem_rsp_valid_out (cci_mem_rsp_valid), + .mem_rsp_data_out (cci_mem_rsp_data), + .mem_rsp_tag_out (cci_mem_rsp_tag), + .mem_rsp_ready_out (cci_mem_rsp_ready) +); //-- +wire vx_mem_req_arb_valid; +wire vx_mem_req_arb_rw; +t_local_mem_byte_mask 
vx_mem_req_arb_byteen; +t_local_mem_addr vx_mem_req_arb_addr; +t_local_mem_data vx_mem_req_arb_data; +wire [AVS_REQ_TAGW-1:0] vx_mem_req_arb_tag; +wire vx_mem_req_arb_ready; + +wire vx_mem_rsp_arb_valid; +t_local_mem_data vx_mem_rsp_arb_data; +wire [AVS_REQ_TAGW-1:0] vx_mem_rsp_arb_tag; +wire vx_mem_rsp_arb_ready; + +VX_to_mem #( + .SRC_DATA_WIDTH (`VX_MEM_LINE_WIDTH), + .DST_DATA_WIDTH (LMEM_LINE_WIDTH), + .SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH), + .DST_ADDR_WIDTH (LMEM_ADDR_WIDTH), + .SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH), + .DST_TAG_WIDTH (AVS_REQ_TAGW) +) vx_to_mem ( + .clk (clk), + .reset (reset), + + .mem_req_valid_in (vx_mem_req_valid && vx_mem_en), + .mem_req_addr_in (vx_mem_req_addr), + .mem_req_rw_in (vx_mem_req_rw), + .mem_req_byteen_in (vx_mem_req_byteen), + .mem_req_data_in (vx_mem_req_data), + .mem_req_tag_in (vx_mem_req_tag), + .mem_req_ready_in (vx_mem_req_ready), + + .mem_req_valid_out (vx_mem_req_arb_valid), + .mem_req_addr_out (vx_mem_req_arb_addr), + .mem_req_rw_out (vx_mem_req_arb_rw), + .mem_req_byteen_out (vx_mem_req_arb_byteen), + .mem_req_data_out (vx_mem_req_arb_data), + .mem_req_tag_out (vx_mem_req_arb_tag), + .mem_req_ready_out (vx_mem_req_arb_ready), + + .mem_rsp_valid_in (vx_mem_rsp_arb_valid), + .mem_rsp_data_in (vx_mem_rsp_arb_data), + .mem_rsp_tag_in (vx_mem_rsp_arb_tag), + .mem_rsp_ready_in (vx_mem_rsp_arb_ready), + + .mem_rsp_valid_out (vx_mem_rsp_valid), + .mem_rsp_data_out (vx_mem_rsp_data), + .mem_rsp_tag_out (vx_mem_rsp_tag), + .mem_rsp_ready_out (vx_mem_rsp_ready) +); + +//-- + +wire mem_req_valid; +wire mem_req_rw; +t_local_mem_byte_mask mem_req_byteen; +t_local_mem_addr mem_req_addr; +t_local_mem_data mem_req_data; +wire [AVS_REQ_TAGW:0] mem_req_tag; +wire mem_req_ready; + +wire mem_rsp_valid; +t_local_mem_data mem_rsp_data; +wire [AVS_REQ_TAGW:0] mem_rsp_tag; +wire mem_rsp_ready; + VX_mem_arb #( .NUM_REQS (2), - .DATA_WIDTH ($bits(t_local_mem_data)), - .ADDR_WIDTH ($bits(t_local_mem_addr)), + .DATA_WIDTH (LMEM_LINE_WIDTH), + 
.ADDR_WIDTH (LMEM_ADDR_WIDTH), .TAG_IN_WIDTH (AVS_REQ_TAGW), .TAG_OUT_WIDTH (AVS_REQ_TAGW+1) -) dram_arb ( +) mem_arb ( .clk (clk), .reset (reset), // Source request - .req_valid_in ({cci_dram_req_valid, vx_dram_req_valid_qual}), - .req_rw_in ({cci_dram_req_rw, vx_dram_req_rw}), - .req_byteen_in ({cci_dram_req_byteen, vx_dram_req_byteen_qual}), - .req_addr_in ({cci_dram_req_addr, vx_dram_req_addr_qual}), - .req_data_in ({cci_dram_req_data, vx_dram_req_data_qual}), - .req_tag_in ({cci_dram_req_tag, vx_dram_req_tag_qual}), - .req_ready_in ({cci_dram_req_ready, vx_dram_req_ready}), + .req_valid_in ({cci_mem_req_arb_valid, vx_mem_req_arb_valid}), + .req_rw_in ({cci_mem_req_arb_rw, vx_mem_req_arb_rw}), + .req_byteen_in ({cci_mem_req_arb_byteen, vx_mem_req_arb_byteen}), + .req_addr_in ({cci_mem_req_arb_addr, vx_mem_req_arb_addr}), + .req_data_in ({cci_mem_req_arb_data, vx_mem_req_arb_data}), + .req_tag_in ({cci_mem_req_arb_tag, vx_mem_req_arb_tag}), + .req_ready_in ({cci_mem_req_arb_ready, vx_mem_req_arb_ready}), - // DRAM request - .req_valid_out (dram_req_valid), - .req_rw_out (dram_req_rw), - .req_byteen_out (dram_req_byteen), - .req_addr_out (dram_req_addr), - .req_data_out (dram_req_data), - .req_tag_out (dram_req_tag), - .req_ready_out (dram_req_ready), + // Memory request + .req_valid_out (mem_req_valid), + .req_rw_out (mem_req_rw), + .req_byteen_out (mem_req_byteen), + .req_addr_out (mem_req_addr), + .req_data_out (mem_req_data), + .req_tag_out (mem_req_tag), + .req_ready_out (mem_req_ready), // Source response - .rsp_valid_out ({cci_dram_rsp_valid, vx_dram_rsp_valid}), - .rsp_data_out ({cci_dram_rsp_data, vx_dram_rsp_data_unqual}), - .rsp_tag_out ({cci_dram_rsp_tag, vx_dram_rsp_tag_unqual}), - .rsp_ready_out ({cci_dram_rsp_ready, vx_dram_rsp_ready}), + .rsp_valid_out ({cci_mem_rsp_arb_valid, vx_mem_rsp_arb_valid}), + .rsp_data_out ({cci_mem_rsp_arb_data, vx_mem_rsp_arb_data}), + .rsp_tag_out ({cci_mem_rsp_arb_tag, vx_mem_rsp_arb_tag}), + .rsp_ready_out 
({cci_mem_rsp_arb_ready, vx_mem_rsp_arb_ready}), - // DRAM response - .rsp_valid_in (dram_rsp_valid), - .rsp_tag_in (dram_rsp_tag), - .rsp_data_in (dram_rsp_data), - .rsp_ready_in (dram_rsp_ready) + // Memory response + .rsp_valid_in (mem_rsp_valid), + .rsp_tag_in (mem_rsp_tag), + .rsp_data_in (mem_rsp_data), + .rsp_ready_in (mem_rsp_ready) ); //-- VX_avs_wrapper #( - .AVS_DATAW ($bits(t_local_mem_data)), - .AVS_ADDRW ($bits(t_local_mem_addr)), - .AVS_BURSTW ($bits(t_local_mem_burst_cnt)), - .AVS_BANKS (NUM_LOCAL_MEM_BANKS), - .REQ_TAGW (AVS_REQ_TAGW+1), - .RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE) + .NUM_BANKS (NUM_LOCAL_MEM_BANKS), + .AVS_DATA_WIDTH (LMEM_LINE_WIDTH), + .AVS_ADDR_WIDTH (LMEM_ADDR_WIDTH), + .AVS_BURST_WIDTH (LMEM_BURST_CTRW), + .AVS_BANKS (NUM_LOCAL_MEM_BANKS), + .REQ_TAG_WIDTH (AVS_REQ_TAGW + 1), + .RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE) ) avs_wrapper ( - .clk (clk), - .reset (reset), + .clk (clk), + .reset (reset), + + // Memory request + .mem_req_valid (mem_req_valid), + .mem_req_rw (mem_req_rw), + .mem_req_byteen (mem_req_byteen), + .mem_req_addr (mem_req_addr), + .mem_req_data (mem_req_data), + .mem_req_tag (mem_req_tag), + .mem_req_ready (mem_req_ready), + + // Memory response + .mem_rsp_valid (mem_rsp_valid), + .mem_rsp_data (mem_rsp_data), + .mem_rsp_tag (mem_rsp_tag), + .mem_rsp_ready (mem_rsp_ready), // AVS bus - .avs_writedata (avs_writedata), - .avs_readdata (avs_readdata), - .avs_address (avs_address), - .avs_waitrequest (avs_waitrequest), - .avs_write (avs_write), - .avs_read (avs_read), - .avs_byteenable (avs_byteenable), - .avs_burstcount (avs_burstcount), - .avs_readdatavalid (avs_readdatavalid), - .avs_bankselect (mem_bank_select), - - // DRAM request - .dram_req_valid (dram_req_valid), - .dram_req_rw (dram_req_rw), - .dram_req_byteen (dram_req_byteen), - .dram_req_addr (dram_req_addr), - .dram_req_data (dram_req_data), - .dram_req_tag (dram_req_tag), - .dram_req_ready (dram_req_ready), - - // DRAM response - .dram_rsp_valid (dram_rsp_valid), 
- .dram_rsp_data (dram_rsp_data), - .dram_rsp_tag (dram_rsp_tag), - .dram_rsp_ready (dram_rsp_ready) + .avs_writedata (avs_writedata), + .avs_readdata (avs_readdata), + .avs_address (avs_address), + .avs_waitrequest (avs_waitrequest), + .avs_write (avs_write), + .avs_read (avs_read), + .avs_byteenable (avs_byteenable), + .avs_burstcount (avs_burstcount), + .avs_readdatavalid(avs_readdatavalid) ); // CCI-P Read Request /////////////////////////////////////////////////////////// -reg [DRAM_ADDR_WIDTH-1:0] cci_dram_wr_req_ctr; -reg [DRAM_ADDR_WIDTH-1:0] cci_rd_req_ctr; -wire [DRAM_ADDR_WIDTH-1:0] cci_rd_req_ctr_next; -reg [DRAM_ADDR_WIDTH-1:0] cci_dram_wr_req_addr_unqual; -wire [CCI_RD_RQ_TAGW-1:0] cci_rd_req_tag, cci_rd_rsp_tag; +reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_ctr; +wire [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr; +reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr_unqual; +reg [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr; +wire [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr_next; +wire [CCI_RD_RQ_TAGW-1:0] cci_rd_req_tag; +wire [CCI_RD_RQ_TAGW-1:0] cci_rd_rsp_tag; reg [CCI_RD_RQ_TAGW-1:0] cci_rd_rsp_ctr; -t_ccip_clAddr cci_rd_req_addr; -reg cci_rd_req_enable, cci_rd_req_wait; +wire cci_rd_req_fire; +t_ccip_clAddr cci_rd_req_addr; +reg cci_rd_req_valid, cci_rd_req_wait; wire cci_rdq_push, cci_rdq_pop; wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_din; wire cci_rdq_empty; always @(*) begin + af2cp_sTxPort.c0.valid = cci_rd_req_fire; af2cp_sTxPort.c0.hdr = t_ccip_c0_ReqMemHdr'(0); af2cp_sTxPort.c0.hdr.address = cci_rd_req_addr; af2cp_sTxPort.c0.hdr.mdata = t_ccip_mdata'(cci_rd_req_tag); end -wire cci_dram_wr_req_fire = cci_dram_wr_req_valid && cci_dram_req_ready; - -wire cci_rd_req_fire = af2cp_sTxPort.c0.valid; +wire cci_mem_wr_req_fire = cci_mem_wr_req_valid && cci_mem_req_ready; wire cci_rd_rsp_fire = (STATE_WRITE == state) && cp2af_sRxPort.c0.rspValid @@ -631,10 +702,8 @@ wire cci_rd_rsp_fire = (STATE_WRITE == state) assign cci_rd_req_tag = CCI_RD_RQ_TAGW'(cci_rd_req_ctr); assign cci_rd_rsp_tag 
= CCI_RD_RQ_TAGW'(cp2af_sRxPort.c0.hdr.mdata); -assign cci_rd_req_ctr_next = cci_rd_req_ctr + DRAM_ADDR_WIDTH'(cci_rd_req_fire ? 1 : 0); - -assign cci_rdq_pop = cci_dram_wr_req_fire; assign cci_rdq_push = cci_rd_rsp_fire; +assign cci_rdq_pop = cci_mem_wr_req_fire; assign cci_rdq_din = {cp2af_sRxPort.c0.data, cci_rd_rsp_tag}; wire [$clog2(CCI_RD_QUEUE_SIZE+1)-1:0] cci_pending_reads; @@ -646,79 +715,80 @@ VX_pending_size #( .reset (reset), .push (cci_rd_req_fire), .pop (cci_rdq_pop), - `UNUSED_PIN (empty), .full (cci_pending_reads_full), - .size (cci_pending_reads) + .size (cci_pending_reads), + `UNUSED_PIN (empty) ); `UNUSED_VAR (cci_pending_reads) -assign cci_dram_wr_req_valid = !cci_rdq_empty; +assign cci_rd_req_ctr_next = cci_rd_req_ctr + CCI_ADDR_WIDTH'(cci_rd_req_fire ? 1 : 0); -assign cci_dram_wr_req_addr = cci_dram_wr_req_addr_unqual + (DRAM_ADDR_WIDTH'(CCI_RD_RQ_TAGW'(cci_rdq_dout))); +assign cci_rd_req_fire = cci_rd_req_valid && !(cci_rd_req_wait || cci_pending_reads_full); + +assign cci_mem_wr_req_valid = !cci_rdq_empty; + +assign cci_mem_wr_req_addr = cci_mem_wr_req_addr_unqual + (CCI_ADDR_WIDTH'(CCI_RD_RQ_TAGW'(cci_rdq_dout))); -assign af2cp_sTxPort.c0.valid = cci_rd_req_enable && !cci_rd_req_wait; - -assign cmd_write_done = (cci_dram_wr_req_ctr == cmd_data_size); +assign cmd_write_done = (cci_mem_wr_req_ctr == cmd_data_size); // Send read requests to CCI always @(posedge clk) begin if (reset) begin - cci_rd_req_addr <= 0; - cci_rd_req_ctr <= 0; - cci_rd_rsp_ctr <= 0; - cci_rd_req_enable <= 0; - cci_rd_req_wait <= 0; - cci_dram_wr_req_ctr <= 0; - cci_dram_wr_req_addr_unqual <= 0; - end - else begin + cci_rd_req_valid <= 0; + cci_rd_req_wait <= 0; + end else begin if ((STATE_IDLE == state) && (CMD_MEM_WRITE == cmd_type)) begin - cci_rd_req_addr <= cmd_io_addr; - cci_rd_req_ctr <= 0; - cci_rd_rsp_ctr <= 0; - cci_rd_req_enable <= (cmd_data_size != 0); - cci_rd_req_wait <= 0; - cci_dram_wr_req_ctr <= 0; - cci_dram_wr_req_addr_unqual <= cmd_mem_addr; + 
cci_rd_req_valid <= (cmd_data_size != 0); + cci_rd_req_wait <= 0; end - cci_rd_req_enable <= (STATE_WRITE == state) - && (cci_rd_req_ctr_next != cmd_data_size) - && !cci_pending_reads_full - && !cp2af_sRxPort.c0TxAlmFull; + cci_rd_req_valid <= (STATE_WRITE == state) + && (cci_rd_req_ctr_next != cmd_data_size) + && !cp2af_sRxPort.c0TxAlmFull; - if (cci_rd_req_fire) begin - cci_rd_req_addr <= cci_rd_req_addr + 1; - cci_rd_req_ctr <= cci_rd_req_ctr_next; - if (cci_rd_req_tag == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin - cci_rd_req_wait <= 1; // end current request batch - end - `ifdef DBG_PRINT_OPAE - $display("%t: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr_next), cci_pending_reads); - `endif + if (cci_rd_req_fire && (cci_rd_req_tag == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin + cci_rd_req_wait <= 1; // end current request batch end - if (cci_rd_rsp_fire) begin - cci_rd_rsp_ctr <= cci_rd_rsp_ctr + CCI_RD_RQ_TAGW'(1); - if (cci_rd_rsp_ctr == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin - cci_rd_req_wait <= 0; // restart new request batch - end - `ifdef DBG_PRINT_OPAE - $display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data); - `endif - end + if (cci_rd_rsp_fire && (cci_rd_rsp_ctr == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin + cci_rd_req_wait <= 0; // begin new request batch + end + end - /*if (cci_rdq_pop) begin - `ifdef DBG_PRINT_OPAE - $display("%t: CCI Rd Queue Pop: pending=%0d", $time, cci_pending_reads); - `endif - end*/ + if ((STATE_IDLE == state) + && (CMD_MEM_WRITE == cmd_type)) begin + cci_rd_req_addr <= cmd_io_addr; + cci_rd_req_ctr <= 0; + cci_rd_rsp_ctr <= 0; + cci_mem_wr_req_ctr <= 0; + cci_mem_wr_req_addr_unqual <= cmd_mem_addr; + end - if (cci_dram_wr_req_fire) begin - cci_dram_wr_req_addr_unqual <= cci_dram_wr_req_addr_unqual + ((CCI_RD_RQ_TAGW'(cci_dram_wr_req_ctr) == 
CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) ? DRAM_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE) : DRAM_ADDR_WIDTH'(0)); - cci_dram_wr_req_ctr <= cci_dram_wr_req_ctr + DRAM_ADDR_WIDTH'(1); - end + if (cci_rd_req_fire) begin + cci_rd_req_addr <= cci_rd_req_addr + 1; + cci_rd_req_ctr <= cci_rd_req_ctr + 1; + `ifdef DBG_PRINT_OPAE + $display("%t: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads); + `endif + end + + if (cci_rd_rsp_fire) begin + cci_rd_rsp_ctr <= cci_rd_rsp_ctr + CCI_RD_RQ_TAGW'(1); + `ifdef DBG_PRINT_OPAE + $display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data); + `endif + end + + if (cci_rdq_pop) begin + `ifdef DBG_PRINT_OPAE + $display("%t: CCI Rd Queue Pop: pending=%0d", $time, cci_pending_reads); + `endif + end + + if (cci_mem_wr_req_fire) begin + cci_mem_wr_req_addr_unqual <= cci_mem_wr_req_addr_unqual + ((CCI_RD_RQ_TAGW'(cci_mem_wr_req_ctr) == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) ? 
CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE) : CCI_ADDR_WIDTH'(0)); + cci_mem_wr_req_ctr <= cci_mem_wr_req_ctr + CCI_ADDR_WIDTH'(1); end end @@ -761,22 +831,24 @@ VX_fifo_queue #( // CCI-P Write Request ////////////////////////////////////////////////////////// -reg [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_ctr; -reg [DRAM_ADDR_WIDTH-1:0] cci_wr_req_ctr; -reg [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_addr_r; +reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_ctr; +reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_addr; +reg [CCI_ADDR_WIDTH-1:0] cci_wr_req_ctr; + +reg cci_wr_req_fire; t_ccip_clAddr cci_wr_req_addr; +t_ccip_clData cci_wr_req_data; always @(*) begin + af2cp_sTxPort.c1.valid = cci_wr_req_fire; af2cp_sTxPort.c1.hdr = t_ccip_c1_ReqMemHdr'(0); - af2cp_sTxPort.c1.hdr.address = cci_wr_req_addr; af2cp_sTxPort.c1.hdr.sop = 1; // single line write mode - af2cp_sTxPort.c1.data = t_ccip_clData'(cci_dram_rsp_data); + af2cp_sTxPort.c1.hdr.address = cci_wr_req_addr; + af2cp_sTxPort.c1.data = cci_wr_req_data; end -wire cci_dram_rd_req_fire = cci_dram_rd_req_valid && cci_dram_req_ready; -wire cci_dram_rd_rsp_fire = cci_dram_rsp_valid && cci_dram_rsp_ready; - -wire cci_wr_req_fire = cci_dram_rd_rsp_fire; +wire cci_mem_rd_req_fire = cci_mem_rd_req_valid && cci_mem_req_ready; +wire cci_mem_rd_rsp_fire = cci_mem_rsp_valid && cci_mem_rsp_ready; wire cci_wr_rsp_fire = (STATE_READ == state) && cp2af_sRxPort.c1.rspValid @@ -785,12 +857,13 @@ wire cci_wr_rsp_fire = (STATE_READ == state) wire [$clog2(CCI_RW_PENDING_SIZE+1)-1:0] cci_pending_writes; wire cci_pending_writes_empty; wire cci_pending_writes_full; + VX_pending_size #( .SIZE (CCI_RW_PENDING_SIZE) ) cci_wr_pending_size ( .clk (clk), .reset (reset), - .push (cci_wr_req_fire), + .push (cci_mem_rd_rsp_fire), .pop (cci_wr_rsp_fire), .empty (cci_pending_writes_empty), .full (cci_pending_writes_full), @@ -798,54 +871,61 @@ VX_pending_size #( ); `UNUSED_VAR (cci_pending_writes) -assign cci_dram_rd_req_valid = (cci_dram_rd_req_ctr != 0); -assign 
cci_dram_rd_req_addr = cci_dram_rd_req_addr_r; +assign cci_mem_rd_req_valid = (STATE_READ == state) + && (cci_mem_rd_req_ctr != cmd_data_size); -assign af2cp_sTxPort.c1.valid = cci_dram_rd_rsp_fire; -assign cci_dram_rsp_ready = !cp2af_sRxPort.c1TxAlmFull && !cci_pending_writes_full; +assign cci_mem_rsp_ready = !cp2af_sRxPort.c1TxAlmFull + && !cci_pending_writes_full; -assign cmd_read_done = (0 == cci_wr_req_ctr) && cci_pending_writes_empty; +assign cmd_read_done = (0 == cci_wr_req_ctr) + && cci_pending_writes_empty; // Send write requests to CCI always @(posedge clk) begin if (reset) begin - cci_wr_req_addr <= 0; - cci_wr_req_ctr <= 0; - cci_dram_rd_req_ctr <= 0; - cci_dram_rd_req_addr_r <= 0; + cci_wr_req_fire <= 0; + end else begin + cci_wr_req_fire <= cci_mem_rd_rsp_fire; end - else begin - if ((STATE_IDLE == state) - && (CMD_MEM_READ == cmd_type)) begin - cci_wr_req_addr <= cmd_io_addr; - cci_wr_req_ctr <= cmd_data_size; - cci_dram_rd_req_ctr <= cmd_data_size; - cci_dram_rd_req_addr_r <= cmd_mem_addr; - end + + if ((STATE_IDLE == state) + && (CMD_MEM_READ == cmd_type)) begin + cci_mem_rd_req_ctr <= 0; + cci_mem_rd_req_addr <= cmd_mem_addr; + cci_wr_req_ctr <= cmd_data_size; + end - if (cci_wr_req_fire) begin - assert(cci_wr_req_ctr != 0); - cci_wr_req_addr <= cci_wr_req_addr + t_ccip_clAddr'(1); - cci_wr_req_ctr <= cci_wr_req_ctr - DRAM_ADDR_WIDTH'(1); - `ifdef DBG_PRINT_OPAE - $display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data); - `endif - end + if (cci_mem_rd_req_fire) begin + cci_mem_rd_req_addr <= cci_mem_rd_req_addr + CCI_ADDR_WIDTH'(1); + cci_mem_rd_req_ctr <= cci_mem_rd_req_ctr + CCI_ADDR_WIDTH'(1); + end - /*`ifdef DBG_PRINT_OPAE - if (cci_wr_rsp_fire) begin - $display("%t: CCI Wr Rsp: pending=%0d", $time, cci_pending_writes); - end - `endif*/ + cci_wr_req_addr <= cmd_io_addr + t_ccip_clAddr'(cci_mem_rsp_tag); + cci_wr_req_data <= 
t_ccip_clData'(cci_mem_rsp_data); - if (cci_dram_rd_req_fire) begin - cci_dram_rd_req_addr_r <= cci_dram_rd_req_addr_r + DRAM_ADDR_WIDTH'(1); - cci_dram_rd_req_ctr <= cci_dram_rd_req_ctr - DRAM_ADDR_WIDTH'(1); - end + if (cci_wr_req_fire) begin + assert(cci_wr_req_ctr != 0); + cci_wr_req_ctr <= cci_wr_req_ctr - CCI_ADDR_WIDTH'(1); + `ifdef DBG_PRINT_OPAE + $display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data); + `endif + end + + if (cci_wr_rsp_fire) begin + `ifdef DBG_PRINT_OPAE + $display("%t: CCI Wr Rsp: pending=%0d", $time, cci_pending_writes); + `endif end end +//-- + +assign cci_mem_req_rw = (CMD_MEM_WRITE == state); +assign cci_mem_req_valid = cci_mem_req_rw ? cci_mem_wr_req_valid : cci_mem_rd_req_valid; +assign cci_mem_req_addr = cci_mem_req_rw ? cci_mem_wr_req_addr : cci_mem_rd_req_addr; +assign cci_mem_req_tag = cci_mem_req_rw ? cci_mem_wr_req_ctr : cci_mem_rd_req_ctr; + // CSRs /////////////////////////////////////////////////////////////////////// reg csr_io_req_sent; @@ -890,20 +970,20 @@ Vortex #() vortex ( .clk (clk), .reset (reset | vx_reset), - // DRAM request - .dram_req_valid (vx_dram_req_valid), - .dram_req_rw (vx_dram_req_rw), - .dram_req_byteen(vx_dram_req_byteen), - .dram_req_addr (vx_dram_req_addr), - .dram_req_data (vx_dram_req_data), - .dram_req_tag (vx_dram_req_tag), - .dram_req_ready (vx_dram_req_ready), + // Memory request + .mem_req_valid (vx_mem_req_valid), + .mem_req_rw (vx_mem_req_rw), + .mem_req_byteen (vx_mem_req_byteen), + .mem_req_addr (vx_mem_req_addr), + .mem_req_data (vx_mem_req_data), + .mem_req_tag (vx_mem_req_tag), + .mem_req_ready (vx_mem_req_ready), - // DRAM response - .dram_rsp_valid (vx_dram_rsp_valid), - .dram_rsp_data (vx_dram_rsp_data), - .dram_rsp_tag (vx_dram_rsp_tag), - .dram_rsp_ready (vx_dram_rsp_ready), + // Memory response + .mem_rsp_valid (vx_mem_rsp_valid), + .mem_rsp_data (vx_mem_rsp_data), + 
.mem_rsp_tag (vx_mem_rsp_tag), + .mem_rsp_ready (vx_mem_rsp_ready), // CSR Request .csr_req_valid (vx_csr_io_req_valid), @@ -944,16 +1024,15 @@ Vortex #() vortex ( `SCOPE_ASSIGN (cci_sTxPort_c2_mmioRdValid, af2cp_sTxPort.c2.mmioRdValid); `SCOPE_ASSIGN (cci_sRxPort_c0TxAlmFull, cp2af_sRxPort.c0TxAlmFull); `SCOPE_ASSIGN (cci_sRxPort_c1TxAlmFull, cp2af_sRxPort.c1TxAlmFull); -`SCOPE_ASSIGN (avs_address, avs_address); -`SCOPE_ASSIGN (avs_waitrequest, avs_waitrequest); -`SCOPE_ASSIGN (avs_write_fire, avs_write && !avs_waitrequest); -`SCOPE_ASSIGN (avs_read_fire, avs_read && !avs_waitrequest); -`SCOPE_ASSIGN (avs_byteenable, avs_byteenable); -`SCOPE_ASSIGN (avs_burstcount, avs_burstcount); -`SCOPE_ASSIGN (avs_readdatavalid, avs_readdatavalid); -`SCOPE_ASSIGN (mem_bank_select, mem_bank_select); -`SCOPE_ASSIGN (cci_dram_rd_req_ctr, cci_dram_rd_req_ctr); -`SCOPE_ASSIGN (cci_dram_wr_req_ctr, cci_dram_wr_req_ctr); +`SCOPE_ASSIGN (avs_address, avs_address[0]); +`SCOPE_ASSIGN (avs_waitrequest, avs_waitrequest[0]); +`SCOPE_ASSIGN (avs_write_fire, avs_write[0] && !avs_waitrequest[0]); +`SCOPE_ASSIGN (avs_read_fire, avs_read[0] && !avs_waitrequest[0]); +`SCOPE_ASSIGN (avs_byteenable, avs_byteenable[0]); +`SCOPE_ASSIGN (avs_burstcount, avs_burstcount[0]); +`SCOPE_ASSIGN (avs_readdatavalid, avs_readdatavalid[0]); +`SCOPE_ASSIGN (cci_mem_rd_req_ctr, cci_mem_rd_req_ctr); +`SCOPE_ASSIGN (cci_mem_wr_req_ctr, cci_mem_wr_req_ctr); `SCOPE_ASSIGN (cci_rd_req_ctr, cci_rd_req_ctr); `SCOPE_ASSIGN (cci_rd_rsp_ctr, cci_rd_rsp_ctr); `SCOPE_ASSIGN (cci_wr_req_ctr, cci_wr_req_ctr); @@ -964,11 +1043,11 @@ Vortex #() vortex ( `SCOPE_ASSIGN (cci_pending_reads_full, cci_pending_reads_full); `SCOPE_ASSIGN (cci_pending_writes_empty, cci_pending_writes_empty); `SCOPE_ASSIGN (cci_pending_writes_full, cci_pending_writes_full); -`SCOPE_ASSIGN (afu_dram_req_fire, (dram_req_valid && dram_req_ready)); -`SCOPE_ASSIGN (afu_dram_req_addr, dram_req_addr); -`SCOPE_ASSIGN (afu_dram_req_tag, dram_req_tag); 
-`SCOPE_ASSIGN (afu_dram_rsp_fire, (dram_rsp_valid && dram_rsp_ready)); -`SCOPE_ASSIGN (afu_dram_rsp_tag, dram_rsp_tag); +`SCOPE_ASSIGN (afu_mem_req_fire, (mem_req_valid && mem_req_ready)); +`SCOPE_ASSIGN (afu_mem_req_addr, mem_req_addr); +`SCOPE_ASSIGN (afu_mem_req_tag, mem_req_tag); +`SCOPE_ASSIGN (afu_mem_rsp_fire, (mem_rsp_valid && mem_rsp_ready)); +`SCOPE_ASSIGN (afu_mem_rsp_tag, mem_rsp_tag); wire scope_changed = `SCOPE_TRIGGER; diff --git a/hw/rtl/afu/vortex_afu.vh b/hw/rtl/afu/vortex_afu.vh index 691d488c..c92a3a32 100644 --- a/hw/rtl/afu/vortex_afu.vh +++ b/hw/rtl/afu/vortex_afu.vh @@ -1,18 +1,27 @@ `ifndef __VORTEX_AFU__ `define __VORTEX_AFU__ -`IGNORE_WARNINGS_BEGIN `include "ccip_if_pkg.sv" -`IGNORE_WARNINGS_END `define PLATFORM_PROVIDES_LOCAL_MEMORY -`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH 26 -`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH 512 -`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4 -`IGNORE_WARNINGS_BEGIN +`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BANKS +`define PLATFORM_PARAM_LOCAL_MEMORY_BANKS 2 +`endif + +`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH +`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH 26 +`endif + +`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH +`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH 512 +`endif + +`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH +`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4 +`endif + `include "local_mem_cfg_pkg.sv" -`IGNORE_WARNINGS_END `define AFU_ACCEL_NAME "vortex_afu" `define AFU_ACCEL_UUID 128'h35f9452b_25c2_434c_93d5_6f8c60db361c diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index acf2d0cc..a781c681 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -22,8 +22,8 @@ module VX_bank #( parameter CREQ_SIZE = 1, // Miss Reserv Queue Knob parameter MSHR_SIZE = 1, - // DRAM Request Queue Size - parameter DREQ_SIZE = 1, + // Memory Request Queue Size + parameter MREQ_SIZE = 1, // Enable cache writeable parameter WRITE_ENABLE = 1, @@ -35,10 +35,7 @@ 
module VX_bank #( parameter CORE_TAG_ID_BITS = 0, // bank offset from beginning of index range - parameter BANK_ADDR_OFFSET = 0, - - // in-order DRAN - parameter IN_ORDER_DRAM = 0 + parameter BANK_ADDR_OFFSET = 0 ) ( `SCOPE_IO_VX_bank @@ -71,19 +68,19 @@ module VX_bank #( output wire [CORE_TAG_WIDTH-1:0] core_rsp_tag, input wire core_rsp_ready, - // DRAM request - output wire dram_req_valid, - output wire dram_req_rw, - output wire [CACHE_LINE_SIZE-1:0] dram_req_byteen, - output wire [`LINE_ADDR_WIDTH-1:0] dram_req_addr, - output wire [`CACHE_LINE_WIDTH-1:0] dram_req_data, - input wire dram_req_ready, + // Memory request + output wire mem_req_valid, + output wire mem_req_rw, + output wire [CACHE_LINE_SIZE-1:0] mem_req_byteen, + output wire [`LINE_ADDR_WIDTH-1:0] mem_req_addr, + output wire [`CACHE_LINE_WIDTH-1:0] mem_req_data, + input wire mem_req_ready, - // DRAM response - input wire dram_rsp_valid, - input wire [`LINE_ADDR_WIDTH-1:0] dram_rsp_addr, - input wire [`CACHE_LINE_WIDTH-1:0] dram_rsp_data, - output wire dram_rsp_ready, + // Memory response + input wire mem_rsp_valid, + input wire [`LINE_ADDR_WIDTH-1:0] mem_rsp_addr, + input wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data, + output wire mem_rsp_ready, // flush input wire flush_enable, @@ -93,10 +90,10 @@ module VX_bank #( `UNUSED_PARAM (CORE_TAG_ID_BITS) `ifdef DBG_CACHE_REQ_INFO - /* verilator lint_off UNUSED */ +`IGNORE_WARNINGS_BEGIN wire [31:0] debug_pc_sel, debug_pc_st0, debug_pc_st1; wire [`NW_BITS-1:0] debug_wid_sel, debug_wid_st0, debug_wid_st1; - /* verilator lint_on UNUSED */ +`IGNORE_WARNINGS_END `endif wire creq_pop; @@ -167,8 +164,8 @@ module VX_bank #( wire is_flush_st0; wire crsq_in_valid, crsq_in_ready, crsq_in_stall; - wire dreq_alm_full; - wire drsq_pop; + wire mreq_alm_full; + wire mrsq_pop; wire crsq_in_fire = crsq_in_valid && crsq_in_ready; @@ -186,24 +183,24 @@ module VX_bank #( // determine which queue to pop next in priority order wire mshr_pop_unqual = mshr_valid - && !dreq_alm_full; 
// ensure DRAM request queue not full (deadlock prevention) - wire drsq_pop_unqual = !mshr_pop_unqual && dram_rsp_valid; - wire creq_pop_unqual = !mshr_pop_unqual && !drsq_pop_unqual && !creq_empty && !flush_enable; + && !mreq_alm_full; // ensure memory request queue not full (deadlock prevention) + wire mrsq_pop_unqual = !mshr_pop_unqual && mem_rsp_valid; + wire creq_pop_unqual = !mshr_pop_unqual && !mrsq_pop_unqual && !creq_empty && !flush_enable; wire is_miss_st1 = valid_st1 && (miss_st1 || force_miss_st1); assign mshr_pop = mshr_pop_unqual - && !(!IN_ORDER_DRAM && is_miss_st1 && is_mshr_st1) // do not schedule another mshr request if the previous one missed + && !(is_miss_st1 && is_mshr_st1) // do not schedule another mshr request if the previous one missed && !crsq_in_stall; // ensure core response ready - assign drsq_pop = drsq_pop_unqual + assign mrsq_pop = mrsq_pop_unqual && !crsq_in_stall; // ensure core response ready assign creq_pop = creq_pop_unqual - && !dreq_alm_full // ensure dram request ready + && !mreq_alm_full // ensure memory request ready && !mshr_alm_full // ensure mshr enqueue ready && !crsq_in_stall; // ensure core response ready - assign dram_rsp_ready = drsq_pop; + assign mem_rsp_ready = mrsq_pop; // we have a miss in mshr or entering it for the current address wire mshr_pending_sel = mshr_pending @@ -237,15 +234,7 @@ module VX_bank #( end else begin assign creq_line_data = creq_data; end - - wire [`LINE_ADDR_WIDTH-1:0] dram_rsp_addr_qual; - if (IN_ORDER_DRAM) begin - `UNUSED_VAR (dram_rsp_addr) - assign dram_rsp_addr_qual = mshr_addr; - end else begin - assign dram_rsp_addr_qual = dram_rsp_addr; - end - + VX_pipe_register #( .DATAW (1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH + 1 + 1), .RESETW (1) @@ -254,13 +243,13 @@ module VX_bank #( .reset (reset), .enable (!crsq_in_stall), .data_in ({ - flush_enable || mshr_pop || drsq_pop || creq_pop, + 
flush_enable || mshr_pop || mrsq_pop || creq_pop, flush_enable, mshr_pop_unqual, - drsq_pop_unqual || flush_enable, + mrsq_pop_unqual || flush_enable, mshr_pop_unqual ? 1'b0 : creq_rw, - mshr_pop_unqual ? mshr_addr : (dram_rsp_valid ? dram_rsp_addr_qual : (flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : creq_addr)), - dram_rsp_valid ? dram_rsp_data : creq_line_data, + mshr_pop_unqual ? mshr_addr : (mem_rsp_valid ? mem_rsp_addr : (flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : creq_addr)), + mem_rsp_valid ? mem_rsp_data : creq_line_data, mshr_pop_unqual ? mshr_wsel : creq_wsel, mshr_pop_unqual ? mshr_byteen : creq_byteen, mshr_pop_unqual ? mshr_tid : creq_tid, @@ -307,7 +296,7 @@ module VX_bank #( ); // redundant fills - wire is_redundant_fill_st0 = !IN_ORDER_DRAM && is_fill_st0 && tag_match_st0; + wire is_redundant_fill_st0 = is_fill_st0 && tag_match_st0; // we had a miss with prior request for the current address assign prev_miss_dep_st0 = is_miss_st1 && (addr_st0 == addr_st1); @@ -322,9 +311,9 @@ module VX_bank #( assign writeen_unqual_st0 = (WRITE_ENABLE && !is_fill_st0 && tag_match_st0 && mem_rw_st0) || (is_fill_st0 && !is_redundant_fill_st0); - assign incoming_fill_st0 = dram_rsp_valid && (addr_st0 == dram_rsp_addr_qual); + assign incoming_fill_st0 = mem_rsp_valid && (addr_st0 == mem_rsp_addr); - assign fill_req_unqual_st0 = !mem_rw_st0 && (!force_miss_st0 || (!IN_ORDER_DRAM && is_mshr_st0 && !prev_miss_dep_st0)); + assign fill_req_unqual_st0 = !mem_rw_st0 && (!force_miss_st0 || (is_mshr_st0 && !prev_miss_dep_st0)); VX_pipe_register #( .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH), @@ -351,12 +340,12 @@ module VX_bank #( wire mshr_push_st1 = !is_fill_st1 && !mem_rw_st1 && (miss_st1 || force_miss_st1); - wire incoming_fill_qual_st1 = (dram_rsp_valid && (addr_st1 == dram_rsp_addr_qual)) + wire incoming_fill_qual_st1 = (mem_rsp_valid && 
(addr_st1 == mem_rsp_addr)) || incoming_fill_st1; wire do_writeback_st1 = !is_fill_st1 && mem_rw_st1; - wire dreq_push_st1 = (miss_st1 && fill_req_unqual_st1 && !incoming_fill_qual_st1) + wire mreq_push_st1 = (miss_st1 && fill_req_unqual_st1 && !incoming_fill_qual_st1) || do_writeback_st1; wire [`WORDS_PER_LINE-1:0][WORD_SIZE-1:0] line_byteen_st1; @@ -408,15 +397,14 @@ module VX_bank #( assign mshr_push = valid_st1 && mshr_push_st1; wire mshr_dequeue = valid_st1 && is_mshr_st1 && !mshr_push_st1 && crsq_in_ready; - wire mshr_restore = !IN_ORDER_DRAM && is_mshr_st1; - `RUNTIME_ASSERT(!IN_ORDER_DRAM || !(mshr_push && mshr_restore), ("Oops!")) + wire mshr_restore = is_mshr_st1; // push a missed request as 'ready' if it was a forced miss that actually had a hit // or the fill request for this block is comming wire mshr_init_ready_state = !miss_st1 || incoming_fill_qual_st1; - // use dram rsp or core req address to lookup the mshr - wire [`LINE_ADDR_WIDTH-1:0] lookup_addr = dram_rsp_valid ? dram_rsp_addr_qual : creq_addr; + // use memory rsp or core req address to lookup the mshr + wire [`LINE_ADDR_WIDTH-1:0] lookup_addr = mem_rsp_valid ? 
mem_rsp_addr : creq_addr; VX_miss_resrv #( .BANK_ID (BANK_ID), @@ -450,7 +438,7 @@ module VX_bank #( `UNUSED_PIN (enqueue_full), // lookup - .lookup_ready (drsq_pop), + .lookup_ready (mrsq_pop), .lookup_addr (lookup_addr), .lookup_match (mshr_pending), @@ -500,41 +488,41 @@ module VX_bank #( .ready_out (core_rsp_ready) ); - // Enqueue DRAM request + // Enqueue memory request - wire [CACHE_LINE_SIZE-1:0] dreq_byteen; - wire [`LINE_ADDR_WIDTH-1:0] dreq_addr; - wire [`CACHE_LINE_WIDTH-1:0] dreq_data; - wire dreq_push, dreq_pop, dreq_empty, dreq_rw; + wire [CACHE_LINE_SIZE-1:0] mreq_byteen; + wire [`LINE_ADDR_WIDTH-1:0] mreq_addr; + wire [`CACHE_LINE_WIDTH-1:0] mreq_data; + wire mreq_push, mreq_pop, mreq_empty, mreq_rw; - assign dreq_push = valid_st1 && dreq_push_st1; + assign mreq_push = valid_st1 && mreq_push_st1; - assign dreq_pop = dram_req_valid && dram_req_ready; + assign mreq_pop = mem_req_valid && mem_req_ready; - assign dreq_rw = WRITE_ENABLE && do_writeback_st1; - assign dreq_byteen = dreq_rw ? line_byteen_st1 : {CACHE_LINE_SIZE{1'b1}}; - assign dreq_addr = addr_st1; - assign dreq_data = wdata_st1; + assign mreq_rw = WRITE_ENABLE && do_writeback_st1; + assign mreq_byteen = mreq_rw ? 
line_byteen_st1 : {CACHE_LINE_SIZE{1'b1}}; + assign mreq_addr = addr_st1; + assign mreq_data = wdata_st1; VX_fifo_queue #( .DATAW (1 + CACHE_LINE_SIZE + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH), - .SIZE (DREQ_SIZE), - .ALM_FULL (DREQ_SIZE-2) - ) dram_req_queue ( + .SIZE (MREQ_SIZE), + .ALM_FULL (MREQ_SIZE-2) + ) mem_req_queue ( .clk (clk), .reset (reset), - .push (dreq_push), - .pop (dreq_pop), - .data_in ({dreq_rw, dreq_byteen, dreq_addr, dreq_data}), - .data_out ({dram_req_rw, dram_req_byteen, dram_req_addr, dram_req_data}), - .empty (dreq_empty), - .alm_full (dreq_alm_full), + .push (mreq_push), + .pop (mreq_pop), + .data_in ({mreq_rw, mreq_byteen, mreq_addr, mreq_data}), + .data_out ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_data}), + .empty (mreq_empty), + .alm_full (mreq_alm_full), `UNUSED_PIN (full), `UNUSED_PIN (alm_empty), `UNUSED_PIN (size) ); - assign dram_req_valid = !dreq_empty; + assign mem_req_valid = !mreq_empty; `SCOPE_ASSIGN (valid_st0, valid_st0); `SCOPE_ASSIGN (valid_st1, valid_st1); @@ -544,7 +532,7 @@ module VX_bank #( `SCOPE_ASSIGN (force_miss_st0, force_miss_st0); `SCOPE_ASSIGN (mshr_push, mshr_push); `SCOPE_ASSIGN (crsq_in_stall, crsq_in_stall); - `SCOPE_ASSIGN (dreq_alm_full, dreq_alm_full); + `SCOPE_ASSIGN (mreq_alm_full, mreq_alm_full); `SCOPE_ASSIGN (mshr_alm_full, mshr_alm_full); `SCOPE_ASSIGN (addr_st0, `LINE_TO_BYTE_ADDR(addr_st0, BANK_ID)); `SCOPE_ASSIGN (addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID)); @@ -552,45 +540,45 @@ module VX_bank #( `ifdef PERF_ENABLE assign perf_read_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && !mem_rw_st1; assign perf_write_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && mem_rw_st1; - assign perf_pipe_stalls = crsq_in_stall || dreq_alm_full || mshr_alm_full; + assign perf_pipe_stalls = crsq_in_stall || mreq_alm_full || mshr_alm_full; assign perf_mshr_stalls = mshr_alm_full; `endif `ifdef DBG_PRINT_CACHE_BANK always @(posedge clk) begin - /*if (valid_st1 && 
pmask_st1 == {NUM_PORTS{1'b1}}) begin - $display("%t: cache%0d:%0d full bank multi-porting - addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID)); - end*/ + /*if (crsq_in_fire && (NUM_PORTS > 1) && $countones(crsq_pmask) > 1) begin + $display("%t: *** cache%0d:%0d multi-port-out: pmask=%b, addr=%0h, tag=%0h", $time, CACHE_ID, BANK_ID, crsq_pmask, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag); + end */ if (valid_st1 && !is_fill_st1 && miss_st1 && incoming_fill_qual_st1) begin - $display("%t: cache%0d:%0d miss with incoming fill - addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID)); + $display("%t: *** cache%0d:%0d miss with incoming fill - addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID)); assert(!is_mshr_st1); end - if (crsq_in_stall || dreq_alm_full || mshr_alm_full) begin - $display("%t: cache%0d:%0d pipeline-stall: cwbq=%b, dwbq=%b, mshr=%b", $time, CACHE_ID, BANK_ID, crsq_in_stall, dreq_alm_full, mshr_alm_full); + if (crsq_in_stall || mreq_alm_full || mshr_alm_full) begin + $display("%t: *** cache%0d:%0d pipeline-stall: cwbq=%b, dwbq=%b, mshr=%b", $time, CACHE_ID, BANK_ID, crsq_in_stall, mreq_alm_full, mshr_alm_full); end if (flush_enable) begin $display("%t: cache%0d:%0d flush: addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(flush_addr, BANK_ID)); end - if (drsq_pop) begin - $display("%t: cache%0d:%0d fill-rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_rsp_addr_qual, BANK_ID), dram_rsp_data); + if (mrsq_pop) begin + $display("%t: cache%0d:%0d fill-rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_data); end if (mshr_pop) begin - $display("%t: cache%0d:%0d mshr-rd-req: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, mshr_byteen, debug_wid_sel, debug_pc_sel); + $display("%t: 
cache%0d:%0d mshr-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, mshr_byteen, debug_wid_sel, debug_pc_sel); end if (creq_pop) begin if (creq_rw) - $display("%t: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel); + $display("%t: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel); else - $display("%t: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel); + $display("%t: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel); end if (crsq_in_fire) begin - $display("%t: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1); + $display("%t: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1); end - if (dreq_push) begin + if (mreq_push) begin if (do_writeback_st1) - $display("%t: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h", 
$time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dreq_addr, BANK_ID), dreq_data, dreq_byteen, debug_wid_st1, debug_pc_st1); + $display("%t: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1); else - $display("%t: cache%0d:%0d fill-req: addr=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dreq_addr, BANK_ID), debug_wid_st1, debug_pc_st1); + $display("%t: cache%0d:%0d fill-req: addr=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), debug_wid_st1, debug_pc_st1); end end `endif diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 689876c0..f3ade5e9 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -21,10 +21,10 @@ module VX_cache #( parameter CREQ_SIZE = 4, // Miss Reserv Queue Knob parameter MSHR_SIZE = 8, - // DRAM Response Queue Size - parameter DRSQ_SIZE = 4, - // DRAM Request Queue Size - parameter DREQ_SIZE = 4, + // Memory Response Queue Size + parameter MRSQ_SIZE = 4, + // Memory Request Queue Size + parameter MREQ_SIZE = 4, // Enable cache writeable parameter WRITE_ENABLE = 1, @@ -35,22 +35,17 @@ module VX_cache #( // size of tag id in core request tag parameter CORE_TAG_ID_BITS = CORE_TAG_WIDTH, - // dram request tag size - parameter DRAM_TAG_WIDTH = (32 - $clog2(CACHE_LINE_SIZE)), + // Memory request tag size + parameter MEM_TAG_WIDTH = (32 - $clog2(CACHE_LINE_SIZE)), // bank offset from beginning of index range - parameter BANK_ADDR_OFFSET = 0, - - // in-order DRAN - parameter IN_ORDER_DRAM = 0 + parameter BANK_ADDR_OFFSET = 0 ) ( `SCOPE_IO_VX_cache input wire clk, input wire reset, - input wire flush, - // Core request input wire [NUM_REQS-1:0] core_req_valid, input wire [NUM_REQS-1:0] core_req_rw, @@ -66,29 +61,32 @@ module VX_cache #( output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, input wire 
[`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready, + // Memory request + output wire mem_req_valid, + output wire mem_req_rw, + output wire [CACHE_LINE_SIZE-1:0] mem_req_byteen, + output wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr, + output wire [`CACHE_LINE_WIDTH-1:0] mem_req_data, + output wire [MEM_TAG_WIDTH-1:0] mem_req_tag, + input wire mem_req_ready, + + // Memory response + input wire mem_rsp_valid, + input wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data, + input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag, + output wire mem_rsp_ready, + // PERF `ifdef PERF_ENABLE VX_perf_cache_if perf_cache_if, `endif - // DRAM request - output wire dram_req_valid, - output wire dram_req_rw, - output wire [CACHE_LINE_SIZE-1:0] dram_req_byteen, - output wire [`DRAM_ADDR_WIDTH-1:0] dram_req_addr, - output wire [`CACHE_LINE_WIDTH-1:0] dram_req_data, - output wire [DRAM_TAG_WIDTH-1:0] dram_req_tag, - input wire dram_req_ready, - - // DRAM response - input wire dram_rsp_valid, - input wire [`CACHE_LINE_WIDTH-1:0] dram_rsp_data, - input wire [DRAM_TAG_WIDTH-1:0] dram_rsp_tag, - output wire dram_rsp_ready + // device flush + input wire flush ); `STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid value")) - + wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_req_valid; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] per_bank_core_req_wsel; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen; @@ -106,17 +104,17 @@ module VX_cache #( wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_rsp_tag; wire [NUM_BANKS-1:0] per_bank_core_rsp_ready; - wire [NUM_BANKS-1:0] per_bank_dram_req_valid; - wire [NUM_BANKS-1:0] per_bank_dram_req_rw; - wire [NUM_BANKS-1:0][CACHE_LINE_SIZE-1:0] per_bank_dram_req_byteen; - wire [NUM_BANKS-1:0][`DRAM_ADDR_WIDTH-1:0] per_bank_dram_req_addr; - wire [NUM_BANKS-1:0][`CACHE_LINE_WIDTH-1:0] per_bank_dram_req_data; - wire [NUM_BANKS-1:0] per_bank_dram_req_ready; + wire [NUM_BANKS-1:0] per_bank_mem_req_valid; + wire [NUM_BANKS-1:0] per_bank_mem_req_rw; 
+ wire [NUM_BANKS-1:0][CACHE_LINE_SIZE-1:0] per_bank_mem_req_byteen; + wire [NUM_BANKS-1:0][`MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr; + wire [NUM_BANKS-1:0][`CACHE_LINE_WIDTH-1:0] per_bank_mem_req_data; + wire [NUM_BANKS-1:0] per_bank_mem_req_ready; - wire [NUM_BANKS-1:0] per_bank_dram_rsp_ready; + wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready; - wire [`CACHE_LINE_WIDTH-1:0] dram_rsp_data_qual; - wire [DRAM_TAG_WIDTH-1:0] dram_rsp_tag_qual; + wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_qual; + wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_qual; wire [`LINE_SELECT_BITS-1:0] flush_addr; wire flush_enable; @@ -129,35 +127,35 @@ module VX_cache #( /////////////////////////////////////////////////////////////////////////// - wire drsq_full, drsq_empty; - wire drsq_push, drsq_pop; + wire mrsq_full, mrsq_empty; + wire mrsq_push, mrsq_pop; - assign drsq_push = dram_rsp_valid && dram_rsp_ready; - assign dram_rsp_ready = !drsq_full; + assign mrsq_push = mem_rsp_valid && mem_rsp_ready; + assign mem_rsp_ready = !mrsq_full; VX_fifo_queue #( - .DATAW (DRAM_TAG_WIDTH + `CACHE_LINE_WIDTH), - .SIZE (DRSQ_SIZE), + .DATAW (MEM_TAG_WIDTH + `CACHE_LINE_WIDTH), + .SIZE (MRSQ_SIZE), .BUFFERED (1) - ) dram_rsp_queue ( + ) mem_rsp_queue ( .clk (clk), .reset (reset), - .push (drsq_push), - .pop (drsq_pop), - .data_in ({dram_rsp_tag, dram_rsp_data}), - .data_out ({dram_rsp_tag_qual, dram_rsp_data_qual}), - .empty (drsq_empty), - .full (drsq_full), + .push (mrsq_push), + .pop (mrsq_pop), + .data_in ({mem_rsp_tag, mem_rsp_data}), + .data_out ({mem_rsp_tag_qual, mem_rsp_data_qual}), + .empty (mrsq_empty), + .full (mrsq_full), `UNUSED_PIN (alm_full), `UNUSED_PIN (alm_empty), `UNUSED_PIN (size) ); if (NUM_BANKS == 1) begin - `UNUSED_VAR (dram_rsp_tag_qual) - assign drsq_pop = !drsq_empty && per_bank_dram_rsp_ready; + `UNUSED_VAR (mem_rsp_tag_qual) + assign mrsq_pop = !mrsq_empty && per_bank_mem_rsp_ready; end else begin - assign drsq_pop = !drsq_empty && 
per_bank_dram_rsp_ready[`DRAM_ADDR_BANK(dram_rsp_tag_qual)]; + assign mrsq_pop = !mrsq_empty && per_bank_mem_rsp_ready[`MEM_ADDR_BANK(mem_rsp_tag_qual)]; end /////////////////////////////////////////////////////////////////////////// @@ -176,6 +174,7 @@ module VX_cache #( /////////////////////////////////////////////////////////////////////////// VX_cache_core_req_bank_sel #( + .CACHE_ID (CACHE_ID), .CACHE_LINE_SIZE (CACHE_LINE_SIZE), .NUM_BANKS (NUM_BANKS), .NUM_PORTS (NUM_PORTS), @@ -227,17 +226,17 @@ module VX_cache #( wire [CORE_TAG_WIDTH-1:0] curr_bank_core_rsp_tag; wire curr_bank_core_rsp_ready; - wire curr_bank_dram_req_valid; - wire curr_bank_dram_req_rw; - wire [CACHE_LINE_SIZE-1:0] curr_bank_dram_req_byteen; - wire [`LINE_ADDR_WIDTH-1:0] curr_bank_dram_req_addr; - wire[`CACHE_LINE_WIDTH-1:0] curr_bank_dram_req_data; - wire curr_bank_dram_req_ready; + wire curr_bank_mem_req_valid; + wire curr_bank_mem_req_rw; + wire [CACHE_LINE_SIZE-1:0] curr_bank_mem_req_byteen; + wire [`LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr; + wire[`CACHE_LINE_WIDTH-1:0] curr_bank_mem_req_data; + wire curr_bank_mem_req_ready; - wire curr_bank_dram_rsp_valid; - wire [`LINE_ADDR_WIDTH-1:0] curr_bank_dram_rsp_addr; - wire [`CACHE_LINE_WIDTH-1:0] curr_bank_dram_rsp_data; - wire curr_bank_dram_rsp_ready; + wire curr_bank_mem_rsp_valid; + wire [`LINE_ADDR_WIDTH-1:0] curr_bank_mem_rsp_addr; + wire [`CACHE_LINE_WIDTH-1:0] curr_bank_mem_rsp_data; + wire curr_bank_mem_rsp_ready; // Core Req assign curr_bank_core_req_valid = per_bank_core_req_valid[i]; @@ -258,28 +257,28 @@ module VX_cache #( assign per_bank_core_rsp_tag [i] = curr_bank_core_rsp_tag; assign per_bank_core_rsp_data [i] = curr_bank_core_rsp_data; - // DRAM request - assign per_bank_dram_req_valid[i] = curr_bank_dram_req_valid; - assign per_bank_dram_req_rw[i] = curr_bank_dram_req_rw; - assign per_bank_dram_req_byteen[i] = curr_bank_dram_req_byteen; + // Memory request + assign per_bank_mem_req_valid[i] = curr_bank_mem_req_valid; 
+ assign per_bank_mem_req_rw[i] = curr_bank_mem_req_rw; + assign per_bank_mem_req_byteen[i] = curr_bank_mem_req_byteen; if (NUM_BANKS == 1) begin - assign per_bank_dram_req_addr[i] = curr_bank_dram_req_addr; + assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr; end else begin - assign per_bank_dram_req_addr[i] = `LINE_TO_DRAM_ADDR(curr_bank_dram_req_addr, i); + assign per_bank_mem_req_addr[i] = `LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, i); end - assign per_bank_dram_req_data[i] = curr_bank_dram_req_data; - assign curr_bank_dram_req_ready = per_bank_dram_req_ready[i]; + assign per_bank_mem_req_data[i] = curr_bank_mem_req_data; + assign curr_bank_mem_req_ready = per_bank_mem_req_ready[i]; - // DRAM response + // Memory response if (NUM_BANKS == 1) begin - assign curr_bank_dram_rsp_valid = !drsq_empty; - assign curr_bank_dram_rsp_addr = dram_rsp_tag_qual; + assign curr_bank_mem_rsp_valid = !mrsq_empty; + assign curr_bank_mem_rsp_addr = mem_rsp_tag_qual; end else begin - assign curr_bank_dram_rsp_valid = !drsq_empty && (`DRAM_ADDR_BANK(dram_rsp_tag_qual) == i); - assign curr_bank_dram_rsp_addr = `DRAM_TO_LINE_ADDR(dram_rsp_tag_qual); + assign curr_bank_mem_rsp_valid = !mrsq_empty && (`MEM_ADDR_BANK(mem_rsp_tag_qual) == i); + assign curr_bank_mem_rsp_addr = `MEM_TO_LINE_ADDR(mem_rsp_tag_qual); end - assign curr_bank_dram_rsp_data = dram_rsp_data_qual; - assign per_bank_dram_rsp_ready[i] = curr_bank_dram_rsp_ready; + assign curr_bank_mem_rsp_data = mem_rsp_data_qual; + assign per_bank_mem_rsp_ready[i] = curr_bank_mem_rsp_ready; VX_bank #( .BANK_ID (i), @@ -292,12 +291,11 @@ module VX_cache #( .NUM_REQS (NUM_REQS), .CREQ_SIZE (CREQ_SIZE), .MSHR_SIZE (MSHR_SIZE), - .DREQ_SIZE (DREQ_SIZE), + .MREQ_SIZE (MREQ_SIZE), .WRITE_ENABLE (WRITE_ENABLE), .CORE_TAG_WIDTH (CORE_TAG_WIDTH), .CORE_TAG_ID_BITS (CORE_TAG_ID_BITS), - .BANK_ADDR_OFFSET (BANK_ADDR_OFFSET), - .IN_ORDER_DRAM (IN_ORDER_DRAM) + .BANK_ADDR_OFFSET (BANK_ADDR_OFFSET) ) bank ( `SCOPE_BIND_VX_cache_bank(i) @@ 
-330,19 +328,19 @@ module VX_cache #( .core_rsp_tag (curr_bank_core_rsp_tag), .core_rsp_ready (curr_bank_core_rsp_ready), - // DRAM request - .dram_req_valid (curr_bank_dram_req_valid), - .dram_req_rw (curr_bank_dram_req_rw), - .dram_req_byteen (curr_bank_dram_req_byteen), - .dram_req_addr (curr_bank_dram_req_addr), - .dram_req_data (curr_bank_dram_req_data), - .dram_req_ready (curr_bank_dram_req_ready), + // Memory request + .mem_req_valid (curr_bank_mem_req_valid), + .mem_req_rw (curr_bank_mem_req_rw), + .mem_req_byteen (curr_bank_mem_req_byteen), + .mem_req_addr (curr_bank_mem_req_addr), + .mem_req_data (curr_bank_mem_req_data), + .mem_req_ready (curr_bank_mem_req_ready), - // DRAM response - .dram_rsp_valid (curr_bank_dram_rsp_valid), - .dram_rsp_addr (curr_bank_dram_rsp_addr), - .dram_rsp_data (curr_bank_dram_rsp_data), - .dram_rsp_ready (curr_bank_dram_rsp_ready), + // Memory response + .mem_rsp_valid (curr_bank_mem_rsp_valid), + .mem_rsp_addr (curr_bank_mem_rsp_addr), + .mem_rsp_data (curr_bank_mem_rsp_data), + .mem_rsp_ready (curr_bank_mem_rsp_ready), // flush .flush_enable (flush_enable), @@ -351,6 +349,7 @@ module VX_cache #( end VX_cache_core_rsp_merge #( + .CACHE_ID (CACHE_ID), .NUM_BANKS (NUM_BANKS), .NUM_PORTS (NUM_PORTS), .WORD_SIZE (WORD_SIZE), @@ -372,27 +371,27 @@ module VX_cache #( .core_rsp_ready (core_rsp_ready) ); - wire [NUM_BANKS-1:0][(`DRAM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in; + wire [NUM_BANKS-1:0][(`MEM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in; for (genvar i = 0; i < NUM_BANKS; i++) begin - assign data_in[i] = {per_bank_dram_req_addr[i], per_bank_dram_req_rw[i], per_bank_dram_req_byteen[i], per_bank_dram_req_data[i]}; + assign data_in[i] = {per_bank_mem_req_addr[i], per_bank_mem_req_rw[i], per_bank_mem_req_byteen[i], per_bank_mem_req_data[i]}; end VX_stream_arbiter #( .NUM_REQS (NUM_BANKS), - .DATAW (`DRAM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH), + .DATAW 
(`MEM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH), .BUFFERED (1) - ) dram_req_arb ( + ) mem_req_arb ( .clk (clk), .reset (reset), - .valid_in (per_bank_dram_req_valid), + .valid_in (per_bank_mem_req_valid), .data_in (data_in), - .ready_in (per_bank_dram_req_ready), - .valid_out (dram_req_valid), - .data_out ({dram_req_addr, dram_req_rw, dram_req_byteen, dram_req_data}), - .ready_out (dram_req_ready) + .ready_in (per_bank_mem_req_ready), + .valid_out (mem_req_valid), + .data_out ({mem_req_addr, mem_req_rw, mem_req_byteen, mem_req_data}), + .ready_out (mem_req_ready) ); - assign dram_req_tag = dram_req_addr; + assign mem_req_tag = mem_req_addr; `ifdef PERF_ENABLE // per cycle: core_reads, core_writes @@ -420,13 +419,13 @@ module VX_cache #( assign perf_mshr_stall_per_cycle = $countones(perf_mshr_stall_per_bank); assign perf_pipe_stall_per_cycle = $countones(perf_pipe_stall_per_bank); - reg [43:0] perf_core_reads; - reg [43:0] perf_core_writes; - reg [43:0] perf_read_misses; - reg [43:0] perf_write_misses; - reg [43:0] perf_mshr_stalls; - reg [43:0] perf_pipe_stalls; - reg [43:0] perf_crsp_stalls; + reg [`PERF_CTR_BITS-1:0] perf_core_reads; + reg [`PERF_CTR_BITS-1:0] perf_core_writes; + reg [`PERF_CTR_BITS-1:0] perf_read_misses; + reg [`PERF_CTR_BITS-1:0] perf_write_misses; + reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls; + reg [`PERF_CTR_BITS-1:0] perf_pipe_stalls; + reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; always @(posedge clk) begin if (reset) begin @@ -438,13 +437,13 @@ module VX_cache #( perf_pipe_stalls <= 0; perf_crsp_stalls <= 0; end else begin - perf_core_reads <= perf_core_reads + 44'(perf_core_reads_per_cycle); - perf_core_writes <= perf_core_writes + 44'(perf_core_writes_per_cycle); - perf_read_misses <= perf_read_misses + 44'(perf_read_miss_per_cycle); - perf_write_misses <= perf_write_misses+ 44'(perf_write_miss_per_cycle); - perf_mshr_stalls <= perf_mshr_stalls + 44'(perf_mshr_stall_per_cycle); - perf_pipe_stalls <= perf_pipe_stalls + 
44'(perf_pipe_stall_per_cycle); - perf_crsp_stalls <= perf_crsp_stalls + 44'(perf_crsp_stall_per_cycle); + perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); + perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); + perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle); + perf_write_misses <= perf_write_misses+ `PERF_CTR_BITS'(perf_write_miss_per_cycle); + perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle); + perf_pipe_stalls <= perf_pipe_stalls + `PERF_CTR_BITS'(perf_pipe_stall_per_cycle); + perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); end end diff --git a/hw/rtl/cache/VX_cache_core_req_bank_sel.v b/hw/rtl/cache/VX_cache_core_req_bank_sel.v index dd75bfe7..0631c5a8 100644 --- a/hw/rtl/cache/VX_cache_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_cache_core_req_bank_sel.v @@ -1,6 +1,8 @@ `include "VX_cache_define.vh" module VX_cache_core_req_bank_sel #( + parameter CACHE_ID = 0, + // Size of line inside a bank in bytes parameter CACHE_LINE_SIZE = 64, // Size of a word in bytes @@ -22,7 +24,7 @@ module VX_cache_core_req_bank_sel #( input wire reset, `ifdef PERF_ENABLE - output wire [43:0] bank_stalls, + output wire [`PERF_CTR_BITS-1:0] bank_stalls, `endif input wire [NUM_REQS-1:0] core_req_valid, @@ -43,6 +45,7 @@ module VX_cache_core_req_bank_sel #( output wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag, input wire [`BANK_READY_COUNT-1:0] per_bank_core_req_ready ); + `UNUSED_PARAM (CACHE_ID) `STATIC_ASSERT (NUM_REQS >= NUM_BANKS, ("invalid number of banks")); `UNUSED_VAR (clk) @@ -148,7 +151,7 @@ module VX_cache_core_req_bank_sel #( end end end - + end else begin always @(*) begin @@ -303,13 +306,13 @@ module VX_cache_core_req_bank_sel #( end end - reg [43:0] bank_stalls_r; + reg [`PERF_CTR_BITS-1:0] bank_stalls_r; always @(posedge clk) begin if (reset) begin bank_stalls_r <= 0; end else begin - 
bank_stalls_r <= bank_stalls_r + 44'($countones(core_req_sel_r & ~core_req_ready)); + bank_stalls_r <= bank_stalls_r + `PERF_CTR_BITS'($countones(core_req_sel_r & ~core_req_ready)); end end diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index 3a3bf1e1..1254be81 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -1,6 +1,8 @@ `include "VX_cache_define.vh" module VX_cache_core_rsp_merge #( + parameter CACHE_ID = 0, + // Number of Word requests per cycle parameter NUM_REQS = 1, // Number of banks @@ -31,6 +33,8 @@ module VX_cache_core_rsp_merge #( output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data, input wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready ); + `UNUSED_PARAM (CACHE_ID) + if (NUM_BANKS > 1) begin reg [NUM_REQS-1:0] core_rsp_valid_unqual; @@ -39,6 +43,10 @@ module VX_cache_core_rsp_merge #( if (CORE_TAG_ID_BITS != 0) begin + // The core response bus handles a single tag at a time + // We first need to select the current tag to process, + // then send all bank responses for that tag as a batch + reg [CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual; wire core_rsp_ready_unqual; diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index 3cd802a1..fcb77efd 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -21,8 +21,8 @@ `define WORDS_PER_LINE (CACHE_LINE_SIZE / WORD_SIZE) `define WORD_ADDR_WIDTH (32-`CLOG2(WORD_SIZE)) -`define DRAM_ADDR_WIDTH (32-`CLOG2(CACHE_LINE_SIZE)) -`define LINE_ADDR_WIDTH (`DRAM_ADDR_WIDTH-`BANK_SELECT_BITS) +`define MEM_ADDR_WIDTH (32-`CLOG2(CACHE_LINE_SIZE)) +`define LINE_ADDR_WIDTH (`MEM_ADDR_WIDTH-`BANK_SELECT_BITS) // Word select `define WORD_SELECT_BITS `CLOG2(`WORDS_PER_LINE) @@ -59,11 +59,11 @@ `define BANK_READY_COUNT ((SHARED_BANK_READY != 0) ?
1 : NUM_BANKS) -`define DRAM_ADDR_BANK(x) x[`BANK_SELECT_BITS+BANK_ADDR_OFFSET-1 : BANK_ADDR_OFFSET] +`define MEM_ADDR_BANK(x) x[`BANK_SELECT_BITS+BANK_ADDR_OFFSET-1 : BANK_ADDR_OFFSET] -`define DRAM_TO_LINE_ADDR(x) x[`DRAM_ADDR_WIDTH-1 : `BANK_SELECT_BITS] +`define MEM_TO_LINE_ADDR(x) x[`MEM_ADDR_WIDTH-1 : `BANK_SELECT_BITS] -`define LINE_TO_DRAM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)} +`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)} `define LINE_TO_BYTE_ADDR(x, i) {x, (32-$bits(x))'(i << (32-$bits(x)-`BANK_SELECT_BITS))} diff --git a/hw/rtl/cache/VX_shared_mem.v b/hw/rtl/cache/VX_shared_mem.v index 5241f9d4..44b2f42b 100644 --- a/hw/rtl/cache/VX_shared_mem.v +++ b/hw/rtl/cache/VX_shared_mem.v @@ -4,25 +4,25 @@ module VX_shared_mem #( parameter CACHE_ID = 0, // Size of cache in bytes - parameter CACHE_SIZE = 16384, + parameter CACHE_SIZE = (1024*16), // Number of banks - parameter NUM_BANKS = 4, + parameter NUM_BANKS = 2, // Size of a word in bytes parameter WORD_SIZE = 4, // Number of Word requests per cycle - parameter NUM_REQS = NUM_BANKS, + parameter NUM_REQS = 4, // Core Request Queue Size - parameter CREQ_SIZE = 4, - - // core request tag size - parameter CORE_TAG_WIDTH = 1, + parameter CREQ_SIZE = 8, // size of tag id in core request tag - parameter CORE_TAG_ID_BITS = 0, + parameter CORE_TAG_ID_BITS = 8, + + // core request tag size + parameter CORE_TAG_WIDTH = (2 + CORE_TAG_ID_BITS), // bank offset from beginning of index range - parameter BANK_ADDR_OFFSET = 0 + parameter BANK_ADDR_OFFSET = `CLOG2(256) ) ( input wire clk, input wire reset, @@ -54,13 +54,6 @@ module VX_shared_mem #( localparam CACHE_LINE_SIZE = WORD_SIZE; -`ifdef DBG_CACHE_REQ_INFO - /* verilator lint_off UNUSED */ - wire [31:0] debug_pc_st0; - wire [`NW_BITS-1:0] debug_wid_st0; - /* verilator lint_on UNUSED */ -`endif - wire [NUM_BANKS-1:0] per_bank_core_req_valid_unqual; wire [NUM_BANKS-1:0] per_bank_core_req_rw_unqual; wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] 
per_bank_core_req_addr_unqual; @@ -71,6 +64,7 @@ module VX_shared_mem #( wire per_bank_core_req_ready_unqual; VX_cache_core_req_bank_sel #( + .CACHE_ID (CACHE_ID), .CACHE_LINE_SIZE (WORD_SIZE), .NUM_BANKS (NUM_BANKS), .NUM_PORTS (1), @@ -108,20 +102,26 @@ module VX_shared_mem #( wire [NUM_BANKS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr; wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen; wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data; - wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag; + wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag; wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid; wire creq_push, creq_pop, creq_empty, creq_full; - wire crsq_in_ready; + wire crsq_in_fire_last; + + wire [NUM_BANKS-1:0] per_bank_rsp_valid = per_bank_core_req_valid & ~per_bank_core_req_rw; + + wire core_req_has_read = (| per_bank_rsp_valid); - assign creq_push = (| core_req_valid) && !creq_full; - assign creq_pop = ~creq_empty && crsq_in_ready; + assign creq_push = (| core_req_valid) && ~creq_full; + + assign creq_pop = (~creq_empty && ~core_req_has_read) + || crsq_in_fire_last; assign per_bank_core_req_ready_unqual = ~creq_full; - wire [NUM_REQS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr_qual; + wire [NUM_BANKS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr_qual; `UNUSED_VAR (per_bank_core_req_addr_unqual) - for (genvar i = 0; i < NUM_REQS; i++) begin + for (genvar i = 0; i < NUM_BANKS; i++) begin assign per_bank_core_req_addr_qual[i] = per_bank_core_req_addr_unqual[i][`LINE_SELECT_BITS-1:0]; end @@ -155,9 +155,14 @@ module VX_shared_mem #( `UNUSED_PIN (size) ); - wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data; + wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data; for (genvar i = 0; i < NUM_BANKS; i++) begin + + wire wren = per_bank_core_req_rw[i] + && per_bank_core_req_valid[i] + && creq_pop; + VX_sp_ram #( .DATAW (`WORD_WIDTH), .SIZE (`LINES_PER_BANK), @@ -166,13 +171,41 @@ 
module VX_shared_mem #( ) data ( .clk (clk), .addr (per_bank_core_req_addr[i]), - .wren (per_bank_core_req_valid[i] && per_bank_core_req_rw[i]), + .wren (wren), .byteen (per_bank_core_req_byteen[i]), .rden (1'b1), .din (per_bank_core_req_data[i]), .dout (per_bank_core_rsp_data[i]) ); end + + // The core response bus handles a single tag at a time + // We first need to select the current tag to process, + // then send all bank responses for that tag as a batch + + wire crsq_in_valid, crsq_in_ready; + + reg [NUM_BANKS-1:0] bank_rsp_sel, bank_rsp_sel_r; + + wire [NUM_BANKS-1:0] bank_rsp_sel_n = bank_rsp_sel | bank_rsp_sel_r; + + wire crsq_in_fire = crsq_in_valid && crsq_in_ready; + + assign crsq_in_fire_last = crsq_in_fire && (bank_rsp_sel_n == per_bank_rsp_valid); + + always @(posedge clk) begin + if (reset) begin + bank_rsp_sel <= 0; + end else begin + if (crsq_in_fire) begin + if (bank_rsp_sel_n == per_bank_rsp_valid) begin + bank_rsp_sel <= 0; + end else begin + bank_rsp_sel <= bank_rsp_sel_n; + end + end + end + end reg [NUM_REQS-1:0] core_rsp_valids_in; reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_in; @@ -180,31 +213,30 @@ module VX_shared_mem #( always @(*) begin core_rsp_valids_in = 0; - core_rsp_data_in = 'x; + core_rsp_data_in = 'x; core_rsp_tag_in = 'x; - for (integer i = 0; i < NUM_BANKS; i++) begin - if (per_bank_core_req_valid[i]) begin - core_rsp_valids_in[per_bank_core_req_tid[i]] = 1; - core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i]; + bank_rsp_sel_r = 0; + + for (integer i = NUM_BANKS-1; i >= 0; --i) begin + if (per_bank_rsp_valid[i] && ~bank_rsp_sel[i]) begin core_rsp_tag_in = per_bank_core_req_tag[i]; end end - end - -`ifdef DBG_CACHE_REQ_INFO - if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_pc_st0, debug_wid_st0} = core_rsp_tag_in[`CACHE_REQ_INFO_RNG]; - end else begin - assign {debug_pc_st0, debug_wid_st0} = 0; + + for (integer i = 0; i < NUM_BANKS; i++) begin + if
(per_bank_core_req_valid[i] + && (core_rsp_tag_in[CORE_TAG_ID_BITS-1:0] == per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin + core_rsp_valids_in[per_bank_core_req_tid[i]] = 1; + core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i]; + bank_rsp_sel_r[i] = 1; + end + end end -`endif - + wire [NUM_REQS-1:0] core_rsp_valids_out; wire core_rsp_valid_out; - wire core_rsp_rw = | (per_bank_core_req_valid & per_bank_core_req_rw); - - wire crsq_in_valid = ~creq_empty && ~core_rsp_rw; + assign crsq_in_valid = ~creq_empty && core_req_has_read; VX_skid_buffer #( .DATAW (NUM_BANKS * (1 + `WORD_WIDTH) + CORE_TAG_WIDTH) @@ -221,16 +253,82 @@ module VX_shared_mem #( assign core_rsp_valid = core_rsp_valids_out & {NUM_REQS{core_rsp_valid_out}}; +`ifdef DBG_CACHE_REQ_INFO +`IGNORE_WARNINGS_BEGIN + wire [NUM_BANKS-1:0][31:0] debug_pc_st0, debug_pc_st1; + wire [NUM_BANKS-1:0][`NW_BITS-1:0] debug_wid_st0, debug_wid_st1; +`IGNORE_WARNINGS_END + + for (genvar i = 0; i < NUM_BANKS; ++i) begin + if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin + assign {debug_pc_st0[i], debug_wid_st0[i]} = per_bank_core_req_tag_unqual[i][CORE_TAG_WIDTH-1:CORE_TAG_ID_BITS]; + assign {debug_pc_st1[i], debug_wid_st1[i]} = per_bank_core_req_tag[i][CORE_TAG_WIDTH-1:CORE_TAG_ID_BITS]; + end else begin + assign {debug_pc_st0[i], debug_wid_st0[i]} = 0; + assign {debug_pc_st1[i], debug_wid_st1[i]} = 0; + end + end +`endif + `ifdef DBG_PRINT_CACHE_BANK + + reg is_multi_tag_req; +`IGNORE_WARNINGS_BEGIN + reg [CORE_TAG_WIDTH-1:0] core_req_tag_sel; +`IGNORE_WARNINGS_END + + always @(*) begin + core_req_tag_sel ='x; + for (integer i = NUM_BANKS-1; i >= 0; --i) begin + if (per_bank_core_req_valid[i]) begin + core_req_tag_sel = per_bank_core_req_tag[i]; + end + end + is_multi_tag_req = 0; + for (integer i = 0; i < NUM_BANKS; ++i) begin + if (per_bank_core_req_valid[i] + && (core_req_tag_sel[CORE_TAG_ID_BITS-1:0] != per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin + 
is_multi_tag_req = !creq_empty; + end + end + end + always @(posedge clk) begin if (!crsq_in_ready) begin - $display("%t: cache%0d pipeline-stall", $time, CACHE_ID); + $display("%t: *** cache%0d pipeline-stall", $time, CACHE_ID); + end + if (is_multi_tag_req) begin + $display("%t: *** cache%0d multi-tag request!", $time, CACHE_ID); + end + if (creq_push) begin + for (integer i = 0; i < NUM_BANKS; ++i) begin + if (per_bank_core_req_valid_unqual[i]) begin + if (per_bank_core_req_rw_unqual[i]) begin + $display("%t: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h", + $time, CACHE_ID, i, per_bank_core_req_addr_unqual[i], per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], + debug_wid_st0[i], debug_pc_st0[i]); + end else begin + $display("%t: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h", + $time, CACHE_ID, i, per_bank_core_req_addr_unqual[i], per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], + debug_wid_st0[i], debug_pc_st0[i]); + end + end + end end if (creq_pop) begin - if (core_rsp_rw) - $display("%t: cache%0d core-wr-req: tmask=%0b, addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, per_bank_core_req_valid, per_bank_core_req_addr, per_bank_core_req_tag, per_bank_core_req_byteen, per_bank_core_req_data, debug_wid_st0, debug_pc_st0); - else - $display("%t: cache%0d core-rd-req: tmask=%0b, addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, per_bank_core_req_valid, per_bank_core_req_addr, per_bank_core_req_tag, per_bank_core_req_byteen, per_bank_core_rsp_data, debug_wid_st0, debug_pc_st0); + for (integer i = 0; i < NUM_BANKS; ++i) begin + if (per_bank_core_req_valid[i]) begin + if (per_bank_core_req_rw[i]) begin + $display("%t: cache%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h", + $time, CACHE_ID, i, per_bank_core_req_addr[i], per_bank_core_req_tag[i], 
per_bank_core_req_byteen[i], per_bank_core_req_data[i], + debug_wid_st1[i], debug_pc_st1[i]); + end else begin + $display("%t: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h", + $time, CACHE_ID, i, per_bank_core_req_addr[i], per_bank_core_req_tag[i], per_bank_core_req_byteen[i], + debug_wid_st1[i], debug_pc_st1[i]); + end + end + end end end `endif @@ -249,9 +347,9 @@ module VX_shared_mem #( assign perf_crsp_stall_per_cycle = $countones(core_rsp_valid & ~core_rsp_ready); end - reg [43:0] perf_core_reads; - reg [43:0] perf_core_writes; - reg [43:0] perf_crsp_stalls; + reg [`PERF_CTR_BITS-1:0] perf_core_reads; + reg [`PERF_CTR_BITS-1:0] perf_core_writes; + reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; always @(posedge clk) begin if (reset) begin @@ -259,9 +357,9 @@ module VX_shared_mem #( perf_core_writes <= 0; perf_crsp_stalls <= 0; end else begin - perf_core_reads <= perf_core_reads + 44'(perf_core_reads_per_cycle); - perf_core_writes <= perf_core_writes + 44'(perf_core_writes_per_cycle); - perf_crsp_stalls <= perf_crsp_stalls + 44'(perf_crsp_stall_per_cycle); + perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); + perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); + perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); end end diff --git a/hw/rtl/interfaces/VX_cache_dram_req_if.v b/hw/rtl/interfaces/VX_cache_dram_req_if.v deleted file mode 100644 index 9bebf8ef..00000000 --- a/hw/rtl/interfaces/VX_cache_dram_req_if.v +++ /dev/null @@ -1,22 +0,0 @@ -`ifndef VX_CACHE_DRAM_REQ_IF -`define VX_CACHE_DRAM_REQ_IF - -`include "../cache/VX_cache_define.vh" - -interface VX_cache_dram_req_if #( - parameter DRAM_LINE_WIDTH = 1, - parameter DRAM_ADDR_WIDTH = 1, - parameter DRAM_TAG_WIDTH = 1 -) (); - - wire valid; - wire rw; - wire [(DRAM_LINE_WIDTH/8)-1:0] byteen; - wire [DRAM_ADDR_WIDTH-1:0] addr; - wire [DRAM_LINE_WIDTH-1:0] data; - wire [DRAM_TAG_WIDTH-1:0] tag; 
- wire ready; - -endinterface - -`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_cache_dram_rsp_if.v b/hw/rtl/interfaces/VX_cache_dram_rsp_if.v deleted file mode 100644 index bae0d29d..00000000 --- a/hw/rtl/interfaces/VX_cache_dram_rsp_if.v +++ /dev/null @@ -1,18 +0,0 @@ -`ifndef VX_CACHE_DRAM_RSP_IF -`define VX_CACHE_DRAM_RSP_IF - -`include "../cache/VX_cache_define.vh" - -interface VX_cache_dram_rsp_if #( - parameter DRAM_LINE_WIDTH = 1, - parameter DRAM_TAG_WIDTH = 1 -) (); - - wire valid; - wire [DRAM_LINE_WIDTH-1:0] data; - wire [DRAM_TAG_WIDTH-1:0] tag; - wire ready; - -endinterface - -`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_cache_mem_req_if.v b/hw/rtl/interfaces/VX_cache_mem_req_if.v new file mode 100644 index 00000000..b761b61f --- /dev/null +++ b/hw/rtl/interfaces/VX_cache_mem_req_if.v @@ -0,0 +1,23 @@ +`ifndef VX_CACHE_MEM_REQ_IF +`define VX_CACHE_MEM_REQ_IF + +`include "../cache/VX_cache_config.vh" + +interface VX_cache_mem_req_if #( + parameter MEM_LINE_WIDTH = 1, + parameter MEM_ADDR_WIDTH = 1, + parameter MEM_TAG_WIDTH = 1, + parameter MEM_LINE_SIZE = MEM_LINE_WIDTH / 8 +) (); + + wire valid; + wire rw; + wire [MEM_LINE_SIZE-1:0] byteen; + wire [MEM_ADDR_WIDTH-1:0] addr; + wire [MEM_LINE_WIDTH-1:0] data; + wire [MEM_TAG_WIDTH-1:0] tag; + wire ready; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_cache_mem_rsp_if.v b/hw/rtl/interfaces/VX_cache_mem_rsp_if.v new file mode 100644 index 00000000..eb4abf26 --- /dev/null +++ b/hw/rtl/interfaces/VX_cache_mem_rsp_if.v @@ -0,0 +1,18 @@ +`ifndef VX_CACHE_MEM_RSP_IF +`define VX_CACHE_MEM_RSP_IF + +`include "../cache/VX_cache_config.vh" + +interface VX_cache_mem_rsp_if #( + parameter MEM_LINE_WIDTH = 1, + parameter MEM_TAG_WIDTH = 1 +) (); + + wire valid; + wire [MEM_LINE_WIDTH-1:0] data; + wire [MEM_TAG_WIDTH-1:0] tag; + wire ready; + +endinterface + +`endif \ No newline at end of file diff --git 
a/hw/rtl/interfaces/VX_perf_cache_if.v b/hw/rtl/interfaces/VX_perf_cache_if.v index c52352f9..35004368 100644 --- a/hw/rtl/interfaces/VX_perf_cache_if.v +++ b/hw/rtl/interfaces/VX_perf_cache_if.v @@ -5,14 +5,14 @@ interface VX_perf_cache_if (); - wire [43:0] reads; - wire [43:0] writes; - wire [43:0] read_misses; - wire [43:0] write_misses; - wire [43:0] bank_stalls; - wire [43:0] mshr_stalls; - wire [43:0] pipe_stalls; - wire [43:0] crsp_stalls; + wire [`PERF_CTR_BITS-1:0] reads; + wire [`PERF_CTR_BITS-1:0] writes; + wire [`PERF_CTR_BITS-1:0] read_misses; + wire [`PERF_CTR_BITS-1:0] write_misses; + wire [`PERF_CTR_BITS-1:0] bank_stalls; + wire [`PERF_CTR_BITS-1:0] mshr_stalls; + wire [`PERF_CTR_BITS-1:0] pipe_stalls; + wire [`PERF_CTR_BITS-1:0] crsp_stalls; endinterface diff --git a/hw/rtl/interfaces/VX_perf_memsys_if.v b/hw/rtl/interfaces/VX_perf_memsys_if.v index 0cf6b26d..a2ef4835 100644 --- a/hw/rtl/interfaces/VX_perf_memsys_if.v +++ b/hw/rtl/interfaces/VX_perf_memsys_if.v @@ -5,28 +5,28 @@ interface VX_perf_memsys_if (); - wire [43:0] icache_reads; - wire [43:0] icache_read_misses; - wire [43:0] icache_pipe_stalls; - wire [43:0] icache_crsp_stalls; + wire [`PERF_CTR_BITS-1:0] icache_reads; + wire [`PERF_CTR_BITS-1:0] icache_read_misses; + wire [`PERF_CTR_BITS-1:0] icache_pipe_stalls; + wire [`PERF_CTR_BITS-1:0] icache_crsp_stalls; - wire [43:0] dcache_reads; - wire [43:0] dcache_writes; - wire [43:0] dcache_read_misses; - wire [43:0] dcache_write_misses; - wire [43:0] dcache_bank_stalls; - wire [43:0] dcache_mshr_stalls; - wire [43:0] dcache_pipe_stalls; - wire [43:0] dcache_crsp_stalls; + wire [`PERF_CTR_BITS-1:0] dcache_reads; + wire [`PERF_CTR_BITS-1:0] dcache_writes; + wire [`PERF_CTR_BITS-1:0] dcache_read_misses; + wire [`PERF_CTR_BITS-1:0] dcache_write_misses; + wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls; + wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls; + wire [`PERF_CTR_BITS-1:0] dcache_pipe_stalls; + wire [`PERF_CTR_BITS-1:0] dcache_crsp_stalls; - 
wire [43:0] smem_reads; - wire [43:0] smem_writes; - wire [43:0] smem_bank_stalls; + wire [`PERF_CTR_BITS-1:0] smem_reads; + wire [`PERF_CTR_BITS-1:0] smem_writes; + wire [`PERF_CTR_BITS-1:0] smem_bank_stalls; - wire [43:0] dram_reads; - wire [43:0] dram_writes; - wire [43:0] dram_stalls; - wire [43:0] dram_latency; + wire [`PERF_CTR_BITS-1:0] mem_reads; + wire [`PERF_CTR_BITS-1:0] mem_writes; + wire [`PERF_CTR_BITS-1:0] mem_stalls; + wire [`PERF_CTR_BITS-1:0] mem_latency; endinterface diff --git a/hw/rtl/interfaces/VX_perf_pipeline_if.v b/hw/rtl/interfaces/VX_perf_pipeline_if.v index 25cae8f1..12d76d9c 100644 --- a/hw/rtl/interfaces/VX_perf_pipeline_if.v +++ b/hw/rtl/interfaces/VX_perf_pipeline_if.v @@ -4,14 +4,14 @@ `include "VX_define.vh" interface VX_perf_pipeline_if (); - wire [43:0] ibf_stalls; - wire [43:0] scb_stalls; - wire [43:0] lsu_stalls; - wire [43:0] csr_stalls; - wire [43:0] alu_stalls; - wire [43:0] gpu_stalls; + wire [`PERF_CTR_BITS-1:0] ibf_stalls; + wire [`PERF_CTR_BITS-1:0] scb_stalls; + wire [`PERF_CTR_BITS-1:0] lsu_stalls; + wire [`PERF_CTR_BITS-1:0] csr_stalls; + wire [`PERF_CTR_BITS-1:0] alu_stalls; + wire [`PERF_CTR_BITS-1:0] gpu_stalls; `ifdef EXT_F_ENABLE - wire [43:0] fpu_stalls; + wire [`PERF_CTR_BITS-1:0] fpu_stalls; `endif endinterface diff --git a/hw/rtl/libs/VX_scope.v b/hw/rtl/libs/VX_scope.v index 8cf69211..88045ceb 100644 --- a/hw/rtl/libs/VX_scope.v +++ b/hw/rtl/libs/VX_scope.v @@ -94,13 +94,13 @@ module VX_scope #( delay_val <= $bits(delay_val)'(cmd_data); cmd_start <= 1; `ifdef DBG_PRINT_SCOPE - $display("*** scope:CMD_SET_START: delay_val=%0d", $bits(delay_val)'(cmd_data)); + $display("%t: *** scope: CMD_SET_START: delay_val=%0d", $time, $bits(delay_val)'(cmd_data)); `endif end CMD_SET_STOP: begin waddr_end <= $bits(waddr)'(cmd_data); `ifdef DBG_PRINT_SCOPE - $display("*** scope:CMD_SET_STOP: waddr_end=%0d", $bits(waddr)'(cmd_data)); + $display("%t: *** scope: CMD_SET_STOP: waddr_end=%0d", $time, $bits(waddr)'(cmd_data)); 
`endif end default:; @@ -117,7 +117,7 @@ module VX_scope #( delay_cntr <= 0; start_time <= timestamp; `ifdef DBG_PRINT_SCOPE - $display("*** scope: recording start - start_time=%0d", timestamp); + $display("%t: *** scope: recording start - start_time=%0d", $time, timestamp); `endif end else begin start_wait <= 1; @@ -133,7 +133,7 @@ module VX_scope #( delta <= 0; start_time <= timestamp; `ifdef DBG_PRINT_SCOPE - $display("*** scope: recording start - start_time=%0d", timestamp); + $display("%t: *** scope: recording start - start_time=%0d", $time, timestamp); `endif end end @@ -162,7 +162,7 @@ module VX_scope #( if (stop || (waddr >= waddr_end)) begin `ifdef DBG_PRINT_SCOPE - $display("*** scope: recording stop - waddr=(%0d, %0d)", waddr, waddr_end); + $display("%t: *** scope: recording stop - waddr=(%0d, %0d)", $time, waddr, waddr_end); `endif waddr <= waddr; // keep last address recording <= 0; diff --git a/hw/scripts/gen_config.py b/hw/scripts/gen_config.py index 9f33c01f..6753f86e 100755 --- a/hw/scripts/gen_config.py +++ b/hw/scripts/gen_config.py @@ -2,6 +2,7 @@ # coding=utf-8 from __future__ import print_function +import sys import os import os.path as path import re @@ -10,55 +11,19 @@ from datetime import datetime script_dir = path.dirname(path.realpath(__file__)) -defines = {} -for k, v in os.environ.items(): - if k.upper().startswith('V_'): - defines[k[2:]] = v - -print('Custom params:', ', '.join(['='.join(x) for x in defines.items()])) - parser = argparse.ArgumentParser() -parser.add_argument('--outc', default='none', help='Output C header') -parser.add_argument('--outv', default='none', help='Output Verilog header') +parser.add_argument('-i', "--input", default='none', help='Verilog header') +parser.add_argument('-o', "--output", default='none', help='C header') args = parser.parse_args() -if args.outc == 'none' and args.outv == 'none': - print('Warning: not emitting any files. 
Specify arguments') - -if args.outv != 'none': - with open(args.outv, 'w') as f: - print(''' -// auto-generated by gen_config.py. DO NOT EDIT -// Generated at {date} - -`ifndef VX_USER_CONFIG -`define VX_USER_CONFIG -'''[1:].format(date=datetime.now()), file=f) - - for k, v in defines.items(): - print('`define {} {}'.format(k, v), file=f) - - print('\n`endif', file=f) - -if args.outc != 'none': - with open(args.outc, 'w') as f: - print(''' -// auto-generated by gen_config.py. DO NOT EDIT -// Generated at {date} - -#ifndef VX_USER_CONFIG -#define VX_USER_CONFIG -'''[1:].format(date=datetime.now()), file=f) - - for k, v in defines.items(): - print('#define {} {}'.format(k, v), file=f) - - print('\n#endif', file=f) +if args.input == 'none' or args.output == 'none': + print('Error: invalid arguments') + sys.exit(1) translation_rules = [ # preprocessor directives - (re.compile(r'^\s*`include .*$'), r''), + (re.compile(r'`include\s+.*$'), r''), (re.compile(r'`ifdef'), r'#ifdef'), (re.compile(r'`ifndef'), r'#ifndef'), (re.compile(r'`elif'), r'#elif'), @@ -75,25 +40,24 @@ translation_rules = [ (re.compile(r"\d+'h([\da-fA-F]+)"), r'0x\1') ] -if args.outc != 'none': - with open(args.outc, 'a') as f: - print(''' +with open(args.output, 'w') as f: + print(''' // auto-generated by gen_config.py.
DO NOT EDIT // Generated at {date} // Translated from VX_config.vh: '''[1:].format(date=datetime.now()), file=f) - with open(path.join(script_dir, '../rtl/VX_config.vh'), 'r') as r: - lineno = 0 - for line in r: - for pat, repl in translation_rules: - match = pat.search(line) - if match: - line = re.sub(pat, repl, line) - #print("*** match @" + str(lineno) + ": " + match.group() + " => " + line) - f.write(line) - lineno = lineno + 1 - print(''' + with open(args.input, 'r') as r: + lineno = 0 + for line in r: + for pat, repl in translation_rules: + match = pat.search(line) + if match: + line = re.sub(pat, repl, line) + #print("*** match @" + str(lineno) + ": " + match.group() + " => " + line) + f.write(line) + lineno = lineno + 1 + print(''' '''[1:], file=f) diff --git a/hw/scripts/scope.json b/hw/scripts/scope.json index 7003c846..4e5b0e61 100644 --- a/hw/scripts/scope.json +++ b/hw/scripts/scope.json @@ -97,9 +97,8 @@ "avs_byteenable":64, "avs_burstcount":4, "avs_readdatavalid":1, - "mem_bank_select":1, - "cci_dram_rd_req_ctr":26, - "cci_dram_wr_req_ctr":26, + "cci_mem_rd_req_ctr":26, + "cci_mem_wr_req_ctr":26, "cci_rd_req_ctr":26, "cci_rd_rsp_ctr":3, "cci_wr_req_ctr":26, @@ -110,23 +109,23 @@ "!cci_pending_reads_full":1, "!cci_pending_writes_empty":1, "!cci_pending_writes_full": 1, - "?afu_dram_req_fire": 1, - "afu_dram_req_addr": 26, - "afu_dram_req_tag": 28, - "?afu_dram_rsp_fire": 1, - "afu_dram_rsp_tag": 28 + "?afu_mem_req_fire": 1, + "afu_mem_req_addr": 26, + "afu_mem_req_tag": 28, + "?afu_mem_rsp_fire": 1, + "afu_mem_rsp_tag": 28 }, "afu/vortex": { "!reset": 1, - "?dram_req_fire": 1, - "dram_req_addr": 32, - "dram_req_rw": 1, - "dram_req_byteen":"`VX_DRAM_BYTEEN_WIDTH", - "dram_req_data":"`VX_DRAM_LINE_WIDTH", - "dram_req_tag":"`VX_DRAM_TAG_WIDTH", - "?dram_rsp_fire": 1, - "dram_rsp_data":"`VX_DRAM_LINE_WIDTH", - "dram_rsp_tag":"`VX_DRAM_TAG_WIDTH", + "?mem_req_fire": 1, + "mem_req_addr": 32, + "mem_req_rw": 1, + "mem_req_byteen":"`VX_MEM_BYTEEN_WIDTH", + 
"mem_req_data":"`VX_MEM_LINE_WIDTH", + "mem_req_tag":"`VX_MEM_TAG_WIDTH", + "?mem_rsp_fire": 1, + "mem_rsp_data":"`VX_MEM_LINE_WIDTH", + "mem_rsp_tag":"`VX_MEM_TAG_WIDTH", "busy": 1 }, "afu/vortex/cluster/core/pipeline/fetch/icache_stage": { @@ -207,7 +206,7 @@ "force_miss_st0": 1, "mshr_push": 1, "?crsq_in_stall": 1, - "?dreq_alm_full": 1, + "?mreq_alm_full": 1, "?mshr_alm_full": 1 } } diff --git a/hw/simulate/Makefile b/hw/simulate/Makefile index 82422c27..2b10421d 100644 --- a/hw/simulate/Makefile +++ b/hw/simulate/Makefile @@ -1,7 +1,7 @@ CFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors -CFLAGS += -Wno-aligned-new -Wno-maybe-uninitialized +CFLAGS += -Wno-maybe-uninitialized CFLAGS += -I../.. @@ -13,7 +13,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA -DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM +DBG_PRINT_FLAGS += -DDBG_PRINT_MEM DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_PRINT_FLAGS += -DDBG_PRINT_AVS DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE @@ -22,11 +22,11 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_TEX DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CACHE_REQ_INFO -SINGLECORE += -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 +SINGLECORE = -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -#MULTICORE ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 -#MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 -MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 +#MULTICORE = -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 +#MULTICORE = -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 +MULTICORE = -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 SINGLECORE += $(CONFIGS) MULTICORE += $(CONFIGS) @@ -44,15 +44,16 @@ SRCS = simulator.cpp testbench.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic -VL_FLAGS += -Wno-DECLFILENAME +VL_FLAGS += 
-Wno-DECLFILENAME -Wno-REDEFMACRO VL_FLAGS += --x-initial unique --x-assign unique VL_FLAGS += verilator.vlt VL_FLAGS += --exe $(SRCS) $(RTL_INCLUDE) VL_FLAGS += --cc Vortex.v --top-module $(TOP) -# Use FPNEW PFU core -VL_FLAGS += -DFPU_FPNEW +# FPU backend +FPU_CORE ?= FPU_FPNEW +VL_FLAGS += -D$(FPU_CORE) DBG_FLAGS += -DVCD_OUTPUT diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 2a4d8a1a..25ed1992 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -5,10 +5,23 @@ #define RESET_DELAY 4 -#define ENABLE_DRAM_STALLS -#define DRAM_LATENCY 24 -#define DRAM_RQ_SIZE 16 -#define DRAM_STALLS_MODULO 16 +#define ENABLE_MEM_STALLS + +#ifndef MEM_LATENCY +#define MEM_LATENCY 24 +#endif + +#ifndef MEM_RQ_SIZE +#define MEM_RQ_SIZE 16 +#endif + +#ifndef MEM_STALLS_MODULO +#define MEM_STALLS_MODULO 16 +#endif + +#ifndef VERILATOR_RESET_VALUE +#define VERILATOR_RESET_VALUE 2 +#endif #define VL_WDATA_GETW(lwp, i, n, w) \ VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w) @@ -21,7 +34,7 @@ double sc_time_stamp() { Simulator::Simulator() { // force random values for unitialized signals - Verilated::randReset(2); + Verilated::randReset(VERILATOR_RESET_VALUE); Verilated::randSeed(50); // Turn off assertion before reset @@ -56,19 +69,19 @@ Simulator::~Simulator() { void Simulator::attach_ram(RAM* ram) { ram_ = ram; - dram_rsp_vec_.clear(); + mem_rsp_vec_.clear(); } void Simulator::reset() { print_bufs_.clear(); - dram_rsp_vec_.clear(); + mem_rsp_vec_.clear(); - dram_rsp_active_ = false; + mem_rsp_active_ = false; csr_req_active_ = false; csr_rsp_value_ = nullptr; - vortex_->dram_rsp_valid = 0; - vortex_->dram_req_ready = 0; + vortex_->mem_rsp_valid = 0; + vortex_->mem_req_ready = 0; //vortex_->io_req_ready = 0; //vortex_->io_rsp_valid = 0; vortex_->csr_req_valid = 0; @@ -94,13 +107,13 @@ void Simulator::step() { vortex_->clk = 0; this->eval(); - dram_rsp_ready_ = vortex_->dram_rsp_ready; + mem_rsp_ready_ = vortex_->mem_rsp_ready; csr_req_ready_ = 
vortex_->csr_req_ready; vortex_->clk = 1; this->eval(); - this->eval_dram_bus(); + this->eval_mem_bus(); this->eval_io_bus(); this->eval_csr_bus(); @@ -117,83 +130,83 @@ void Simulator::eval() { ++timestamp; } -void Simulator::eval_dram_bus() { +void Simulator::eval_mem_bus() { if (ram_ == nullptr) { - vortex_->dram_req_ready = 0; + vortex_->mem_req_ready = 0; return; } - // update DRAM responses schedule - for (auto& rsp : dram_rsp_vec_) { + // update memory responses schedule + for (auto& rsp : mem_rsp_vec_) { if (rsp.cycles_left > 0) rsp.cycles_left -= 1; } - // schedule DRAM responses in FIFO order - std::list::iterator dram_rsp_it(dram_rsp_vec_.end()); - if (!dram_rsp_vec_.empty() - && (0 == dram_rsp_vec_.begin()->cycles_left)) { - dram_rsp_it = dram_rsp_vec_.begin(); + // schedule memory responses in FIFO order + std::list::iterator mem_rsp_it(mem_rsp_vec_.end()); + if (!mem_rsp_vec_.empty() + && (0 == mem_rsp_vec_.begin()->cycles_left)) { + mem_rsp_it = mem_rsp_vec_.begin(); } - // send DRAM response - if (dram_rsp_active_ - && vortex_->dram_rsp_valid && dram_rsp_ready_) { - dram_rsp_active_ = false; + // send memory response + if (mem_rsp_active_ + && vortex_->mem_rsp_valid && mem_rsp_ready_) { + mem_rsp_active_ = false; } - if (!dram_rsp_active_) { - if (dram_rsp_it != dram_rsp_vec_.end()) { - vortex_->dram_rsp_valid = 1; - memcpy((uint8_t*)vortex_->dram_rsp_data, dram_rsp_it->block.data(), GLOBAL_BLOCK_SIZE); - vortex_->dram_rsp_tag = dram_rsp_it->tag; - dram_rsp_vec_.erase(dram_rsp_it); - dram_rsp_active_ = true; + if (!mem_rsp_active_) { + if (mem_rsp_it != mem_rsp_vec_.end()) { + vortex_->mem_rsp_valid = 1; + memcpy((uint8_t*)vortex_->mem_rsp_data, mem_rsp_it->block.data(), MEM_BLOCK_SIZE); + vortex_->mem_rsp_tag = mem_rsp_it->tag; + mem_rsp_vec_.erase(mem_rsp_it); + mem_rsp_active_ = true; } else { - vortex_->dram_rsp_valid = 0; + vortex_->mem_rsp_valid = 0; } } - // handle DRAM stalls - bool dram_stalled = false; -#ifdef ENABLE_DRAM_STALLS - if (0 == 
((timestamp/2) % DRAM_STALLS_MODULO)) { - dram_stalled = true; + // handle memory stalls + bool mem_stalled = false; +#ifdef ENABLE_MEM_STALLS + if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { + mem_stalled = true; } else - if (dram_rsp_vec_.size() >= DRAM_RQ_SIZE) { - dram_stalled = true; + if (mem_rsp_vec_.size() >= MEM_RQ_SIZE) { + mem_stalled = true; } #endif - // process DRAM requests - if (!dram_stalled) { - if (vortex_->dram_req_valid) { - if (vortex_->dram_req_rw) { - uint64_t byteen = vortex_->dram_req_byteen; - unsigned base_addr = (vortex_->dram_req_addr * GLOBAL_BLOCK_SIZE); - uint8_t* data = (uint8_t*)(vortex_->dram_req_data); - for (int i = 0; i < GLOBAL_BLOCK_SIZE; i++) { + // process memory requests + if (!mem_stalled) { + if (vortex_->mem_req_valid) { + if (vortex_->mem_req_rw) { + uint64_t byteen = vortex_->mem_req_byteen; + unsigned base_addr = (vortex_->mem_req_addr * MEM_BLOCK_SIZE); + uint8_t* data = (uint8_t*)(vortex_->mem_req_data); + for (int i = 0; i < MEM_BLOCK_SIZE; i++) { if ((byteen >> i) & 0x1) { (*ram_)[base_addr + i] = data[i]; } } } else { - dram_req_t dram_req; - dram_req.tag = vortex_->dram_req_tag; - dram_req.addr = vortex_->dram_req_addr; - ram_->read(vortex_->dram_req_addr * GLOBAL_BLOCK_SIZE, GLOBAL_BLOCK_SIZE, dram_req.block.data()); - dram_req.cycles_left = DRAM_LATENCY; - for (auto& rsp : dram_rsp_vec_) { - if (dram_req.addr == rsp.addr) { - dram_req.cycles_left = rsp.cycles_left; + mem_req_t mem_req; + mem_req.tag = vortex_->mem_req_tag; + mem_req.addr = vortex_->mem_req_addr; + ram_->read(vortex_->mem_req_addr * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE, mem_req.block.data()); + mem_req.cycles_left = MEM_LATENCY; + for (auto& rsp : mem_rsp_vec_) { + if (mem_req.addr == rsp.addr) { + mem_req.cycles_left = rsp.cycles_left; break; } } - dram_rsp_vec_.emplace_back(dram_req); + mem_rsp_vec_.emplace_back(mem_req); } } } - vortex_->dram_req_ready = !dram_stalled; + vortex_->mem_req_ready = !mem_stalled; } void Simulator::eval_io_bus() { 
diff --git a/hw/simulate/simulator.h b/hw/simulate/simulator.h index 626474bd..80e2fa42 100644 --- a/hw/simulate/simulator.h +++ b/hw/simulate/simulator.h @@ -48,23 +48,23 @@ private: typedef struct { int cycles_left; - std::array block; + std::array block; uint32_t addr; uint32_t tag; - } dram_req_t; + } mem_req_t; std::unordered_map print_bufs_; void eval(); - void eval_dram_bus(); + void eval_mem_bus(); void eval_io_bus(); void eval_csr_bus(); - std::list dram_rsp_vec_; - bool dram_rsp_active_; + std::list mem_rsp_vec_; + bool mem_rsp_active_; - bool dram_rsp_ready_; + bool mem_rsp_ready_; bool csr_req_ready_; bool csr_req_active_; uint32_t* csr_rsp_value_; diff --git a/hw/modelsim/Makefile b/hw/syn/modelsim/Makefile similarity index 100% rename from hw/modelsim/Makefile rename to hw/syn/modelsim/Makefile diff --git a/hw/modelsim/cshrc.modelsim b/hw/syn/modelsim/cshrc.modelsim similarity index 100% rename from hw/modelsim/cshrc.modelsim rename to hw/syn/modelsim/cshrc.modelsim diff --git a/hw/modelsim/vortex_dpi.cpp b/hw/syn/modelsim/vortex_dpi.cpp similarity index 100% rename from hw/modelsim/vortex_dpi.cpp rename to hw/syn/modelsim/vortex_dpi.cpp diff --git a/hw/modelsim/vortex_dpi.h b/hw/syn/modelsim/vortex_dpi.h similarity index 100% rename from hw/modelsim/vortex_dpi.h rename to hw/syn/modelsim/vortex_dpi.h diff --git a/hw/modelsim/vortex_tb.v b/hw/syn/modelsim/vortex_tb.v similarity index 100% rename from hw/modelsim/vortex_tb.v rename to hw/syn/modelsim/vortex_tb.v diff --git a/hw/syn/opae/Makefile b/hw/syn/opae/Makefile index 5a871248..d820df9a 100644 --- a/hw/syn/opae/Makefile +++ b/hw/syn/opae/Makefile @@ -1,32 +1,114 @@ -ASE_BUILD_DIR=build_ase -FPGA_BUILD_DIR=build_fpga +DEVICE_FAMILY ?= arria10 +ASE_BUILD_DIR ?= build_ase_$(DEVICE_FAMILY) +FPGA_BUILD_DIR ?= build_fpga_$(DEVICE_FAMILY) RTL_DIR=../../rtl -ifeq (, $(shell which qsub-synth)) +ifeq ($(shell which qsub-synth),) RUN_SYNTH=$(OPAE_PLATFORM_ROOT)/bin/run.sh > build.log 2>&1 & else 
RUN_SYNTH=qsub-synth endif +# control RTL debug print states +DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE +DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE +DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE +DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK +DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR +DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG +DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA +DBG_PRINT_FLAGS += -DDBG_PRINT_MEM +DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE +DBG_PRINT_FLAGS += -DDBG_PRINT_AVS +DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE + +DBG_FLAGS += $(DBG_PRINT_FLAGS) +DBG_FLAGS += -DDBG_CACHE_REQ_INFO + +CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) +CONFIG2 := -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) +CONFIG4 := -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) +CONFIG8 := -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) +CONFIG16 := -DNUM_CLUSTERS=4 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) +CONFIG32 := -DNUM_CLUSTERS=4 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) +CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) + +FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY) +RTL_INCLUDE = -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/afu + +CFLAGS += $(RTL_INCLUDE) + +# Debugging +ifdef DEBUG + CFLAGS += $(DBG_FLAGS) +else + CFLAGS += -DNDEBUG +endif + +# Enable scope analyzer +ifdef SCOPE + CFLAGS += -DSCOPE +endif + +# Enable perf counters +ifdef PERF + CFLAGS += -DPERF_ENABLE +endif + all: ase-1c -gen_sources_a10: - ./gen_sources.sh arria10 > sources.txt +$(ASE_BUILD_DIR)_1c/Makefile: + afu_sim_setup -s setup.cfg $(ASE_BUILD_DIR)_1c -gen_sources_s10: - ./gen_sources.sh stratix10 > sources.txt +$(ASE_BUILD_DIR)_2c/Makefile: + afu_sim_setup -s setup.cfg $(ASE_BUILD_DIR)_2c -ase-1c: gen_sources_a10 setup-ase-1c - make -C $(ASE_BUILD_DIR)_1c - cp 
$(RTL_DIR)/fp_cores/altera/arria10/*.hex $(ASE_BUILD_DIR)_1c/work +$(ASE_BUILD_DIR)_4c/Makefile: + afu_sim_setup -s setup.cfg $(ASE_BUILD_DIR)_4c -ase-2c: gen_sources_a10 setup-ase-2c - make -C $(ASE_BUILD_DIR)_2c - cp $(RTL_DIR)/fp_cores/altera/arria10/*.hex $(ASE_BUILD_DIR)_2c/work +$(FPGA_BUILD_DIR)_1c/build/dcp.qpf: + afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_1c -ase-4c: gen_sources_a10 setup-ase-4c - make -C $(ASE_BUILD_DIR)_4c - cp $(RTL_DIR)/fp_cores/altera/arria10/*.hex $(ASE_BUILD_DIR)_4c/work +$(FPGA_BUILD_DIR)_2c/build/dcp.qpf: + afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_2c + +$(FPGA_BUILD_DIR)_4c/build/dcp.qpf: + afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_4c + +$(FPGA_BUILD_DIR)_8c/build/dcp.qpf: + afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_8c + +$(FPGA_BUILD_DIR)_16c/build/dcp.qpf: + afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_16c + +$(FPGA_BUILD_DIR)_32c/build/dcp.qpf: + afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_32c + +$(FPGA_BUILD_DIR)_64c/build/dcp.qpf: + afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_64c + +gen-sources-1c: + ./gen_sources.sh $(CFLAGS) $(CONFIG1) > sources.txt + +gen-sources-2c: + ./gen_sources.sh $(CFLAGS) $(CONFIG2) > sources.txt + +gen-sources-4c: + ./gen_sources.sh $(CFLAGS) $(CONFIG4) > sources.txt + +gen-sources-8c: + ./gen_sources.sh $(CFLAGS) $(CONFIG8) > sources.txt + +gen-sources-16c: + ./gen_sources.sh $(CFLAGS) $(CONFIG16) > sources.txt + +gen-sources-32c: + ./gen_sources.sh $(CFLAGS) $(CONFIG32) > sources.txt + +gen-sources-64c: + ./gen_sources.sh $(CFLAGS) $(CONFIG64) > sources.txt + +# setup setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile @@ -34,36 +116,6 @@ setup-ase-2c: $(ASE_BUILD_DIR)_2c/Makefile setup-ase-4c: $(ASE_BUILD_DIR)_4c/Makefile -$(ASE_BUILD_DIR)_1c/Makefile: - afu_sim_setup -s sources_1c.txt $(ASE_BUILD_DIR)_1c - -$(ASE_BUILD_DIR)_2c/Makefile: - afu_sim_setup -s sources_2c.txt $(ASE_BUILD_DIR)_2c - -$(ASE_BUILD_DIR)_4c/Makefile: - afu_sim_setup -s sources_4c.txt $(ASE_BUILD_DIR)_4c 
- -fpga-1c: gen_sources_a10 setup-fpga-1c - cd $(FPGA_BUILD_DIR)_1c && $(RUN_SYNTH) - -fpga-2c: gen_sources_a10 setup-fpga-2c - cd $(FPGA_BUILD_DIR)_2c && $(RUN_SYNTH) - -fpga-4c: gen_sources_a10 setup-fpga-4c - cd $(FPGA_BUILD_DIR)_4c && $(RUN_SYNTH) - -fpga-8c: gen_sources_a10 setup-fpga-8c - cd $(FPGA_BUILD_DIR)_8c && $(RUN_SYNTH) - -fpga-16c: gen_sources_a10 setup-fpga-16c - cd $(FPGA_BUILD_DIR)_16c && $(RUN_SYNTH) - -fpga-32c: gen_sources_s10 setup-fpga-32c - cd $(FPGA_BUILD_DIR)_32c && $(RUN_SYNTH) - -fpga-64c: gen_sources_s10 setup-fpga-64c - cd $(FPGA_BUILD_DIR)_64c && $(RUN_SYNTH) - setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf setup-fpga-2c: $(FPGA_BUILD_DIR)_2c/build/dcp.qpf @@ -78,35 +130,42 @@ setup-fpga-32c: $(FPGA_BUILD_DIR)_32c/build/dcp.qpf setup-fpga-64c: $(FPGA_BUILD_DIR)_64c/build/dcp.qpf -$(FPGA_BUILD_DIR)_1c/build/dcp.qpf: - afu_synth_setup -s sources_1c.txt $(FPGA_BUILD_DIR)_1c +# build -$(FPGA_BUILD_DIR)_2c/build/dcp.qpf: - afu_synth_setup -s sources_2c.txt $(FPGA_BUILD_DIR)_2c +ase-1c: gen-sources-1c setup-ase-1c + make -C $(ASE_BUILD_DIR)_1c + cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(ASE_BUILD_DIR)_1c/work -$(FPGA_BUILD_DIR)_4c/build/dcp.qpf: - afu_synth_setup -s sources_4c.txt $(FPGA_BUILD_DIR)_4c +ase-2c: gen-sources-2c setup-ase-2c + make -C $(ASE_BUILD_DIR)_2c + cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(ASE_BUILD_DIR)_2c/work -$(FPGA_BUILD_DIR)_8c/build/dcp.qpf: - afu_synth_setup -s sources_8c.txt $(FPGA_BUILD_DIR)_8c +ase-4c: gen-sources-4c setup-ase-4c + make -C $(ASE_BUILD_DIR)_4c + cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(ASE_BUILD_DIR)_4c/work -$(FPGA_BUILD_DIR)_16c/build/dcp.qpf: - afu_synth_setup -s sources_16c.txt $(FPGA_BUILD_DIR)_16c +fpga-1c: gen-sources-1c setup-fpga-1c + cd $(FPGA_BUILD_DIR)_1c && $(RUN_SYNTH) -$(FPGA_BUILD_DIR)_32c/build/dcp.qpf: - afu_synth_setup -s sources_32c.txt $(FPGA_BUILD_DIR)_32c +fpga-2c: gen-sources-2c setup-fpga-2c + cd $(FPGA_BUILD_DIR)_2c && 
$(RUN_SYNTH) -$(FPGA_BUILD_DIR)_64c/build/dcp.qpf: - afu_synth_setup -s sources_64c.txt $(FPGA_BUILD_DIR)_64c +fpga-4c: gen-sources-4c setup-fpga-4c + cd $(FPGA_BUILD_DIR)_4c && $(RUN_SYNTH) -run-ase-1c: - cd $(ASE_BUILD_DIR)_1c && make sim +fpga-8c: gen-sources-8c setup-fpga-8c + cd $(FPGA_BUILD_DIR)_8c && $(RUN_SYNTH) -run-ase-2c: - cd $(ASE_BUILD_DIR)_2c && make sim +fpga-16c: gen-sources-16c setup-fpga-16c + cd $(FPGA_BUILD_DIR)_16c && $(RUN_SYNTH) -run-ase-4c: - cd $(ASE_BUILD_DIR)_4c && make sim +fpga-32c: gen-sources-32c setup-fpga-32c + cd $(FPGA_BUILD_DIR)_32c && $(RUN_SYNTH) + +fpga-64c: gen-sources-64c setup-fpga-64c + cd $(FPGA_BUILD_DIR)_64c && $(RUN_SYNTH) + +# cleanup clean-ase-1c: rm -rf $(ASE_BUILD_DIR)_1c sources.txt diff --git a/hw/syn/opae/README b/hw/syn/opae/README index ade3474b..1c61ca16 100644 --- a/hw/syn/opae/README +++ b/hw/syn/opae/README @@ -44,6 +44,9 @@ fpgaconf vortex_afu.gbs # If this says Multiple ports. Then use --bus with fpgaconf. #bus info can be found by fpgainfo port fpgaconf --bus 0xaf vortex_afu.gbs +# get portid +fpgainfo port + # Running the Test case cd /driver/tests/basic make run-fpga @@ -60,11 +63,13 @@ qsub-sim make ase # tests -./run_ase.sh build_ase_1c ../../../driver/tests/basic/basic -n16 -./run_ase.sh build_ase_1c ../../../driver/tests/demo/demo -n16 -./run_ase.sh build_ase_1c ../../../driver/tests/dogfood/dogfood -n16 -./run_ase.sh build_ase_1c ../../../benchmarks/opencl/vecadd/vecadd -./run_ase.sh build_ase_1c ../../../benchmarks/opencl/sgemm/sgemm -n4 +./run_ase.sh build_ase_arria10_1c ../../../driver/tests/basic/basic -n1 -t0 +./run_ase.sh build_ase_arria10_1c ../../../driver/tests/basic/basic -n1 -t1 +./run_ase.sh build_ase_arria10_1c ../../../driver/tests/basic/basic -n16 +./run_ase.sh build_ase_arria10_1c ../../../driver/tests/demo/demo -n16 +./run_ase.sh build_ase_arria10_1c ../../../driver/tests/dogfood/dogfood -n16 +./run_ase.sh build_ase_arria10_1c ../../../benchmarks/opencl/vecadd/vecadd 
+./run_ase.sh build_ase_arria10_1c ../../../benchmarks/opencl/sgemm/sgemm -n4 # modify "vsim_run.tcl" to dump VCD trace vcd file trace.vcd @@ -75,17 +80,10 @@ run -all tar -zcvf output_files_1c.tar.gz `find ./build_fpga_1c -type f \( -iname \*.rpt -o -iname \*.txt -o -iname \*summary -o -iname \*.log \)` # compress VCD trace -tar -zcvf vortex.vcd.tar.gz ./build_ase_1c/work/vortex.vcd -tar -zcvf trace.vcd.tar.gz obj_dir/trace.vcd -tar -zcvf trace.fst.tar.gz trace.fst run.log tar -zcvf run.log.tar.gz run.log -tar -cvjf vortex.vcd.tar.bz2 build_ase_1c/work/vortex.vcd -tar -zcvf vortex.vcd.tar.gz build_ase_1c/work/vortex.vcd -tar -zcvf run.log.tar.gz build_ase_1c/work/run.log -tar -zcvf vx_scope.vcd.tar.gz vx_scope.vcd tar -cvjf vx_scope.vcd.tar.bz2 vx_scope.vcd -tar -cvjf trace.fst.tar.bz2 trace.fst run.log tar -cvjf trace.vcd.tar.bz2 trace.vcd run.log +tar -cvjf trace.vcd.tar.bz2 build_ase_arria10_1c/work/run.log build_ase_arria10_1c/work/trace.vcd # decompress VCD trace tar -zxvf vortex.vcd.tar.gz @@ -95,15 +93,4 @@ tar -xvf vortex.vcd.tar.bz2 lsof +D build_ase_1c # quick off synthesis -make -C unittest clean && make -C unittest > unittest/build.log 2>&1 & -make -C pipeline clean && make -C pipeline > pipeline/build.log 2>&1 & -make -C cache clean && make -C cache > cache/build.log 2>&1 & -make -C core clean && make -C core > core/build.log 2>&1 & -make -C vortex clean && make -C vortex > vortex/build.log 2>&1 & -make -C top1 clean && make -C top1 > top1/build.log 2>&1 & -make -C top2 clean && make -C top2 > top2/build.log 2>&1 & -make -C top4 clean && make -C top4 > top4/build.log 2>&1 & -make -C top8 clean && make -C top8 > top8/build.log 2>&1 & -make -C top16 clean && make -C top16 > top16/build.log 2>&1 & -make -C top32 clean && make -C top32 > top32/build.log 2>&1 & -make -C top64 clean && make -C top64 > top64/build.log 2>&1 & \ No newline at end of file +make core \ No newline at end of file diff --git a/hw/syn/opae/gen_sources.sh b/hw/syn/opae/gen_sources.sh 
index 03429f1f..b330efc1 100755 --- a/hw/syn/opae/gen_sources.sh +++ b/hw/syn/opae/gen_sources.sh @@ -1,39 +1,46 @@ #!/bin/bash -rtl_dir="../../rtl" exclude_list="VX_fpu_fpnew.v" -file_list="" +macros=() +includes=() -add_dirs() -{ - for dir in $*; do - echo "+incdir+$dir" - for file in $(find $dir -maxdepth 1 -name '*.v' -o -name '*.sv' -type f); do - exclude=0 - for fe in $exclude_list; do - if [[ $file =~ $fe ]]; then - exclude=1 - fi - done - if [[ $exclude == 0 ]]; then - file_list="$file_list $file" +# parse command arguments +while getopts D:I:h flag +do + case "${flag}" in + D) macros+=( ${OPTARG} );; + I) includes+=( ${OPTARG} );; + h) echo "Usage: [-D macro] [-I include] [-h help]" + exit 0 + ;; + \?) + echo "Invalid option: -$OPTARG" 1>&2 + exit 1 + ;; + esac +done + +# dump macros +for value in ${macros[@]}; do + echo "+define+$value" +done + +# dump include directories +for dir in ${includes[@]}; do + echo "+incdir+$dir" +done + +# dump source files +for dir in ${includes[@]}; do + for file in $(find $dir -maxdepth 1 -name '*.v' -o -name '*.sv' -type f); do + exclude=0 + for fe in $exclude_list; do + if [[ $file =~ $fe ]]; then + exclude=1 fi done + if [[ $exclude == 0 ]]; then + echo $file + fi done -} - -add_files() -{ - for file in $*; do - file_list="$file_list $file" - done -} - -add_dirs $rtl_dir/fp_cores/altera/$1 - -add_dirs $rtl_dir/libs $rtl_dir/interfaces $rtl_dir/fp_cores $rtl_dir/cache $rtl_dir/tex_unit $rtl_dir $rtl_dir/afu - -# dump file list -for file in $file_list; do - echo $file done \ No newline at end of file diff --git a/hw/syn/opae/sources_2c.txt b/hw/syn/opae/setup.cfg similarity index 66% rename from hw/syn/opae/sources_2c.txt rename to hw/syn/opae/setup.cfg index a70589f5..9bb0b72d 100644 --- a/hw/syn/opae/sources_2c.txt +++ b/hw/syn/opae/setup.cfg @@ -1,8 +1,5 @@ -+define+NUM_CORES=2 - +define+SYNTHESIS +define+QUARTUS -#+define+PERF_ENABLE vortex_afu.json QI:vortex_afu.qsf diff --git a/hw/syn/opae/sources_16c.txt 
b/hw/syn/opae/sources_16c.txt deleted file mode 100644 index cbee87e0..00000000 --- a/hw/syn/opae/sources_16c.txt +++ /dev/null @@ -1,12 +0,0 @@ -+define+NUM_CORES=4 -+define+NUM_CLUSTERS=4 -#+define+L3_ENABLE=1 - -+define+SYNTHESIS -+define+QUARTUS -#+define+PERF_ENABLE - -vortex_afu16.json -QI:vortex_afu.qsf - -C:sources.txt \ No newline at end of file diff --git a/hw/syn/opae/sources_1c.txt b/hw/syn/opae/sources_1c.txt deleted file mode 100644 index a429a492..00000000 --- a/hw/syn/opae/sources_1c.txt +++ /dev/null @@ -1,24 +0,0 @@ -+define+NUM_CORES=1 - -+define+SYNTHESIS -+define+QUARTUS -#+define+SCOPE -#+define+PERF_ENABLE - -#+define+DBG_PRINT_CORE_ICACHE -#+define+DBG_PRINT_CORE_DCACHE -#+define+DBG_PRINT_CACHE_BANK -#+define+DBG_PRINT_CACHE_MSHR -#+define+DBG_PRINT_CACHE_TAG -#+define+DBG_PRINT_CACHE_DATA -#+define+DBG_PRINT_DRAM -#+define+DBG_PRINT_PIPELINE -#+define+DBG_PRINT_OPAE -#+define+DBG_PRINT_AVS -#+define+DBG_PRINT_SCOPE -#+define+DBG_CACHE_REQ_INFO - -vortex_afu.json -QI:vortex_afu.qsf - -C:sources.txt \ No newline at end of file diff --git a/hw/syn/opae/sources_32c.txt b/hw/syn/opae/sources_32c.txt deleted file mode 100644 index 1fc88ecd..00000000 --- a/hw/syn/opae/sources_32c.txt +++ /dev/null @@ -1,14 +0,0 @@ -+define+NUM_CORES=8 -+define+NUM_CLUSTERS=4 -#+define+L3_ENABLE=1 - -+define+GLOBAL_BLOCK_SIZE=16 - -+define+SYNTHESIS -+define+QUARTUS -#+define+PERF_ENABLE - -vortex_afu.json -QI:vortex_afu.qsf - -C:sources.txt \ No newline at end of file diff --git a/hw/syn/opae/sources_4c.txt b/hw/syn/opae/sources_4c.txt deleted file mode 100644 index 9ac95cdd..00000000 --- a/hw/syn/opae/sources_4c.txt +++ /dev/null @@ -1,10 +0,0 @@ -+define+NUM_CORES=4 - -+define+SYNTHESIS -+define+QUARTUS -#+define+PERF_ENABLE - -vortex_afu.json -QI:vortex_afu.qsf - -C:sources.txt \ No newline at end of file diff --git a/hw/syn/opae/sources_64c.txt b/hw/syn/opae/sources_64c.txt deleted file mode 100644 index bf267717..00000000 --- a/hw/syn/opae/sources_64c.txt 
+++ /dev/null @@ -1,14 +0,0 @@ -+define+NUM_CORES=8 -+define+NUM_CLUSTERS=8 -#+define+L3_ENABLE=1 - -+define+GLOBAL_BLOCK_SIZE=16 - -+define+SYNTHESIS -+define+QUARTUS -#+define+PERF_ENABLE - -vortex_afu.json -QI:vortex_afu.qsf - -C:sources.txt \ No newline at end of file diff --git a/hw/syn/opae/sources_8c.txt b/hw/syn/opae/sources_8c.txt deleted file mode 100644 index baafe36a..00000000 --- a/hw/syn/opae/sources_8c.txt +++ /dev/null @@ -1,12 +0,0 @@ -+define+NUM_CORES=4 -+define+NUM_CLUSTERS=2 -#+define+L3_ENABLE=1 - -+define+SYNTHESIS -+define+QUARTUS -#+define+PERF_ENABLE - -vortex_afu8.json -QI:vortex_afu.qsf - -C:sources.txt \ No newline at end of file diff --git a/hw/syn/quartus/.gitignore b/hw/syn/quartus/.gitignore index 0c2cba5a..7a0867fe 100644 --- a/hw/syn/quartus/.gitignore +++ b/hw/syn/quartus/.gitignore @@ -1,6 +1,9 @@ /unittest/* !/unittest/Makefile +/smem/* +!/smem/Makefile + /cache/* !/cache/Makefile diff --git a/hw/syn/quartus/Makefile b/hw/syn/quartus/Makefile index 66d95034..9cf2a79b 100644 --- a/hw/syn/quartus/Makefile +++ b/hw/syn/quartus/Makefile @@ -1,37 +1,68 @@ -.PHONY: unittest pipeline cache core vortex top1 top2 top4 top8 top16 top32 top64 +BUILDIR ?= build + +.PHONY: unittest pipeline smem cache core vortex top1 top2 top4 top8 top16 top32 top64 unittest: - $(MAKE) -C unittest clean && $(MAKE) -C unittest > unittest/build.log 2>&1 & + mkdir -p unittest/$(BUILDIR) + cp unittest/Makefile unittest/$(BUILDIR) + $(MAKE) -C unittest/$(BUILDIR) clean && $(MAKE) -C unittest/$(BUILDIR) > unittest/$(BUILDIR)/build.log 2>&1 & pipeline: - $(MAKE) -C pipeline clean && $(MAKE) -C pipeline > pipeline/build.log 2>&1 & + mkdir -p pipeline/$(BUILDIR) + cp pipeline/Makefile pipeline/$(BUILDIR) + $(MAKE) -C pipeline/$(BUILDIR) clean && $(MAKE) -C pipeline/$(BUILDIR) > pipeline/$(BUILDIR)/build.log 2>&1 & + +smem: + mkdir -p smem/$(BUILDIR) + cp smem/Makefile smem/$(BUILDIR) + $(MAKE) -C smem/$(BUILDIR) clean && $(MAKE) -C smem/$(BUILDIR) > 
smem/$(BUILDIR)/build.log 2>&1 & cache: - $(MAKE) -C cache clean && $(MAKE) -C cache > cache/build.log 2>&1 & + mkdir -p cache/$(BUILDIR) + cp cache/Makefile cache/$(BUILDIR) + $(MAKE) -C cache/$(BUILDIR) clean && $(MAKE) -C cache/$(BUILDIR) > cache/$(BUILDIR)/build.log 2>&1 & core: - $(MAKE) -C core clean && $(MAKE) -C core > core/build.log 2>&1 & + mkdir -p core/$(BUILDIR) + cp core/Makefile core/$(BUILDIR) + $(MAKE) -C core/$(BUILDIR) clean && $(MAKE) -C core/$(BUILDIR) > core/$(BUILDIR)/build.log 2>&1 & vortex: - $(MAKE) -C vortex clean && $(MAKE) -C vortex > vortex/build.log 2>&1 & + mkdir -p vortex/$(BUILDIR) + cp vortex/Makefile vortex/$(BUILDIR) + $(MAKE) -C vortex/$(BUILDIR) clean && $(MAKE) -C vortex/$(BUILDIR) > vortex/$(BUILDIR)/build.log 2>&1 & top1: - $(MAKE) -C top1 clean && $(MAKE) -C top1 > top1/build.log 2>&1 & + mkdir -p top1/$(BUILDIR) + cp top1/Makefile top1/$(BUILDIR) + $(MAKE) -C top1/$(BUILDIR) clean && $(MAKE) -C top1/$(BUILDIR) > top1/$(BUILDIR)/build.log 2>&1 & top2: - $(MAKE) -C top2 clean && $(MAKE) -C top2 > top2/build.log 2>&1 & + mkdir -p top2/$(BUILDIR) + cp top2/Makefile top2/$(BUILDIR) + $(MAKE) -C top2/$(BUILDIR) clean && $(MAKE) -C top2/$(BUILDIR) > top2/$(BUILDIR)/build.log 2>&1 & top4: - $(MAKE) -C top4 clean && $(MAKE) -C top4 > top4/build.log 2>&1 & + mkdir -p top4/$(BUILDIR) + cp top4/Makefile top4/$(BUILDIR) + $(MAKE) -C top4/$(BUILDIR) clean && $(MAKE) -C top4/$(BUILDIR) > top4/$(BUILDIR)/build.log 2>&1 & top8: - $(MAKE) -C top8 clean && $(MAKE) -C top8 > top8/build.log 2>&1 & + mkdir -p top8/$(BUILDIR) + cp top8/Makefile top8/$(BUILDIR) + $(MAKE) -C top8/$(BUILDIR) clean && $(MAKE) -C top8/$(BUILDIR) > top8/$(BUILDIR)/build.log 2>&1 & top16: - $(MAKE) -C top16 clean && $(MAKE) -C top16 > top16/build.log 2>&1 & + mkdir -p top16/$(BUILDIR) + cp top16/Makefile top16/$(BUILDIR) + $(MAKE) -C top16/$(BUILDIR) clean && $(MAKE) -C top16/$(BUILDIR) > top16/$(BUILDIR)/build.log 2>&1 & top32: - $(MAKE) -C top32 clean && $(MAKE) -C 
top32 > top32/build.log 2>&1 & + mkdir -p top32/$(BUILDIR) + cp top32/Makefile top32/$(BUILDIR) + $(MAKE) -C top32/$(BUILDIR) clean && $(MAKE) -C top32/$(BUILDIR) > top32/$(BUILDIR)/build.log 2>&1 & top64: - $(MAKE) -C top64 clean && $(MAKE) -C top64 > top64/build.log 2>&1 & \ No newline at end of file + mkdir -p top64/$(BUILDIR) + cp top64/Makefile top64/$(BUILDIR) + $(MAKE) -C top64/$(BUILDIR) clean && $(MAKE) -C top64/$(BUILDIR) > top64/$(BUILDIR)/build.log 2>&1 & \ No newline at end of file diff --git a/hw/syn/quartus/cache/Makefile b/hw/syn/quartus/cache/Makefile index 34ffd29c..d28d9b18 100755 --- a/hw/syn/quartus/cache/Makefile +++ b/hw/syn/quartus/cache/Makefile @@ -1,14 +1,12 @@ -# Part, Family -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG - PROJECT = VX_cache TOP_LEVEL_ENTITY = VX_cache SRC_FILE = VX_cache.v +RTL_DIR = ../../../../rtl + +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG -RTL_DIR=../../../rtl RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache - PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf # Executable Configuration @@ -53,7 +51,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src $(SRC_FILE) -sdc ../project.sdc -inc "$(RTL_INCLUDE)" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src $(SRC_FILE) -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" syn.chg: $(STAMP) syn.chg diff --git a/hw/syn/quartus/core/Makefile b/hw/syn/quartus/core/Makefile index 8c7132fe..b976110c 100644 --- a/hw/syn/quartus/core/Makefile +++ b/hw/syn/quartus/core/Makefile @@ -1,13 +1,17 @@ -# Part, Family -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG - PROJECT = Core TOP_LEVEL_ENTITY = VX_core SRC_FILE = VX_core.v +RTL_DIR = ../../../../rtl -RTL_DIR=../../../rtl -FPU_INCLUDE = 
$(RTL_DIR)/fp_cores;$(RTL_DIR)/fp_cores/altera/arria10;$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 + +#FAMILY = "Stratix 10" +#DEVICE = 1SX280HN2F43E2VG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 + +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) @@ -55,7 +59,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" syn.chg: $(STAMP) syn.chg diff --git a/hw/syn/quartus/pipeline/Makefile b/hw/syn/quartus/pipeline/Makefile index 84b5889f..aee4b18e 100644 --- a/hw/syn/quartus/pipeline/Makefile +++ b/hw/syn/quartus/pipeline/Makefile @@ -1,22 +1,22 @@ -# Part, Family +PROJECT = VX_pipeline +TOP_LEVEL_ENTITY = VX_pipeline +SRC_FILE = VX_pipeline.v +RTL_DIR = ../../../../rtl + FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 -PROJECT = Core -TOP_LEVEL_ENTITY = VX_core -SRC_FILE = VX_core.v +#FAMILY = "Stratix 10" +#DEVICE = 1SX280HN2F43E2VG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -RTL_DIR=../../../rtl -FPU_INCLUDE = 
$(RTL_DIR)/fp_cores;$(RTL_DIR)/fp_cores/altera/arria10;$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf -# Part, Family -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG - # Executable Configuration SYN_ARGS = --parallel --read_settings_files=on FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on @@ -59,7 +59,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" syn.chg: $(STAMP) syn.chg @@ -74,7 +74,7 @@ asm.chg: $(STAMP) asm.chg timing: $(PROJECT_FILES) - quartus_sh -t ../timing-html.tcl -project $(PROJECT) + quartus_sh -t ../../timing-html.tcl -project $(PROJECT) program: $(PROJECT).sof quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof" diff --git a/hw/syn/quartus/smem/Makefile b/hw/syn/quartus/smem/Makefile new file mode 100755 index 00000000..3b0c0872 --- /dev/null +++ b/hw/syn/quartus/smem/Makefile @@ -0,0 +1,72 @@ +PROJECT = VX_shared_mem +TOP_LEVEL_ENTITY = VX_shared_mem +SRC_FILE = VX_shared_mem.v +RTL_DIR = ../../../../rtl + +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG + +RTL_INCLUDE = 
$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache +PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf + +# Executable Configuration +SYN_ARGS = --parallel --read_settings_files=on +FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on +ASM_ARGS = +STA_ARGS = --parallel --do_report_timing + +# Build targets +all: $(PROJECT).sta.rpt + +syn: $(PROJECT).syn.rpt + +fit: $(PROJECT).fit.rpt + +asm: $(PROJECT).asm.rpt + +sta: $(PROJECT).sta.rpt + +smart: smart.log + +# Target implementations +STAMP = echo done > + +$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES) + quartus_syn $(PROJECT) $(SYN_ARGS) + $(STAMP) fit.chg + +$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt + quartus_fit $(PROJECT) $(FIT_ARGS) + $(STAMP) asm.chg + $(STAMP) sta.chg + +$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt + quartus_asm $(PROJECT) $(ASM_ARGS) + +$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt + quartus_sta $(PROJECT) $(STA_ARGS) + +smart.log: $(PROJECT_FILES) + quartus_sh --determine_smart_action $(PROJECT) > smart.log + +# Project initialization +$(PROJECT_FILES): + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src $(SRC_FILE) -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" + +syn.chg: + $(STAMP) syn.chg + +fit.chg: + $(STAMP) fit.chg + +sta.chg: + $(STAMP) sta.chg + +asm.chg: + $(STAMP) asm.chg + +program: $(PROJECT).sof + quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof" + +clean: + rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox diff --git a/hw/syn/quartus/top1/Makefile b/hw/syn/quartus/top1/Makefile index db46de40..3fcd8447 100644 --- a/hw/syn/quartus/top1/Makefile +++ b/hw/syn/quartus/top1/Makefile @@ -1,16 +1,16 @@ -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG -FPU_CORE_PATH=../../../rtl/fp_cores/altera/arria10 - -#FAMILY = "Stratix 10" -#DEVICE = 1SX280HN2F43E2VG 
-#FPU_CORE_PATH=../../../rtl/fp_cores/altera/stratix10 - PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv +RTL_DIR = ../../../../rtl + +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 + +#FAMILY = "Stratix 10" +#DEVICE = 1SX280HN2F43E2VG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -RTL_DIR=../../../rtl FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;../../../rtl/afu;../../../rtl/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) @@ -59,7 +59,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=1" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=1" syn.chg: $(STAMP) syn.chg diff --git a/hw/syn/quartus/top16/Makefile b/hw/syn/quartus/top16/Makefile index 07fc721a..1ac30cba 100644 --- a/hw/syn/quartus/top16/Makefile +++ b/hw/syn/quartus/top16/Makefile @@ -1,16 +1,16 @@ -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG -FPU_CORE_PATH=../../../rtl/fp_cores/altera/arria10 - -#FAMILY = "Stratix 10" -#DEVICE = 1SX280HN2F43E2VG -#FPU_CORE_PATH=../../../rtl/fp_cores/altera/stratix10 - PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv +RTL_DIR = ../../../../rtl + +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 + +#FAMILY = "Stratix 10" +#DEVICE = 
1SX280HN2F43E2VG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -RTL_DIR=../../../rtl FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;../../../rtl/afu;../../../rtl/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) @@ -59,7 +59,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" -set "NUM_CLUSTERS=4" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" -set "NUM_CLUSTERS=4" syn.chg: $(STAMP) syn.chg diff --git a/hw/syn/quartus/top2/Makefile b/hw/syn/quartus/top2/Makefile index 624a06a5..089e94b6 100644 --- a/hw/syn/quartus/top2/Makefile +++ b/hw/syn/quartus/top2/Makefile @@ -1,16 +1,16 @@ -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG -FPU_CORE_PATH=../../../rtl/fp_cores/altera/arria10 - -#FAMILY = "Stratix 10" -#DEVICE = 1SX280HN2F43E2VG -#FPU_CORE_PATH=../../../rtl/fp_cores/altera/stratix10 - PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv +RTL_DIR = ../../../../rtl + +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/arria10 + +#FAMILY = "Stratix 10" +#DEVICE = 1SX280HN2F43E2VG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -RTL_DIR=../../../rtl FPU_INCLUDE = 
$(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;../../../rtl/afu;../../../rtl/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) @@ -59,7 +59,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=2" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=2" syn.chg: $(STAMP) syn.chg diff --git a/hw/syn/quartus/top32/Makefile b/hw/syn/quartus/top32/Makefile index 71860101..73fac3c4 100644 --- a/hw/syn/quartus/top32/Makefile +++ b/hw/syn/quartus/top32/Makefile @@ -1,16 +1,16 @@ -#FAMILY = "Arria 10" -#DEVICE = 10AX115N3F40E2SG -#FPU_CORE_PATH=../../../rtl/fp_cores/altera/arria10 - -FAMILY = "Stratix 10" -DEVICE = 1SX280HN2F43E2VG -FPU_CORE_PATH=../../../rtl/fp_cores/altera/stratix10 - PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv +RTL_DIR = ../../../../rtl + +#FAMILY = "Arria 10" +#DEVICE = 10AX115N3F40E2SG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 + +FAMILY = "Stratix 10" +DEVICE = 1SX280HN2F43E2VG +FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -RTL_DIR=../../../rtl FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = 
$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;../../../rtl/afu;../../../rtl/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) @@ -59,7 +59,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" -set "NUM_CLUSTERS=8" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" -set "NUM_CLUSTERS=8" syn.chg: $(STAMP) syn.chg diff --git a/hw/syn/quartus/top4/Makefile b/hw/syn/quartus/top4/Makefile index 64153ed9..b77edd01 100644 --- a/hw/syn/quartus/top4/Makefile +++ b/hw/syn/quartus/top4/Makefile @@ -1,16 +1,16 @@ -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG -FPU_CORE_PATH=../../../rtl/fp_cores/altera/arria10 - -#FAMILY = "Stratix 10" -#DEVICE = 1SX280HN2F43E2VG -#FPU_CORE_PATH=../../../rtl/fp_cores/altera/stratix10 - PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv +RTL_DIR = ../../../../rtl + +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 + +#FAMILY = "Stratix 10" +#DEVICE = 1SX280HN2F43E2VG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -RTL_DIR=../../../rtl FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;../../../rtl/afu;../../../rtl/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) @@ -59,7 +59,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project 
$(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" syn.chg: $(STAMP) syn.chg diff --git a/hw/syn/quartus/top64/Makefile b/hw/syn/quartus/top64/Makefile index 0318eb45..83057792 100644 --- a/hw/syn/quartus/top64/Makefile +++ b/hw/syn/quartus/top64/Makefile @@ -1,16 +1,16 @@ -#FAMILY = "Arria 10" -#DEVICE = 10AX115N3F40E2SG -#FPU_CORE_PATH=../../../rtl/fp_cores/altera/arria10 - -FAMILY = "Stratix 10" -DEVICE = 1SX280HN2F43E2VG -FPU_CORE_PATH=../../../rtl/fp_cores/altera/stratix10 - PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv +RTL_DIR=../../../../rtl + +#FAMILY = "Arria 10" +#DEVICE = 10AX115N3F40E2SG +#FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/arria10 + +FAMILY = "Stratix 10" +DEVICE = 1SX280HN2F43E2VG +FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/stratix10 -RTL_DIR=../../../rtl FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;../../../rtl/afu;../../../rtl/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) @@ -59,7 +59,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=8" -set "NUM_CLUSTERS=8" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src 
"$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=8" -set "NUM_CLUSTERS=8" syn.chg: $(STAMP) syn.chg diff --git a/hw/syn/quartus/top8/Makefile b/hw/syn/quartus/top8/Makefile index ff9bf0b2..fe4107d4 100644 --- a/hw/syn/quartus/top8/Makefile +++ b/hw/syn/quartus/top8/Makefile @@ -1,16 +1,16 @@ -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG -FPU_CORE_PATH=../../../rtl/fp_cores/altera/arria10 - -#FAMILY = "Stratix 10" -#DEVICE = 1SX280HN2F43E2VG -#FPU_CORE_PATH=../../../rtl/fp_cores/altera/stratix10 - PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv +RTL_DIR = ../../../../rtl + +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 + +#FAMILY = "Stratix 10" +#DEVICE = 1SX280HN2F43E2VG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -RTL_DIR=../../../rtl FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;../../../rtl/afu;../../../rtl/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) @@ -59,7 +59,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" -set "NUM_CLUSTERS=2" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" -set "NUM_CLUSTERS=2" syn.chg: $(STAMP) syn.chg diff --git a/hw/syn/quartus/unittest/Makefile b/hw/syn/quartus/unittest/Makefile index 9644cf52..43d17d0f 100644 --- 
a/hw/syn/quartus/unittest/Makefile +++ b/hw/syn/quartus/unittest/Makefile @@ -1,13 +1,19 @@ PROJECT = Unittest TOP_LEVEL_ENTITY = VX_cache_core_req_bank_sel SRC_FILE = VX_cache_core_req_bank_sel.v -FPU_INCLUDE = ../../../rtl/fp_cores;../../../rtl/fp_cores/altera/arria10;../../../rtl/fp_cores/fpnew/src;../../../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;../../../rtl/fp_cores/fpnew/src/common_cells/include;../../../rtl/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(FPU_INCLUDE);../../../rtl;../../../rtl/libs;../../../rtl/interfaces;../../../rtl/cache -PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf +RTL_DIR = ../../../../rtl -# Part, Family FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 + +#FAMILY = "Stratix 10" +#DEVICE = 1SX280HN2F43E2VG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 + +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache +PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf # Executable Configuration SYN_ARGS = --parallel --read_settings_files=on @@ -51,7 +57,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" syn.chg: $(STAMP) syn.chg diff --git a/hw/syn/quartus/vortex/Makefile b/hw/syn/quartus/vortex/Makefile index 123a528e..6874cce3 100644 --- a/hw/syn/quartus/vortex/Makefile +++ b/hw/syn/quartus/vortex/Makefile @@ -1,16 
+1,16 @@ -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG -FPU_CORE_PATH=../../../rtl/fp_cores/altera/arria10 - -#FAMILY = "Stratix 10" -#DEVICE = 1SX280HN2F43E2VG -#FPU_CORE_PATH=../../../rtl/fp_cores/altera/stratix10 - PROJECT = Vortex TOP_LEVEL_ENTITY = Vortex SRC_FILE = Vortex.sv +RTL_DIR = ../../../../rtl + +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 + +#FAMILY = "Stratix 10" +#DEVICE = 1SX280HN2F43E2VG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -RTL_DIR=../../../rtl FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) @@ -59,7 +59,7 @@ smart.log: $(PROJECT_FILES) # Project initialization $(PROJECT_FILES): - quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" syn.chg: $(STAMP) syn.chg diff --git a/hw/unit_tests/cache/Makefile b/hw/unit_tests/cache/Makefile index 508efc76..d430badc 100644 --- a/hw/unit_tests/cache/Makefile +++ b/hw/unit_tests/cache/Makefile @@ -2,7 +2,21 @@ TOP = VX_cache PARAMS += -DCACHE_SIZE=4096 -DWORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DNUM_BANKS=4 -DCREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4 -INCLUDE = -I../../rtl/ -I../../rtl/libs -I../../rtl/cache +# control RTL debug print states +DBG_PRINT_FLAGS = -DDBG_PRINT_CORE_ICACHE \ + -DDBG_PRINT_CORE_DCACHE \ + -DDBG_PRINT_CACHE_BANK \ + -DDBG_PRINT_CACHE_SNP \ + 
-DDBG_PRINT_CACHE_MSHR \ + -DDBG_PRINT_CACHE_TAG \ + -DDBG_PRINT_CACHE_DATA \ + -DDBG_PRINT_MEM \ + -DDBG_PRINT_OPAE \ + -DDBG_PRINT_AVS + +#DBG_PRINT=$(DBG_PRINT_FLAGS) + +INCLUDE = -I../../rtl/ -I../../rtl/cache -I../../rtl/libs SRCS = cachesim.cpp testbench.cpp diff --git a/hw/unit_tests/cache/cachesim.cpp b/hw/unit_tests/cache/cachesim.cpp index 15bd1e45..736b5cb2 100644 --- a/hw/unit_tests/cache/cachesim.cpp +++ b/hw/unit_tests/cache/cachesim.cpp @@ -18,7 +18,7 @@ CacheSim::CacheSim() { ram_ = nullptr; cache_ = new VVX_cache(); - dram_rsp_active_ = false; + mem_rsp_active_ = false; snp_req_active_ = false; //#ifdef VCD_OUTPUT @@ -39,7 +39,7 @@ CacheSim::~CacheSim() { void CacheSim::attach_ram(RAM* ram) { ram_ = ram; - dram_rsp_vec_.clear(); + mem_rsp_vec_.clear(); } void CacheSim::reset() { @@ -52,7 +52,7 @@ void CacheSim::reset() { cache_->reset = 0; this->step(); - dram_rsp_vec_.clear(); + mem_rsp_vec_.clear(); //clear req and rsp vecs } @@ -66,10 +66,10 @@ void CacheSim::step() { cache_->clk = 1; this->eval(); - //handle core and dram reqs and rsps + //handle core and memory reqs and rsps this->eval_reqs(); this->eval_rsps(); - this->eval_dram_bus(); + this->eval_mem_bus(); timestamp++; } @@ -104,7 +104,7 @@ void CacheSim::run(){ } stalls--; if (stalls == 20){ - //stall_dram(); + //stall_mem(); //send_snoop_req(); stalls--; } @@ -168,8 +168,8 @@ void CacheSim::eval_rsps(){ } } -void CacheSim::stall_dram(){ - cache_->dram_req_ready = 0; +void CacheSim::stall_mem(){ + cache_->mem_req_ready = 0; } void CacheSim::send_snoop_req(){ @@ -179,81 +179,81 @@ void CacheSim::send_snoop_req(){ cache_->snp_req_tag = 0xff; */ } -void CacheSim::eval_dram_bus() { +void CacheSim::eval_mem_bus() { if (ram_ == nullptr) { - cache_->dram_req_ready = 0; + cache_->mem_req_ready = 0; return; } - // schedule DRAM responses + // schedule memory responses int dequeue_index = -1; - for (int i = 0; i < dram_rsp_vec_.size(); i++) { - if (dram_rsp_vec_[i].cycles_left > 0) { - 
dram_rsp_vec_[i].cycles_left -= 1; + for (int i = 0; i < mem_rsp_vec_.size(); i++) { + if (mem_rsp_vec_[i].cycles_left > 0) { + mem_rsp_vec_[i].cycles_left -= 1; } if ((dequeue_index == -1) - && (dram_rsp_vec_[i].cycles_left == 0)) { + && (mem_rsp_vec_[i].cycles_left == 0)) { dequeue_index = i; } } - // send DRAM response - if (dram_rsp_active_ - && cache_->dram_rsp_valid - && cache_->dram_rsp_ready) { - dram_rsp_active_ = false; + // send memory response + if (mem_rsp_active_ + && cache_->mem_rsp_valid + && cache_->mem_rsp_ready) { + mem_rsp_active_ = false; } - if (!dram_rsp_active_) { + if (!mem_rsp_active_) { if (dequeue_index != -1) { //time to respond to the request - cache_->dram_rsp_valid = 1; + cache_->mem_rsp_valid = 1; //copy data from the rsp queue to the cache module - memcpy((uint8_t*)cache_->dram_rsp_data, dram_rsp_vec_[dequeue_index].data, GLOBAL_BLOCK_SIZE); + memcpy((uint8_t*)cache_->mem_rsp_data, mem_rsp_vec_[dequeue_index].data, MEM_BLOCK_SIZE); - cache_->dram_rsp_tag = dram_rsp_vec_[dequeue_index].tag; - free(dram_rsp_vec_[dequeue_index].data); //take data out of the queue - dram_rsp_vec_.erase(dram_rsp_vec_.begin() + dequeue_index); - dram_rsp_active_ = true; + cache_->mem_rsp_tag = mem_rsp_vec_[dequeue_index].tag; + free(mem_rsp_vec_[dequeue_index].data); //take data out of the queue + mem_rsp_vec_.erase(mem_rsp_vec_.begin() + dequeue_index); + mem_rsp_active_ = true; } else { - cache_->dram_rsp_valid = 0; + cache_->mem_rsp_valid = 0; } } - // handle DRAM stalls - bool dram_stalled = false; -#ifdef ENABLE_DRAM_STALLS - if (0 == ((timestamp/2) % DRAM_STALLS_MODULO)) { - dram_stalled = true; + // handle memory stalls + bool mem_stalled = false; +#ifdef ENABLE_MEM_STALLS + if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { + mem_stalled = true; } else - if (dram_rsp_vec_.size() >= DRAM_RQ_SIZE) { - dram_stalled = true; + if (mem_rsp_vec_.size() >= MEM_RQ_SIZE) { + mem_stalled = true; } #endif - // process DRAM requests - if (!dram_stalled) { - if 
(cache_->dram_req_valid) { - if (cache_->dram_req_rw) { //write = 1 - uint64_t byteen = cache_->dram_req_byteen; - unsigned base_addr = (cache_->dram_req_addr * GLOBAL_BLOCK_SIZE); - uint8_t* data = (uint8_t*)(cache_->dram_req_data); - for (int i = 0; i < GLOBAL_BLOCK_SIZE; i++) { + // process memory requests + if (!mem_stalled) { + if (cache_->mem_req_valid) { + if (cache_->mem_req_rw) { //write = 1 + uint64_t byteen = cache_->mem_req_byteen; + unsigned base_addr = (cache_->mem_req_addr * MEM_BLOCK_SIZE); + uint8_t* data = (uint8_t*)(cache_->mem_req_data); + for (int i = 0; i < MEM_BLOCK_SIZE; i++) { if ((byteen >> i) & 0x1) { (*ram_)[base_addr + i] = data[i]; } } } else { - dram_req_t dram_req; - dram_req.cycles_left = DRAM_LATENCY; - dram_req.data = (uint8_t*)malloc(GLOBAL_BLOCK_SIZE); - dram_req.tag = cache_->dram_req_tag; - ram_->read(cache_->dram_req_addr * GLOBAL_BLOCK_SIZE, GLOBAL_BLOCK_SIZE, dram_req.data); - dram_rsp_vec_.push_back(dram_req); + mem_req_t mem_req; + mem_req.cycles_left = MEM_LATENCY; + mem_req.data = (uint8_t*)malloc(MEM_BLOCK_SIZE); + mem_req.tag = cache_->mem_req_tag; + ram_->read(cache_->mem_req_addr * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE, mem_req.data); + mem_rsp_vec_.push_back(mem_req); } } } - cache_->dram_req_ready = ~dram_stalled; + cache_->mem_req_ready = ~mem_stalled; } bool CacheSim::assert_equal(unsigned int* data, unsigned int tag){ @@ -302,19 +302,19 @@ void CacheSim::get_core_rsp(){ std::cout << std::hex << "core_rsp_tag: " << cache_->core_rsp_tag << std::endl; } -void CacheSim::get_dram_req(){ - std::cout << std::hex << "dram_req_valid: " << cache_->dram_req_valid << std::endl; - std::cout << std::hex << "dram_req_rw: " << cache_->dram_req_rw << std::endl; - std::cout << std::hex << "dram_req_byteen: " << cache_->dram_req_byteen << std::endl; - std::cout << std::hex << "dram_req_addr: " << cache_->dram_req_addr << std::endl; - std::cout << std::hex << "dram_req_data: " << cache_->dram_req_data << std::endl; - std::cout << 
std::hex << "dram_req_tag: " << cache_->dram_req_tag << std::endl; +void CacheSim::get_mem_req(){ + std::cout << std::hex << "mem_req_valid: " << cache_->mem_req_valid << std::endl; + std::cout << std::hex << "mem_req_rw: " << cache_->mem_req_rw << std::endl; + std::cout << std::hex << "mem_req_byteen: " << cache_->mem_req_byteen << std::endl; + std::cout << std::hex << "mem_req_addr: " << cache_->mem_req_addr << std::endl; + std::cout << std::hex << "mem_req_data: " << cache_->mem_req_data << std::endl; + std::cout << std::hex << "mem_req_tag: " << cache_->mem_req_tag << std::endl; } -void CacheSim::get_dram_rsp(){ - std::cout << std::hex << "dram_rsp_valid: " << cache_->dram_rsp_valid << std::endl; - std::cout << std::hex << "dram_rsp_data: " << cache_->dram_rsp_data << std::endl; - std::cout << std::hex << "dram_rsp_tag: " << cache_->dram_rsp_tag << std::endl; - std::cout << std::hex << "dram_rsp_ready: " << cache_->dram_rsp_ready << std::endl; +void CacheSim::get_mem_rsp(){ + std::cout << std::hex << "mem_rsp_valid: " << cache_->mem_rsp_valid << std::endl; + std::cout << std::hex << "mem_rsp_data: " << cache_->mem_rsp_data << std::endl; + std::cout << std::hex << "mem_rsp_tag: " << cache_->mem_rsp_tag << std::endl; + std::cout << std::hex << "mem_rsp_ready: " << cache_->mem_rsp_ready << std::endl; } diff --git a/hw/unit_tests/cache/cachesim.h b/hw/unit_tests/cache/cachesim.h index e6324bf6..72cc44f9 100644 --- a/hw/unit_tests/cache/cachesim.h +++ b/hw/unit_tests/cache/cachesim.h @@ -14,17 +14,17 @@ #include #include -#define ENABLE_DRAM_STALLS -#define DRAM_LATENCY 100 -#define DRAM_RQ_SIZE 16 -#define DRAM_STALLS_MODULO 16 -#define GLOBAL_BLOCK_SIZE 16 +#define ENABLE_MEM_STALLS +#define MEM_LATENCY 100 +#define MEM_RQ_SIZE 16 +#define MEM_STALLS_MODULO 16 +#define MEM_BLOCK_SIZE 16 typedef struct { int cycles_left; uint8_t *data; unsigned tag; -} dram_req_t; +} mem_req_t; typedef struct { char valid; @@ -52,7 +52,7 @@ public: //req/rsp void 
send_req(core_req_t *req); void clear_req(); - void stall_dram(); + void stall_mem(); void send_snoop_req(); void send_snp_fwd_in(); @@ -60,12 +60,12 @@ public: bool assert_equal(unsigned int* data, unsigned int tag); //debug funcs - void get_dram_req(); + void get_mem_req(); void get_core_req(unsigned int (&rsp)[4]); void get_core_rsp(); bool get_core_req_ready(); bool get_core_rsp_ready(); - void get_dram_rsp(); + void get_mem_rsp(); void display_miss(); private: @@ -73,12 +73,12 @@ private: void eval(); void eval_reqs(); void eval_rsps(); - void eval_dram_bus(); + void eval_mem_bus(); std::queue core_req_vec_; - std::vector dram_rsp_vec_; + std::vector mem_rsp_vec_; std::map core_rsp_vec_; - int dram_rsp_active_; + int mem_rsp_active_; uint32_t snp_req_active_; uint32_t snp_req_size_; diff --git a/hw/unit_tests/cache/testbench.cpp b/hw/unit_tests/cache/testbench.cpp index c668f26a..4a38bb7c 100644 --- a/hw/unit_tests/cache/testbench.cpp +++ b/hw/unit_tests/cache/testbench.cpp @@ -175,7 +175,7 @@ int FLUSH(CacheSim *sim){ int BACK_PRESSURE(CacheSim *sim){ - //happens whenever the core is stalled or DRAM is stalled + //happens whenever the core is stalled or memory is stalled unsigned int addr[4] = {0x12222222, 0xabbbbbbb, 0xcddddddd, 0xe4444444}; unsigned int data[4] = {0xffffffff, 0x11111111, 0x22222222, 0x33333333}; unsigned int rsp[4] = {0,0,0,0}; diff --git a/simX/Makefile b/simX/Makefile index 76fc6e7e..cc443f6a 100644 --- a/simX/Makefile +++ b/simX/Makefile @@ -1,12 +1,10 @@ #CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors -CXXFLAGS += -Wno-aligned-new -Wno-maybe-uninitialized +CXXFLAGS += -Wno-maybe-uninitialized CXXFLAGS += -I. -I../hw CXXFLAGS += -DDUMP_PERF_STATS -LDFLAGS += - TOP = vx_cache_sim RTL_DIR = ../hw/rtl