diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 3bc7e4e7..8eeaa44e 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -13,7 +13,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE -#DBG_FLAGS += $(DBG_PRINT_FLAGS) +DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO #CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 diff --git a/driver/tests/dogfood/dogfood.cpp b/driver/tests/dogfood/dogfood.cpp index c54fcbf4..1a62959f 100644 --- a/driver/tests/dogfood/dogfood.cpp +++ b/driver/tests/dogfood/dogfood.cpp @@ -77,6 +77,7 @@ const char* kernel_file = "kernel.bin"; int count = 0; int testid_s = 0; int testid_e = (testMngr.size() - 1); +bool stop_on_error = true; vx_device_h device = nullptr; vx_buffer_h arg_buf = nullptr; @@ -86,12 +87,12 @@ vx_buffer_h dst_buf = nullptr; static void show_usage() { std::cout << "Vortex Driver Test." << std::endl; - std::cout << "Usage: [-s:testid] [-e:testid] [-k: kernel] [-n words] [-h: help]" << std::endl; + std::cout << "Usage: [-s:testid] [-e:testid] [-k: kernel] [-n words] [-c] [-h: help]" << std::endl; } static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "n:s:e:k:h?")) != -1) { + while ((c = getopt(argc, argv, "n:s:e:k:ch?")) != -1) { switch (c) { case 'n': count = atoi(optarg); @@ -105,6 +106,9 @@ static void parse_args(int argc, char **argv) { case 'k': kernel_file = optarg; break; + case 'c': + stop_on_error = false; + break; case 'h': case '?': { show_usage(); @@ -136,6 +140,7 @@ void cleanup() { } int main(int argc, char *argv[]) { + int exitcode = 0; size_t value; kernel_arg_t kernel_arg; @@ -146,6 +151,8 @@ int main(int argc, char *argv[]) { count = 1; } + std::cout << std::dec; + std::cout << "test ids: " << testid_s << " - " << testid_e << std::endl; std::cout << "workitem size: " << count << std::endl; std::cout << "using kernel: " << kernel_file << std::endl; @@ -163,9 +170,7 @@ int main(int argc, char *argv[]) { size_t buf_size = num_points * sizeof(uint32_t); std::cout << "number of points: " << num_points << std::endl; - std::cout << "number of points: " << num_points << std::endl; - std::cout << "number of points: " << num_points << std::endl; - std::cout << "buffer size: " << buf_size << " bytes" << std::endl; + std::cout << "buffer size: " << std::hex << buf_size << std::dec << " bytes" << std::endl; // upload program std::cout << "upload kernel" << std::endl; @@ -183,9 +188,9 @@ int main(int argc, char *argv[]) { kernel_arg.count = count; - std::cout << "dev_src0=" << std::hex << kernel_arg.src0_ptr << std::endl; - std::cout << "dev_src1=" << std::hex << kernel_arg.src1_ptr << std::endl; - std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl; + std::cout << "dev_src0=" << std::hex << kernel_arg.src0_ptr << std::dec << std::endl; + std::cout << "dev_src1=" << std::hex << kernel_arg.src1_ptr << std::dec << std::endl; + std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::dec << std::endl; // allocate shared memory std::cout << "allocate shared memory" << std::endl; @@ -250,15 +255,19 @@ int main(int argc, char *argv[]) { if (errors != 0) { std::cout << "found " << errors << " errors!" << std::endl; std::cout << "FAILED!" << std::endl << std::flush; - cleanup(); - exit(1); + if (stop_on_error) { + cleanup(); + exit(1); + } + exitcode = 1; + } else { + std::cout << "PASSED!" << std::endl << std::flush; } - std::cout << "PASSED!" << std::endl << std::flush; } // cleanup std::cout << "cleanup" << std::endl; cleanup(); - return 0; + return exitcode; } \ No newline at end of file diff --git a/driver/tests/dogfood/kernel.c b/driver/tests/dogfood/kernel.c index 2fc49872..28f4f992 100644 --- a/driver/tests/dogfood/kernel.c +++ b/driver/tests/dogfood/kernel.c @@ -131,7 +131,7 @@ void kernel_fmadd(void* arg) { for (uint32_t i = 0; i < count; ++i) { float a = src0_ptr[offset+i]; float b = src1_ptr[offset+i]; - float c = a * 0.5f + b; + float c = a * b + 0.5f; dst_ptr[offset+i] = c; } } @@ -147,7 +147,7 @@ void kernel_fmsub(void* arg) { for (uint32_t i = 0; i < count; ++i) { float a = src0_ptr[offset+i]; float b = src1_ptr[offset+i]; - float c = a * 0.5f - b; + float c = a * b - 0.5f; dst_ptr[offset+i] = c; } } @@ -163,7 +163,7 @@ void kernel_fnmadd(void* arg) { for (uint32_t i = 0; i < count; ++i) { float a = src0_ptr[offset+i]; float b = src1_ptr[offset+i]; - float c = -a * 0.5f - b; + float c = -a * b - 0.5f; dst_ptr[offset+i] = c; } } @@ -179,7 +179,7 @@ void kernel_fnmsub(void* arg) { for (uint32_t i = 0; i < count; ++i) { float a = src0_ptr[offset+i]; float b = src1_ptr[offset+i]; - float c = -a * 0.5f + b; + float c = -a * b + 0.5f; dst_ptr[offset+i] = c; } } @@ -195,8 +195,8 @@ void kernel_fnmadd_madd(void* arg) { for (uint32_t i = 0; i < count; ++i) { float a = src0_ptr[offset+i]; float b = src1_ptr[offset+i]; - float c = -a * 0.25f - b; - float d = a * 0.25f + b; + float c =-a * b - 0.5f; + float d = a * b + 0.5f; float e = c + d; dst_ptr[offset+i] = e; } @@ -247,7 +247,7 @@ void kernel_fsqrt(void* arg) { for (uint32_t i = 0; i < count; ++i) { float a = src0_ptr[offset+i]; float b = src1_ptr[offset+i]; - float c = sqrt(a) + b; + float c = sqrt(a * b); dst_ptr[offset+i] = c; } } @@ -289,38 +289,34 @@ void kernel_ftou(void* arg) { void kernel_itof(void* arg) { struct kernel_arg_t* _arg = (struct kernel_arg_t*)(arg); uint32_t count = _arg->count; - float* src0_ptr = (float*)_arg->src0_ptr; - float* src1_ptr = (float*)_arg->src1_ptr; + int32_t* src0_ptr = (int32_t*)_arg->src0_ptr; + int32_t* src1_ptr = (int32_t*)_arg->src1_ptr; float* dst_ptr = (float*)_arg->dst_ptr; uint32_t offset = vx_thread_gid() * count; for (uint32_t i = 0; i < count; ++i) { - float a = src0_ptr[offset+i]; - float b = src1_ptr[offset+i]; - int32_t c = (int32_t)a; - int32_t d = (int32_t)b; - int32_t e = c + d; - float f = (float)e; - dst_ptr[offset+i] = f; + int32_t a = src0_ptr[offset+i]; + int32_t b = src1_ptr[offset+i]; + int32_t c = a + b; + float d = (float)c; + dst_ptr[offset+i] = d; } } void kernel_utof(void* arg) { struct kernel_arg_t* _arg = (struct kernel_arg_t*)(arg); uint32_t count = _arg->count; - float* src0_ptr = (float*)_arg->src0_ptr; - float* src1_ptr = (float*)_arg->src1_ptr; + int32_t* src0_ptr = (int32_t*)_arg->src0_ptr; + int32_t* src1_ptr = (int32_t*)_arg->src1_ptr; float* dst_ptr = (float*)_arg->dst_ptr; uint32_t offset = vx_thread_gid() * count; for (uint32_t i = 0; i < count; ++i) { - float a = src0_ptr[offset+i]; - float b = src1_ptr[offset+i]; - uint32_t c = (uint32_t)a; - uint32_t d = (uint32_t)b; - uint32_t e = c + d; - float f = (float)e; - dst_ptr[offset+i] = f; + int32_t a = src0_ptr[offset+i]; + int32_t b = src1_ptr[offset+i]; + uint32_t c = a + b; + float d = (float)c; + dst_ptr[offset+i] = d; } } diff --git a/driver/tests/dogfood/testcases.h b/driver/tests/dogfood/testcases.h index e2718a82..4669c857 100644 --- a/driver/tests/dogfood/testcases.h +++ b/driver/tests/dogfood/testcases.h @@ -2,6 +2,31 @@ #include #include +#include + +union Float_t { + float f; + int32_t i; + struct { + uint32_t mantissa : 23; + uint32_t exponent : 8; + uint32_t sign : 1; + } parts; +}; + +inline bool almost_equal_eps(float a, float b, float eps = std::numeric_limits::epsilon()) { + auto tolerance = std::max(std::fabs(a), std::fabs(b)) * eps; + return std::fabs(a - b) <= tolerance; +} + +inline bool almost_equal_ulp(float a, float b, int32_t ulp = 4) { + Float_t fa{a}, fb{b}; + return std::abs(fa.i - fb.i) <= ulp; +} + +inline bool almost_equal(float a, float b) { + return almost_equal_ulp(a, b); +} class ITestCase { public: @@ -19,8 +44,8 @@ public: auto a = (int32_t*)src1; auto b = (int32_t*)src2; for (int i = 0; i < n; ++i) { - a[i] = n/2 + i; - b[i] = n/2 - i; + a[i] = n/2 - i; + b[i] = n/2 + i; } } @@ -32,7 +57,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] + b[i]; if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -47,8 +72,8 @@ public: auto a = (int32_t*)src1; auto b = (int32_t*)src2; for (int i = 0; i < n; ++i) { - a[i] = n/2 + i; - b[i] = n/2 - i; + a[i] = n/2 - i; + b[i] = n/2 + i; } } @@ -60,7 +85,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] * b[i]; if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -88,7 +113,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] / b[i]; if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -118,7 +143,7 @@ public: auto y = a[i] * b[i]; auto ref = x + y; if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -133,8 +158,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n + i) * 0.125f; - b[i] = (n - i) * 0.125f; + a[i] = (n - i) * (1.0f/n); + b[i] = (n + i) * (1.0f/n); } } @@ -145,8 +170,8 @@ public: auto c = (float*)dst; for (int i = 0; i < n; ++i) { auto ref = a[i] + b[i]; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -161,8 +186,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n + i) * 0.125f; - b[i] = (n - i) * 0.125f; + a[i] = (n - i) * (1.0f/n); + b[i] = (n + i) * (1.0f/n); } } @@ -173,8 +198,8 @@ public: auto c = (float*)dst; for (int i = 0; i < n; ++i) { auto ref = a[i] - b[i]; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -189,8 +214,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n + i) * 0.125f; - b[i] = (n - i) * 0.125f; + a[i] = (n - i) * (1.0f/n); + b[i] = (n + i) * (1.0f/n); } } @@ -201,8 +226,8 @@ public: auto c = (float*)dst; for (int i = 0; i < n; ++i) { auto ref = a[i] * b[i]; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -217,8 +242,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n + i) * 0.125f; - b[i] = (n - i) * 0.125f; + a[i] = (n - i) * (1.0f/n); + b[i] = (n + i) * (1.0f/n); } } @@ -228,9 +253,9 @@ public: auto b = (float*)src2; auto c = (float*)dst; for (int i = 0; i < n; ++i) { - auto ref = a[i] * 0.5f + b[i]; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + auto ref = a[i] * b[i] + 0.5f; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -245,8 +270,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n + i) * 0.125f; - b[i] = (n - i) * 0.125f; + a[i] = (n - i) * (1.0f/n); + b[i] = (n + i) * (1.0f/n); } } @@ -256,9 +281,9 @@ public: auto b = (float*)src2; auto c = (float*)dst; for (int i = 0; i < n; ++i) { - auto ref = a[i] * 0.5f - b[i]; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + auto ref = a[i] * b[i] - 0.5f; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -273,8 +298,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n + i) * 0.125f; - b[i] = (n - i) * 0.125f; + a[i] = (n - i) * (1.0f/n); + b[i] = (n + i) * (1.0f/n); } } @@ -284,9 +309,9 @@ public: auto b = (float*)src2; auto c = (float*)dst; for (int i = 0; i < n; ++i) { - auto ref = -a[i] * 0.5f - b[i]; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + auto ref = -a[i] * b[i] - 0.5f; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -301,8 +326,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n + i) * 0.125f; - b[i] = (n - i) * 0.125f; + a[i] = (n - i) * (1.0f/n); + b[i] = (n + i) * (1.0f/n); } } @@ -312,9 +337,9 @@ public: auto b = (float*)src2; auto c = (float*)dst; for (int i = 0; i < n; ++i) { - auto ref = -a[i] * 0.5f + b[i]; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + auto ref = -a[i] * b[i] + 0.5f; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -329,8 +354,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n + i) * 0.125f; - b[i] = (n - i) * 0.125f; + a[i] = (n - i) * (1.0f/n); + b[i] = (n + i) * (1.0f/n); } } @@ -340,11 +365,11 @@ public: auto b = (float*)src2; auto c = (float*)dst; for (int i = 0; i < n; ++i) { - auto x = -a[i] * 0.5f - b[i]; - auto y = a[i] * 0.5f + b[i]; + auto x = -a[i] * b[i] - 0.5f; + auto y = a[i] * b[i] + 0.5f; auto ref = x + y; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -359,8 +384,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n - i) * 0.125f; - b[i] = (n + i) * 0.125f; + a[i] = (n - i) * (1.0f/n); + b[i] = (n + i) * (1.0f/n); } } @@ -371,8 +396,8 @@ public: auto c = (float*)dst; for (int i = 0; i < n; ++i) { auto ref = a[i] / b[i]; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -387,8 +412,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n - i) * 0.125f; - b[i] = (n + i) * 0.125f; + a[i] = (n - i) * (1.0f/n); + b[i] = (n + i) * (1.0f/n); } } @@ -401,8 +426,8 @@ public: auto x = a[i] / b[i]; auto y = b[i] / a[i]; auto ref = x + y; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -417,8 +442,9 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n + i) * 0.125f; - b[i] = (n - i) * 0.125f; + int q = 1.0f + (i % 64); + a[i] = q; + b[i] = q; } } @@ -428,9 +454,9 @@ public: auto b = (float*)src2; auto c = (float*)dst; for (int i = 0; i < n; ++i) { - auto ref = sqrt(a[i]) + b[i]; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + auto ref = sqrt(a[i] * b[i]); + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -445,8 +471,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n + i) * 0.5f; - b[i] = (n - i) * 0.5f; + a[i] = (n/2 - i) * (1.0f/n); + b[i] = (n/2 - i) * (1.0f/n); } } @@ -454,12 +480,12 @@ public: int errors = 0; auto a = (float*)src1; auto b = (float*)src2; - auto c = (float*)dst; + auto c = (int32_t*)dst; for (int i = 0; i < n; ++i) { auto x = a[i] + b[i]; auto ref = (int32_t)x; if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -474,8 +500,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (int i = 0; i < n; ++i) { - a[i] = (n + i) * 0.5f; - b[i] = (n - i) * 0.5f; + a[i] = i * (1.0f/n); + b[i] = i * (1.0f/n); } } @@ -483,12 +509,12 @@ public: int errors = 0; auto a = (float*)src1; auto b = (float*)src2; - auto c = (float*)dst; + auto c = (uint32_t*)dst; for (int i = 0; i < n; ++i) { auto x = a[i] + b[i]; auto ref = (uint32_t)x; if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -503,7 +529,7 @@ public: auto a = (int32_t*)src1; auto b = (int32_t*)src2; for (int i = 0; i < n; ++i) { - a[i] = n/2 + i; + a[i] = n/2 - i; b[i] = n/2 - i; } } @@ -516,8 +542,8 @@ public: for (int i = 0; i < n; ++i) { auto x = a[i] + b[i]; auto ref = (float)x; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -532,8 +558,8 @@ public: auto a = (uint32_t*)src1; auto b = (uint32_t*)src2; for (int i = 0; i < n; ++i) { - a[i] = n/2 + i; - b[i] = n/2 - i; + a[i] = i; + b[i] = i; } } @@ -545,8 +571,8 @@ public: for (int i = 0; i < n; ++i) { auto x = a[i] + b[i]; auto ref = (float)x; - if (c[i] != ref) { - std::cout << "error at value " << i << ": actual 0x" << c[i] << ", expected 0x" << ref << std::endl; + if (!almost_equal(c[i], ref)) { + std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index 278ee190..a059e8e7 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -988,32 +988,32 @@ Vortex #() vortex ( localparam SCOPE_DATAW = $bits({`SCOPE_SIGNALS_DATA_LIST `SCOPE_SIGNALS_UPD_LIST}); localparam SCOPE_SR_DEPTH = 2; -`SCOPE_ASSIGN(scope_dram_req_valid, vx_dram_req_valid); -`SCOPE_ASSIGN(scope_dram_req_addr, {vx_dram_req_addr, 4'b0}); -`SCOPE_ASSIGN(scope_dram_req_rw, vx_dram_req_rw); -`SCOPE_ASSIGN(scope_dram_req_byteen,vx_dram_req_byteen); -`SCOPE_ASSIGN(scope_dram_req_data, vx_dram_req_data); -`SCOPE_ASSIGN(scope_dram_req_tag, vx_dram_req_tag); -`SCOPE_ASSIGN(scope_dram_req_ready, vx_dram_req_ready); +`SCOPE_ASSIGN (scope_dram_req_valid, vx_dram_req_valid); +`SCOPE_ASSIGN (scope_dram_req_addr, {vx_dram_req_addr, 4'b0}); +`SCOPE_ASSIGN (scope_dram_req_rw, vx_dram_req_rw); +`SCOPE_ASSIGN (scope_dram_req_byteen,vx_dram_req_byteen); +`SCOPE_ASSIGN (scope_dram_req_data, vx_dram_req_data); +`SCOPE_ASSIGN (scope_dram_req_tag, vx_dram_req_tag); +`SCOPE_ASSIGN (scope_dram_req_ready, vx_dram_req_ready); -`SCOPE_ASSIGN(scope_dram_rsp_valid, vx_dram_rsp_valid); -`SCOPE_ASSIGN(scope_dram_rsp_data, vx_dram_rsp_data); -`SCOPE_ASSIGN(scope_dram_rsp_tag, vx_dram_rsp_tag); -`SCOPE_ASSIGN(scope_dram_rsp_ready, vx_dram_rsp_ready); +`SCOPE_ASSIGN (scope_dram_rsp_valid, vx_dram_rsp_valid); +`SCOPE_ASSIGN (scope_dram_rsp_data, vx_dram_rsp_data); +`SCOPE_ASSIGN (scope_dram_rsp_tag, vx_dram_rsp_tag); +`SCOPE_ASSIGN (scope_dram_rsp_ready, vx_dram_rsp_ready); -`SCOPE_ASSIGN(scope_snp_req_valid, vx_snp_req_valid); -`SCOPE_ASSIGN(scope_snp_req_addr, {vx_snp_req_addr, 4'b0}); -`SCOPE_ASSIGN(scope_snp_req_invalidate, vx_snp_req_invalidate); -`SCOPE_ASSIGN(scope_snp_req_tag, vx_snp_req_tag); -`SCOPE_ASSIGN(scope_snp_req_ready, vx_snp_req_ready); +`SCOPE_ASSIGN (scope_snp_req_valid, vx_snp_req_valid); +`SCOPE_ASSIGN (scope_snp_req_addr, {vx_snp_req_addr, 4'b0}); +`SCOPE_ASSIGN (scope_snp_req_invalidate, vx_snp_req_invalidate); +`SCOPE_ASSIGN (scope_snp_req_tag, vx_snp_req_tag); +`SCOPE_ASSIGN (scope_snp_req_ready, vx_snp_req_ready); -`SCOPE_ASSIGN(scope_snp_rsp_valid, vx_snp_rsp_valid); -`SCOPE_ASSIGN(scope_snp_rsp_tag, vx_snp_rsp_tag); -`SCOPE_ASSIGN(scope_snp_rsp_ready, vx_snp_rsp_ready); +`SCOPE_ASSIGN (scope_snp_rsp_valid, vx_snp_rsp_valid); +`SCOPE_ASSIGN (scope_snp_rsp_tag, vx_snp_rsp_tag); +`SCOPE_ASSIGN (scope_snp_rsp_ready, vx_snp_rsp_ready); -`SCOPE_ASSIGN(scope_snp_rsp_valid, vx_snp_rsp_valid); -`SCOPE_ASSIGN(scope_snp_rsp_tag, vx_snp_rsp_tag); -`SCOPE_ASSIGN(scope_snp_rsp_ready, vx_snp_rsp_ready); +`SCOPE_ASSIGN (scope_snp_rsp_valid, vx_snp_rsp_valid); +`SCOPE_ASSIGN (scope_snp_rsp_tag, vx_snp_rsp_tag); +`SCOPE_ASSIGN (scope_snp_rsp_ready, vx_snp_rsp_ready); wire scope_changed = (scope_icache_req_valid && scope_icache_req_ready) || (scope_icache_rsp_valid && scope_icache_rsp_ready) diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index ecb1038a..5c71df81 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -15,7 +15,7 @@ module VX_decode #( VX_wstall_if wstall_if, VX_join_if join_if ); - wire in_valid = ifetch_rsp_if.valid; + wire valid_in = ifetch_rsp_if.valid; wire [31:0] instr = ifetch_rsp_if.instr; reg [`ALU_BITS-1:0] alu_op; @@ -352,10 +352,10 @@ module VX_decode #( assign decode_tmp_if.frm = func3; - assign join_if.is_join = in_valid && is_gpu && (gpu_op == `GPU_JOIN); + assign join_if.is_join = valid_in && is_gpu && (gpu_op == `GPU_JOIN); assign join_if.warp_num = ifetch_rsp_if.warp_num; - assign wstall_if.wstall = in_valid && (is_btype || is_jal || is_jalr || (is_gpu && (gpu_op == `GPU_TMC || gpu_op == `GPU_SPLIT || gpu_op == `GPU_BAR))); + assign wstall_if.wstall = valid_in && (is_btype || is_jal || is_jalr || (is_gpu && (gpu_op == `GPU_TMC || gpu_op == `GPU_SPLIT || gpu_op == `GPU_BAR))); assign wstall_if.warp_num = ifetch_rsp_if.warp_num; wire stall = ~decode_if.ready && decode_if.valid; diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 8881413b..20e02bd0 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -77,6 +77,8 @@ `define INST_GPU 7'b1101011 +/////////////////////////////////////////////////////////////////////////////// + `define BYTEEN_SB 3'h0 `define BYTEEN_SH 3'h1 `define BYTEEN_SW 3'h2 @@ -85,6 +87,8 @@ `define BYTEEN_BITS 3 `define BYTEEN_TYPE(x) x[1:0] +/////////////////////////////////////////////////////////////////////////////// + `define BR_EQ 4'h0 `define BR_NE 4'h1 `define BR_LT 4'h2 @@ -101,6 +105,22 @@ `define BR_NO 4'hF `define BR_BITS 4 +/////////////////////////////////////////////////////////////////////////////// + +`define EX_NOP 3'h0 +`define EX_ALU 3'h1 +`define EX_LSU 3'h2 +`define EX_CSR 3'h3 +`define EX_MUL 3'h4 +`define EX_FPU 3'h5 +`define EX_GPU 3'h6 +`define EX_BITS 3 + +`define NUM_EXS 6 +`define NE_BITS `LOG2UP(`NUM_EXS) + +/////////////////////////////////////////////////////////////////////////////// + `define OP_BITS 5 `define ALU_ADD 5'h00 @@ -210,18 +230,6 @@ `define GPU_BITS 3 `define GPU_OP(x) x[`GPU_BITS-1:0] -`define EX_NOP 3'h0 -`define EX_ALU 3'h1 -`define EX_LSU 3'h2 -`define EX_CSR 3'h3 -`define EX_MUL 3'h4 -`define EX_FPU 3'h5 -`define EX_GPU 3'h6 -`define EX_BITS 3 - -`define NUM_EXS 6 -`define NE_BITS `LOG2UP(`NUM_EXS) - /////////////////////////////////////////////////////////////////////////////// `ifdef EXT_M_ENABLE diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index 3ace93f6..81c76419 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -123,25 +123,25 @@ module VX_execute #( assign ebreak = alu_req_if.valid && (alu_req_if.alu_op == `ALU_EBREAK || alu_req_if.alu_op == `ALU_ECALL); - `SCOPE_ASSIGN(scope_decode_valid, decode_if.valid); - `SCOPE_ASSIGN(scope_decode_warp_num, decode_if.warp_num); - `SCOPE_ASSIGN(scope_decode_curr_PC, decode_if.curr_PC); - `SCOPE_ASSIGN(scope_decode_is_jal, decode_if.is_jal); - `SCOPE_ASSIGN(scope_decode_rs1, decode_if.rs1); - `SCOPE_ASSIGN(scope_decode_rs2, decode_if.rs2); + `SCOPE_ASSIGN (scope_decode_valid, decode_if.valid); + `SCOPE_ASSIGN (scope_decode_warp_num, decode_if.warp_num); + `SCOPE_ASSIGN (scope_decode_curr_PC, decode_if.curr_PC); + `SCOPE_ASSIGN (scope_decode_is_jal, decode_if.is_jal); + `SCOPE_ASSIGN (scope_decode_rs1, decode_if.rs1); + `SCOPE_ASSIGN (scope_decode_rs2, decode_if.rs2); - `SCOPE_ASSIGN(scope_execute_valid, alu_req_if.valid); - `SCOPE_ASSIGN(scope_execute_warp_num, alu_req_if.warp_num); - `SCOPE_ASSIGN(scope_execute_curr_PC, alu_req_if.curr_PC); - `SCOPE_ASSIGN(scope_execute_rd, alu_req_if.rd); - `SCOPE_ASSIGN(scope_execute_a, alu_req_if.rs1_data); - `SCOPE_ASSIGN(scope_execute_b, alu_req_if.rs2_data); + `SCOPE_ASSIGN (scope_execute_valid, alu_req_if.valid); + `SCOPE_ASSIGN (scope_execute_warp_num, alu_req_if.warp_num); + `SCOPE_ASSIGN (scope_execute_curr_PC, alu_req_if.curr_PC); + `SCOPE_ASSIGN (scope_execute_rd, alu_req_if.rd); + `SCOPE_ASSIGN (scope_execute_a, alu_req_if.rs1_data); + `SCOPE_ASSIGN (scope_execute_b, alu_req_if.rs2_data); - `SCOPE_ASSIGN(scope_writeback_valid, writeback_if.valid); - `SCOPE_ASSIGN(scope_writeback_warp_num, writeback_if.warp_num); - `SCOPE_ASSIGN(scope_writeback_curr_PC, writeback_if.curr_PC); - `SCOPE_ASSIGN(scope_writeback_wb, writeback_if.wb); - `SCOPE_ASSIGN(scope_writeback_rd, writeback_if.rd); - `SCOPE_ASSIGN(scope_writeback_data, writeback_if.data); + `SCOPE_ASSIGN (scope_writeback_valid, writeback_if.valid); + `SCOPE_ASSIGN (scope_writeback_warp_num, writeback_if.warp_num); + `SCOPE_ASSIGN (scope_writeback_curr_PC, writeback_if.curr_PC); + `SCOPE_ASSIGN (scope_writeback_wb, writeback_if.wb); + `SCOPE_ASSIGN (scope_writeback_rd, writeback_if.rd); + `SCOPE_ASSIGN (scope_writeback_data, writeback_if.data); endmodule diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index 09695bf5..cc8de7dc 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -24,10 +24,10 @@ module VX_fpu_unit #( .clk (clk), .reset (reset), - .in_valid (fpu_req_if.valid), - .in_ready (fpu_req_if.ready), + .valid_in (fpu_req_if.valid), + .ready_in (fpu_req_if.ready), - .in_tag (fpu_req_if.issue_tag), + .tag_in (fpu_req_if.issue_tag), .op (fpu_req_if.fpu_op), .frm (frm), @@ -40,10 +40,10 @@ module VX_fpu_unit #( .has_fflags (fpu_commit_if.has_fflags), .fflags (fpu_commit_if.fflags), - .out_tag (fpu_commit_if.issue_tag), + .tag_out (fpu_commit_if.issue_tag), - .out_ready (fpu_commit_if.ready), - .out_valid (fpu_commit_if.valid) + .ready_out (fpu_commit_if.ready), + .valid_out (fpu_commit_if.valid) ); `else @@ -57,10 +57,10 @@ module VX_fpu_unit #( .clk (clk), .reset (reset), - .in_valid (fpu_req_if.valid), - .in_ready (fpu_req_if.ready), + .valid_in (fpu_req_if.valid), + .ready_in (fpu_req_if.ready), - .in_tag (fpu_req_if.issue_tag), + .tag_in (fpu_req_if.issue_tag), .op (fpu_req_if.fpu_op), .frm (frm), @@ -73,10 +73,10 @@ module VX_fpu_unit #( .has_fflags (fpu_commit_if.has_fflags), .fflags (fpu_commit_if.fflags), - .out_tag (fpu_commit_if.issue_tag), + .tag_out (fpu_commit_if.issue_tag), - .out_ready (fpu_commit_if.ready), - .out_valid (fpu_commit_if.valid) + .ready_out (fpu_commit_if.ready), + .valid_out (fpu_commit_if.valid) ); `endif diff --git a/hw/rtl/VX_gpr_fp_ctrl.v b/hw/rtl/VX_gpr_fp_ctrl.v index 55e3873f..0e9dbe99 100644 --- a/hw/rtl/VX_gpr_fp_ctrl.v +++ b/hw/rtl/VX_gpr_fp_ctrl.v @@ -18,14 +18,14 @@ module VX_gpr_fp_ctrl ( reg [`NUM_THREADS-1:0][31:0] tmp_rs1_data; reg read_rs3; - wire delay = gpr_read_if.valid && gpr_read_if.use_rs3 && ~read_rs3; + wire rs3_delay = gpr_read_if.valid && gpr_read_if.use_rs3 && ~read_rs3; - wire read_fire = gpr_read_if.valid && gpr_read_if.out_ready; + wire read_fire = gpr_read_if.valid && read_rs3; always @(posedge clk) begin if (reset) begin read_rs3 <= 0; - end else if (delay) begin + end else if (rs3_delay) begin read_rs3 <= 1; end else if (read_fire) begin read_rs3 <= 0; @@ -34,14 +34,14 @@ module VX_gpr_fp_ctrl ( // backup original rs1 data always @(posedge clk) begin - if (delay) begin + if (rs3_delay) begin tmp_rs1_data <= rs1_data; end end // outputs assign raddr1 = {gpr_read_if.warp_num, (read_rs3 ? gpr_read_if.rs3 : gpr_read_if.rs1)}; - assign gpr_read_if.in_ready = ~delay; + assign gpr_read_if.ready = ~rs3_delay; assign gpr_read_if.rs1_data = gpr_read_if.use_rs3 ? tmp_rs1_data : rs1_data; assign gpr_read_if.rs2_data = rs2_data; assign gpr_read_if.rs3_data = rs1_data; diff --git a/hw/rtl/VX_gpr_ram.v b/hw/rtl/VX_gpr_ram.v index 00c5f1f0..bceb7d01 100644 --- a/hw/rtl/VX_gpr_ram.v +++ b/hw/rtl/VX_gpr_ram.v @@ -14,22 +14,24 @@ module VX_gpr_ram ( reg [`NUM_THREADS-1:0][3:0][7:0] ram [(`NUM_WARPS * `NUM_REGS)-1:0]; - integer i, j; - initial begin - // initialize r0 to 0 - for (j = 0; j < `NUM_WARPS; j++) begin - for (i = 0; i < `NUM_THREADS; i++) begin - ram[j * `NUM_REGS][i][0] = 8'h0; - ram[j * `NUM_REGS][i][1] = 8'h0; - ram[j * `NUM_REGS][i][2] = 8'h0; - ram[j * `NUM_REGS][i][3] = 8'h0; + // initialize ram + for (integer j = 0; j < `NUM_WARPS; j++) begin + for (integer i = 0; i < `NUM_REGS; i++) begin + if (i == 0) begin + ram[j * `NUM_REGS + i] = {`NUM_THREADS{32'h00000000}}; // set r0 = 0 + end + `ifndef SYNTHESIS + else begin + ram[j * `NUM_REGS + i] = {`NUM_THREADS{32'hdeadbeef}}; + end + `endif end end end always @(posedge clk) begin - for (i = 0; i < `NUM_THREADS; i++) begin + for (integer i = 0; i < `NUM_THREADS; i++) begin if (we[i]) begin ram[waddr][i][0] <= wdata[i][07:00]; ram[waddr][i][1] <= wdata[i][15:08]; diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index 9e49ca57..8a05137f 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -43,18 +43,16 @@ module VX_gpr_stage #( assign gpr_read_if.rs1_data = rs1_data; assign gpr_read_if.rs2_data = rs2_data; assign gpr_read_if.rs3_data = 0; - assign gpr_read_if.in_ready = 1; + assign gpr_read_if.ready = 1; wire valid = gpr_read_if.valid; - wire out_ready = gpr_read_if.out_ready; wire use_rs3 = gpr_read_if.use_rs3; wire [`NR_BITS-1:0] rs3 = gpr_read_if.rs3; `UNUSED_VAR (valid); - `UNUSED_VAR (out_ready); `UNUSED_VAR (use_rs3); `UNUSED_VAR (rs3); `endif - assign writeback_if.ready = 1'b1; + assign writeback_if.ready = 1'b1; // writes are stall-free endmodule diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index fad3fcd4..276a24a2 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -60,16 +60,16 @@ module VX_icache_stage #( // Can accept new response? assign icache_rsp_if.ready = ifetch_rsp_if.ready; - `SCOPE_ASSIGN(scope_icache_req_valid, icache_req_if.valid); - `SCOPE_ASSIGN(scope_icache_req_warp_num, ifetch_req_if.warp_num); - `SCOPE_ASSIGN(scope_icache_req_addr, {icache_req_if.addr, 2'b0}); - `SCOPE_ASSIGN(scope_icache_req_tag, icache_req_if.tag); - `SCOPE_ASSIGN(scope_icache_req_ready, icache_req_if.ready); + `SCOPE_ASSIGN (scope_icache_req_valid, icache_req_if.valid); + `SCOPE_ASSIGN (scope_icache_req_warp_num, ifetch_req_if.warp_num); + `SCOPE_ASSIGN (scope_icache_req_addr, {icache_req_if.addr, 2'b0}); + `SCOPE_ASSIGN (scope_icache_req_tag, icache_req_if.tag); + `SCOPE_ASSIGN (scope_icache_req_ready, icache_req_if.ready); - `SCOPE_ASSIGN(scope_icache_rsp_valid, icache_rsp_if.valid); - `SCOPE_ASSIGN(scope_icache_rsp_data, icache_rsp_if.data); - `SCOPE_ASSIGN(scope_icache_rsp_tag, icache_rsp_if.tag); - `SCOPE_ASSIGN(scope_icache_rsp_ready, icache_rsp_if.ready); + `SCOPE_ASSIGN (scope_icache_rsp_valid, icache_rsp_if.valid); + `SCOPE_ASSIGN (scope_icache_rsp_data, icache_rsp_if.data); + `SCOPE_ASSIGN (scope_icache_rsp_tag, icache_rsp_if.tag); + `SCOPE_ASSIGN (scope_icache_rsp_ready, icache_rsp_if.ready); `ifdef DBG_PRINT_CORE_ICACHE always @(posedge clk) begin diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 3c1b9171..295434cd 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -17,26 +17,27 @@ module VX_issue #( VX_fpu_req_if fpu_req_if, VX_gpu_req_if gpu_req_if ); + + wire [`ISTAG_BITS-1:0] issue_tag; + wire schedule_delay; + VX_gpr_read_if gpr_read_if(); - assign gpr_read_if.valid = decode_if.valid; + assign gpr_read_if.valid = decode_if.valid && ~schedule_delay; assign gpr_read_if.warp_num = decode_if.warp_num; assign gpr_read_if.rs1 = decode_if.rs1; assign gpr_read_if.rs2 = decode_if.rs2; assign gpr_read_if.rs3 = decode_if.rs3; assign gpr_read_if.use_rs3 = decode_if.use_rs3; - assign gpr_read_if.out_ready = decode_if.ready; - - wire [`ISTAG_BITS-1:0] issue_tag, issue_tmp_tag; - - wire schedule_delay; - - wire gpr_busy = ~gpr_read_if.in_ready; wire ex_busy = (~alu_req_if.ready && (decode_if.ex_type == `EX_ALU)) || (~lsu_req_if.ready && (decode_if.ex_type == `EX_LSU)) || (~csr_req_if.ready && (decode_if.ex_type == `EX_CSR)) + `ifdef EXT_M_ENABLE || (~mul_req_if.ready && (decode_if.ex_type == `EX_MUL)) + `endif + `ifdef EXT_F_ENABLE || (~fpu_req_if.ready && (decode_if.ex_type == `EX_FPU)) + `endif || (~gpu_req_if.ready && (decode_if.ex_type == `EX_GPU)); VX_scheduler #( @@ -47,8 +48,7 @@ module VX_issue #( .decode_if (decode_if), .writeback_if (writeback_if), .cmt_to_issue_if(cmt_to_issue_if), - .ex_busy (ex_busy), - .gpr_busy (gpr_busy), + .ex_busy (ex_busy), .issue_tag (issue_tag), .schedule_delay (schedule_delay) ); @@ -62,56 +62,117 @@ module VX_issue #( .gpr_read_if (gpr_read_if) ); - VX_decode_if decode_tmp_if(); - VX_gpr_read_if gpr_read_tmp_if(); - - wire stall = schedule_delay; - wire flush = schedule_delay && ~ex_busy; - - VX_generic_register #( - .N(1 + `ISTAG_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + `FRM_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32)) - ) issue_reg ( - .clk (clk), - .reset (reset), - .stall (stall), - .flush (flush), - .in ({decode_if.valid, issue_tag, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.frm, gpr_read_if.rs1_data, gpr_read_if.rs2_data, gpr_read_if.rs3_data}), - .out ({decode_tmp_if.valid, issue_tmp_tag, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.frm, gpr_read_tmp_if.rs1_data, gpr_read_tmp_if.rs2_data, gpr_read_tmp_if.rs3_data}) - ); - - assign decode_if.ready = ~stall; + VX_alu_req_if alu_req_tmp_if(); + VX_lsu_req_if lsu_req_tmp_if(); + VX_csr_req_if csr_req_tmp_if(); + VX_mul_req_if mul_req_tmp_if(); + VX_fpu_req_if fpu_req_tmp_if(); + VX_gpu_req_if gpu_req_tmp_if(); VX_issue_demux issue_demux ( - .decode_if (decode_tmp_if), - .gpr_read_if(gpr_read_tmp_if), - .issue_tag (issue_tmp_tag), - .alu_req_if (alu_req_if), - .lsu_req_if (lsu_req_if), - .csr_req_if (csr_req_if), - .mul_req_if (mul_req_if), - .fpu_req_if (fpu_req_if), - .gpu_req_if (gpu_req_if) - ); + .decode_if (decode_if), + .gpr_read_if(gpr_read_if), + .issue_tag (issue_tag), + .alu_req_if (alu_req_tmp_if), + .lsu_req_if (lsu_req_tmp_if), + .csr_req_if (csr_req_tmp_if), + .mul_req_if (mul_req_tmp_if), + .fpu_req_if (fpu_req_tmp_if), + .gpu_req_if (gpu_req_tmp_if) + ); + + wire stall = schedule_delay || ~gpr_read_if.ready; + assign decode_if.ready = ~stall; + + VX_generic_register #( + .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + `ALU_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32 + 32) + ) alu_reg ( + .clk (clk), + .reset (reset), + .stall (~alu_req_if.ready), + .flush (stall && alu_req_if.ready), + .in ({alu_req_tmp_if.valid, alu_req_tmp_if.issue_tag, alu_req_tmp_if.warp_num, alu_req_tmp_if.curr_PC, alu_req_tmp_if.thread_mask, alu_req_tmp_if.alu_op, alu_req_tmp_if.rs1_data, alu_req_tmp_if.rs2_data, alu_req_tmp_if.offset, alu_req_tmp_if.next_PC}), + .out ({alu_req_if.valid, alu_req_if.issue_tag, alu_req_if.warp_num, alu_req_if.curr_PC, alu_req_if.thread_mask, alu_req_if.alu_op, alu_req_if.rs1_data, alu_req_if.rs2_data, alu_req_if.offset, alu_req_if.next_PC}) + ); + + VX_generic_register #( + .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + 1 + `BYTEEN_BITS + (`NUM_THREADS * 32) + 32 + (`NUM_THREADS * 32) + `NR_BITS + 1) + ) lsu_reg ( + .clk (clk), + .reset (reset), + .stall (~lsu_req_if.ready), + .flush (stall && lsu_req_if.ready), + .in ({lsu_req_tmp_if.valid, lsu_req_tmp_if.issue_tag, lsu_req_tmp_if.warp_num, lsu_req_tmp_if.curr_PC, lsu_req_tmp_if.thread_mask, lsu_req_tmp_if.rw, lsu_req_tmp_if.byteen, lsu_req_tmp_if.base_addr, lsu_req_tmp_if.offset, lsu_req_tmp_if.store_data, lsu_req_tmp_if.rd, lsu_req_tmp_if.wb}), + .out ({lsu_req_if.valid, lsu_req_if.issue_tag, lsu_req_if.warp_num, lsu_req_if.curr_PC, lsu_req_if.thread_mask, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data, lsu_req_if.rd, lsu_req_if.wb}) + ); + + VX_generic_register #( + .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + `CSR_BITS + `CSR_ADDR_BITS + 32 + 1) + ) csr_reg ( + .clk (clk), + .reset (reset), + .stall (~csr_req_if.ready), + .flush (stall && csr_req_if.ready), + .in ({csr_req_tmp_if.valid, csr_req_tmp_if.issue_tag, csr_req_tmp_if.warp_num, csr_req_tmp_if.curr_PC, csr_req_tmp_if.thread_mask, csr_req_tmp_if.csr_op, csr_req_tmp_if.csr_addr, csr_req_tmp_if.csr_mask, csr_req_tmp_if.is_io}), + .out ({csr_req_if.valid, csr_req_if.issue_tag, csr_req_if.warp_num, csr_req_if.curr_PC, csr_req_if.thread_mask, csr_req_if.csr_op, csr_req_if.csr_addr, csr_req_if.csr_mask, csr_req_if.is_io}) + ); + +`ifdef EXT_M_ENABLE + VX_generic_register #( + .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + `MUL_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32)) + ) mul_reg ( + .clk (clk), + .reset (reset), + .stall (~mul_req_if.ready), + .flush (stall && mul_req_if.ready), + .in ({mul_req_tmp_if.valid, mul_req_tmp_if.issue_tag, mul_req_tmp_if.warp_num, mul_req_tmp_if.curr_PC, mul_req_tmp_if.thread_mask, mul_req_tmp_if.mul_op, mul_req_tmp_if.rs1_data, mul_req_tmp_if.rs2_data}), + .out ({mul_req_if.valid, mul_req_if.issue_tag, mul_req_if.warp_num, mul_req_if.curr_PC, mul_req_if.thread_mask, mul_req_if.mul_op, mul_req_if.rs1_data, mul_req_if.rs2_data}) + ); +`endif + +`ifdef EXT_F_ENABLE + VX_generic_register #( + .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + `FPU_BITS + `FRM_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32)) + ) fpu_reg ( + .clk (clk), + .reset (reset), + .stall (~fpu_req_if.ready), + .flush (stall && fpu_req_if.ready), + .in ({fpu_req_tmp_if.valid, fpu_req_tmp_if.issue_tag, fpu_req_tmp_if.warp_num, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.thread_mask, fpu_req_tmp_if.fpu_op, fpu_req_tmp_if.frm, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data, fpu_req_tmp_if.rs3_data}), + .out ({fpu_req_if.valid, fpu_req_if.issue_tag, fpu_req_if.warp_num, fpu_req_if.curr_PC, fpu_req_if.thread_mask, fpu_req_if.fpu_op, fpu_req_if.frm, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}) + ); +`endif + + VX_generic_register #( + .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + `GPU_BITS + (`NUM_THREADS * 32) + 32 + 32) + ) gpu_reg ( + .clk (clk), + .reset (reset), + .stall (~gpu_req_if.ready), + .flush (stall && gpu_req_if.ready), + .in ({gpu_req_tmp_if.valid, gpu_req_tmp_if.issue_tag, gpu_req_tmp_if.warp_num, gpu_req_tmp_if.curr_PC, gpu_req_tmp_if.thread_mask, gpu_req_tmp_if.gpu_op, gpu_req_tmp_if.rs1_data, gpu_req_tmp_if.rs2_data, gpu_req_tmp_if.next_PC}), + .out ({gpu_req_if.valid, gpu_req_if.issue_tag, gpu_req_if.warp_num, gpu_req_if.curr_PC, gpu_req_if.thread_mask, gpu_req_if.gpu_op, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.next_PC}) + ); `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (alu_req_if.valid && alu_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, wb=%d, rd=%0d, rs1_data=%0h, rs2_data=%0h, offset=%0h, next_PC=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, alu_req_if.rs1_data, alu_req_if.rs2_data, alu_req_if.offset, alu_req_if.next_PC); + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h, offset=%0h, next_PC=%0h", $time, CORE_ID, alu_req_if.warp_num, alu_req_if.curr_PC, alu_req_if.issue_tag, alu_req_if.thread_mask, alu_req_if.rs1_data, alu_req_if.rs2_data, alu_req_if.offset, alu_req_if.next_PC); end if (lsu_req_if.valid && lsu_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, wb=%0b, rd=%0d, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data); + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.warp_num, lsu_req_if.curr_PC, lsu_req_if.issue_tag, lsu_req_if.thread_mask, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data); end if (csr_req_if.valid && csr_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, wb=%d, rd=%0d, addr=%0h, mask=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, csr_req_if.csr_addr, csr_req_if.csr_mask); + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_if.warp_num, csr_req_if.curr_PC, csr_req_if.issue_tag, csr_req_if.thread_mask, csr_req_if.csr_addr, csr_req_if.csr_mask); end if (mul_req_if.valid && mul_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=MUL, istag=%0d, tmask=%b, wb=%d, rd=%0d, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, mul_req_if.rs1_data, mul_req_if.rs2_data); + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=MUL, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, mul_req_if.warp_num, mul_req_if.curr_PC, mul_req_if.issue_tag, mul_req_if.thread_mask, mul_req_if.rs1_data, mul_req_if.rs2_data); end if (fpu_req_if.valid && fpu_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, wb=%d, rd=%0d, frm=%0h, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, fpu_req_if.frm, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data); + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.warp_num, fpu_req_if.curr_PC, fpu_req_if.issue_tag, fpu_req_if.thread_mask, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data); end if (gpu_req_if.valid && gpu_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=GPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, issue_tmp_tag, decode_tmp_if.thread_mask, gpu_req_if.rs1_data, gpu_req_if.rs2_data); + $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=GPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.warp_num, gpu_req_if.curr_PC, gpu_req_if.issue_tag, gpu_req_if.thread_mask, gpu_req_if.rs1_data, gpu_req_if.rs2_data); end end `endif diff --git a/hw/rtl/VX_issue_demux.v b/hw/rtl/VX_issue_demux.v index 5e5a9672..44c4433b 100644 --- a/hw/rtl/VX_issue_demux.v +++ b/hw/rtl/VX_issue_demux.v @@ -32,11 +32,11 @@ module VX_issue_demux ( assign lsu_req_if.issue_tag = issue_tag; assign lsu_req_if.warp_num = decode_if.warp_num; assign lsu_req_if.curr_PC = decode_if.curr_PC; + assign lsu_req_if.rw = `LSU_RW(decode_if.ex_op); + assign lsu_req_if.byteen = `LSU_BE(decode_if.ex_op); assign lsu_req_if.base_addr = gpr_read_if.rs1_data; assign lsu_req_if.store_data = gpr_read_if.rs2_data; assign lsu_req_if.offset = decode_if.imm; - assign lsu_req_if.rw = `LSU_RW(decode_if.ex_op); - assign lsu_req_if.byteen = `LSU_BE(decode_if.ex_op); assign lsu_req_if.rd = decode_if.rd; assign lsu_req_if.wb = decode_if.wb; @@ -54,6 +54,8 @@ module VX_issue_demux ( `ifdef EXT_M_ENABLE assign mul_req_if.valid = decode_if.valid && (decode_if.ex_type == `EX_MUL); assign mul_req_if.issue_tag = issue_tag; + assign mul_req_if.warp_num = decode_if.warp_num; + assign mul_req_if.curr_PC = decode_if.curr_PC; assign mul_req_if.mul_op = `MUL_OP(decode_if.ex_op); assign mul_req_if.rs1_data = gpr_read_if.rs1_data; assign mul_req_if.rs2_data = gpr_read_if.rs2_data; @@ -64,11 +66,12 @@ module VX_issue_demux ( assign fpu_req_if.valid = decode_if.valid && (decode_if.ex_type == `EX_FPU); assign fpu_req_if.issue_tag = issue_tag; assign fpu_req_if.warp_num = decode_if.warp_num; + assign fpu_req_if.curr_PC = decode_if.curr_PC; assign fpu_req_if.fpu_op = `FPU_OP(decode_if.ex_op); + assign fpu_req_if.frm = decode_if.frm; assign fpu_req_if.rs1_data = gpr_read_if.rs1_data; assign fpu_req_if.rs2_data = gpr_read_if.rs2_data; - assign fpu_req_if.rs3_data = gpr_read_if.rs3_data; - assign fpu_req_if.frm = decode_if.frm; + assign fpu_req_if.rs3_data = gpr_read_if.rs3_data; `endif // GPU unit @@ -76,6 +79,7 @@ module VX_issue_demux ( assign gpu_req_if.thread_mask = decode_if.thread_mask; assign gpu_req_if.issue_tag = issue_tag; assign gpu_req_if.warp_num = decode_if.warp_num; + assign gpu_req_if.curr_PC = decode_if.curr_PC; assign gpu_req_if.gpu_op = `GPU_OP(decode_if.ex_op); assign gpu_req_if.rs1_data = gpr_read_if.rs1_data; assign gpu_req_if.rs2_data = gpr_read_if.rs2_data[0]; diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index bf086fbd..87acc21b 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -68,12 +68,12 @@ module VX_lsu_unit #( assign mem_req_offset[i] = full_address[i][1:0]; assign mem_req_byteen[i] = wmask << full_address[i][1:0]; assign mem_req_data[i] = lsu_req_if.store_data[i] << {mem_req_offset[i], 3'b0}; - end + end - wire stall_in = ~dcache_req_if.ready; + wire stall_in = ~dcache_req_if.ready && use_valid; // Can accept new request? - assign lsu_req_if.ready = ~stall_in; + assign lsu_req_if.ready = ~stall_in; `IGNORE_WARNINGS_BEGIN wire [`NUM_THREADS-1:0][31:0] use_address; @@ -167,20 +167,20 @@ module VX_lsu_unit #( assign dcache_rsp_if.ready = lsu_commit_if.ready && ~is_store_rsp; // STORE has priority // scope registration - `SCOPE_ASSIGN(scope_dcache_req_valid, dcache_req_if.valid); - `SCOPE_ASSIGN(scope_dcache_req_addr, use_address); - `SCOPE_ASSIGN(scope_dcache_req_rw, dcache_req_if.rw ); - `SCOPE_ASSIGN(scope_dcache_req_byteen,dcache_req_if.byteen); - `SCOPE_ASSIGN(scope_dcache_req_data, dcache_req_if.data); - `SCOPE_ASSIGN(scope_dcache_req_tag, dcache_req_if.tag); - `SCOPE_ASSIGN(scope_dcache_req_ready, dcache_req_if.ready); - `SCOPE_ASSIGN(scope_dcache_req_warp_num, use_warp_num); - `SCOPE_ASSIGN(scope_dcache_req_curr_PC, use_pc); + `SCOPE_ASSIGN (scope_dcache_req_valid, dcache_req_if.valid); + `SCOPE_ASSIGN (scope_dcache_req_addr, use_address); + `SCOPE_ASSIGN (scope_dcache_req_rw, dcache_req_if.rw ); + `SCOPE_ASSIGN (scope_dcache_req_byteen,dcache_req_if.byteen); + `SCOPE_ASSIGN (scope_dcache_req_data, dcache_req_if.data); + `SCOPE_ASSIGN (scope_dcache_req_tag, dcache_req_if.tag); + `SCOPE_ASSIGN (scope_dcache_req_ready, dcache_req_if.ready); + `SCOPE_ASSIGN (scope_dcache_req_warp_num, use_warp_num); + `SCOPE_ASSIGN (scope_dcache_req_curr_PC, use_pc); - `SCOPE_ASSIGN(scope_dcache_rsp_valid, dcache_rsp_if.valid); - `SCOPE_ASSIGN(scope_dcache_rsp_data, dcache_rsp_if.data); - `SCOPE_ASSIGN(scope_dcache_rsp_tag, dcache_rsp_if.tag); - `SCOPE_ASSIGN(scope_dcache_rsp_ready, dcache_rsp_if.ready); + `SCOPE_ASSIGN (scope_dcache_rsp_valid, dcache_rsp_if.valid); + `SCOPE_ASSIGN (scope_dcache_rsp_data, dcache_rsp_if.data); + `SCOPE_ASSIGN (scope_dcache_rsp_tag, dcache_rsp_if.tag); + `SCOPE_ASSIGN (scope_dcache_rsp_ready, dcache_rsp_if.ready); `UNUSED_VAR (mem_rsp_warp_num) `UNUSED_VAR (mem_rsp_curr_PC) diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v index 36b46bba..394ebe06 100644 --- a/hw/rtl/VX_mul_unit.v +++ b/hw/rtl/VX_mul_unit.v @@ -21,13 +21,13 @@ module VX_mul_unit #( wire stall_mul, stall_div; - wire is_mul_op = (alu_op == `MUL_MUL); - wire is_div_op = (alu_op == `MUL_DIV || alu_op == `MUL_DIVU); - - reg [`NUM_THREADS-1:0] is_div_op_in; - wire [`NUM_THREADS-1:0] is_div_op_out; - wire is_mul_op_out; + wire is_mul_mul = (alu_op == `MUL_MUL); + wire is_mul_mul_out; + wire is_div_divu = (alu_op == `MUL_DIV || alu_op == `MUL_DIVU); + reg [`NUM_THREADS-1:0] is_div_divu_qual; + wire [`NUM_THREADS-1:0] is_div_divu_out; + genvar i; for (i = 0; i < `NUM_THREADS; i++) begin @@ -39,16 +39,16 @@ module VX_mul_unit #( // handle divide by zero always @(*) begin - is_div_op_in[i] = is_div_op; + is_div_divu_qual[i] = is_div_divu; div_in1 = {(alu_op == `MUL_DIV || alu_op == `MUL_REM) & alu_in1[i][31], alu_in1[i]}; div_in2 = {(alu_op == `MUL_DIV || alu_op == `MUL_REM) & alu_in2[i][31], alu_in2[i]}; if (0 == alu_in2[i]) begin - if (is_div_op) begin + if (is_div_divu) begin div_in1 = {1'b0, 32'hFFFFFFFF}; // quotient = (0xFFFFFFFF / 1) div_in2 = 1; end else begin - is_div_op_in[i] = 1; // remainder = (in1 / 1) + is_div_divu_qual[i] = 1; // remainder = (in1 / 1) div_in2 = 1; end end @@ -91,10 +91,13 @@ module VX_mul_unit #( .remainder(rem_result_tmp) ); - assign mul_result[i] = is_mul_op_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32]; - assign div_result[i] = is_div_op_out[i] ? div_result_tmp : rem_result_tmp; + assign mul_result[i] = is_mul_mul_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32]; + assign div_result[i] = is_div_divu_out[i] ? div_result_tmp : rem_result_tmp; end + wire is_mul_fire = alu_req_if.valid && alu_req_if.ready && ~`IS_DIV_OP(alu_op); + wire is_div_fire = alu_req_if.valid && alu_req_if.ready && `IS_DIV_OP(alu_op); + wire mul_valid_out; wire div_valid_out; @@ -108,8 +111,8 @@ module VX_mul_unit #( .clk(clk), .reset(reset), .enable(~stall_mul), - .in({alu_req_if.valid && ~`IS_DIV_OP(alu_op), alu_req_if.issue_tag, is_mul_op}), - .out({mul_valid_out, mul_issue_tag, is_mul_op_out}) + .in({is_mul_fire, alu_req_if.issue_tag, is_mul_mul}), + .out({mul_valid_out, mul_issue_tag, is_mul_mul_out}) ); VX_shift_register #( @@ -119,8 +122,8 @@ module VX_mul_unit #( .clk(clk), .reset(reset), .enable(~stall_div), - .in({alu_req_if.valid && `IS_DIV_OP(alu_op), alu_req_if.issue_tag, is_div_op_in}), - .out({div_valid_out, div_issue_tag, is_div_op_out}) + .in({is_div_fire, alu_req_if.issue_tag, is_div_divu_qual}), + .out({div_valid_out, div_issue_tag, is_div_divu_out}) ); wire stall_out = (~alu_commit_if.ready && alu_commit_if.valid); diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index 79219a1a..efc66237 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -245,10 +245,10 @@ module VX_pipeline #( assign core_icache_rsp_if.tag = icache_rsp_tag; assign icache_rsp_ready = core_icache_rsp_if.ready; - `SCOPE_ASSIGN(scope_busy, busy); - `SCOPE_ASSIGN(scope_schedule_delay, schedule_delay); - `SCOPE_ASSIGN(scope_mem_delay, mem_delay); - `SCOPE_ASSIGN(scope_exec_delay, exec_delay); - `SCOPE_ASSIGN(scope_gpr_stage_delay, gpr_delay); + `SCOPE_ASSIGN (scope_busy, busy); + `SCOPE_ASSIGN (scope_schedule_delay, schedule_delay); + `SCOPE_ASSIGN (scope_mem_delay, mem_delay); + `SCOPE_ASSIGN (scope_exec_delay, exec_delay); + `SCOPE_ASSIGN (scope_gpr_stage_delay, gpr_delay); endmodule diff --git a/hw/rtl/VX_scheduler.v b/hw/rtl/VX_scheduler.v index d5c9ed08..a3bfa32a 100644 --- a/hw/rtl/VX_scheduler.v +++ b/hw/rtl/VX_scheduler.v @@ -10,11 +10,10 @@ module VX_scheduler #( VX_wb_if writeback_if, VX_cmt_to_issue_if cmt_to_issue_if, input wire ex_busy, - input wire gpr_busy, output wire [`ISTAG_BITS-1:0] issue_tag, output wire schedule_delay ); - localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1); + localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1); reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0]; reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0]; @@ -23,13 +22,13 @@ module VX_scheduler #( wire issue_buf_full; - wire stall = gpr_busy || ex_busy || inuse_hazard || issue_buf_full; + assign schedule_delay = ex_busy || inuse_hazard || issue_buf_full; - wire issue_fire = decode_if.valid && ~stall; + wire issue_fire = decode_if.valid && decode_if.ready; + + wire writeback_fire = writeback_if.valid && writeback_if.ready; wire acquire_rd = issue_fire && (decode_if.wb != 0); - - wire release_rd = writeback_if.valid; wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[{writeback_if.warp_num, writeback_if.rd}] & ~writeback_if.thread_mask; @@ -46,7 +45,7 @@ module VX_scheduler #( inuse_registers[{decode_if.warp_num, decode_if.rd}] <= decode_if.thread_mask; inuse_reg_mask[decode_if.warp_num][decode_if.rd] <= 1; end - if (release_rd) begin + if (writeback_fire) begin assert(inuse_reg_mask[writeback_if.warp_num][writeback_if.rd] != 0); inuse_registers[{writeback_if.warp_num, writeback_if.rd}] <= inuse_registers_n; inuse_reg_mask[writeback_if.warp_num][writeback_if.rd] <= (| inuse_registers_n); @@ -59,25 +58,23 @@ module VX_scheduler #( .SIZE (`ISSUEQ_SIZE), .RPORTS (`NUM_EXS) ) issue_buffer ( - .clk (clk), - .reset (reset), - .write_data ({decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.wb}), - .write_addr (issue_tag), - .acquire_slot (issue_fire), - .release_slot ({cmt_to_issue_if.alu_valid, cmt_to_issue_if.lsu_valid, cmt_to_issue_if.csr_valid, cmt_to_issue_if.mul_valid, cmt_to_issue_if.fpu_valid, cmt_to_issue_if.gpu_valid}), - .read_addr ({cmt_to_issue_if.alu_tag, cmt_to_issue_if.lsu_tag, cmt_to_issue_if.csr_tag, cmt_to_issue_if.mul_tag, cmt_to_issue_if.fpu_tag, cmt_to_issue_if.gpu_tag}), - .read_data ({cmt_to_issue_if.alu_data, cmt_to_issue_if.lsu_data, cmt_to_issue_if.csr_data, cmt_to_issue_if.mul_data, cmt_to_issue_if.fpu_data, cmt_to_issue_if.gpu_data}), - .full (issue_buf_full) + .clk (clk), + .reset (reset), + .write_data ({decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.wb}), + .write_addr (issue_tag), + .acquire_slot (issue_fire), + .release_slot ({cmt_to_issue_if.alu_valid, cmt_to_issue_if.lsu_valid, cmt_to_issue_if.csr_valid, cmt_to_issue_if.mul_valid, cmt_to_issue_if.fpu_valid, cmt_to_issue_if.gpu_valid}), + .read_addr ({cmt_to_issue_if.alu_tag, cmt_to_issue_if.lsu_tag, cmt_to_issue_if.csr_tag, cmt_to_issue_if.mul_tag, cmt_to_issue_if.fpu_tag, cmt_to_issue_if.gpu_tag}), + .read_data ({cmt_to_issue_if.alu_data, cmt_to_issue_if.lsu_data, cmt_to_issue_if.csr_data, cmt_to_issue_if.mul_data, cmt_to_issue_if.fpu_data, cmt_to_issue_if.gpu_data}), + .full (issue_buf_full) ); - assign schedule_delay = stall; - `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin - if (decode_if.valid && stall) begin - $display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, ib_full=%b, inuse=%b%b%b%b, ex_busy=%b, gpr_busy=%b", + if (decode_if.valid && ~decode_if.ready) begin + $display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, ib_full=%b, inuse=%b%b%b%b, ex_busy=%b", $time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, issue_buf_full, - inuse_mask[decode_if.rd], inuse_mask[decode_if.rs1], inuse_mask[decode_if.rs2], inuse_mask[decode_if.rs3], ex_busy, gpr_busy); + inuse_mask[decode_if.rd], inuse_mask[decode_if.rs1], inuse_mask[decode_if.rs2], inuse_mask[decode_if.rs3], ex_busy); end end `endif diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 2da93479..717e9cf4 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -92,6 +92,15 @@ module VX_writeback #( wb_warp_num_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.warp_num; wb_curr_PC_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.curr_PC; wb_rd_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.rd; + end + + if (gpu_commit_if.valid) begin + wb_valid_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.wb; + wb_thread_mask_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.thread_mask; + wb_data_table_n [gpu_commit_if.issue_tag] = gpu_commit_if.data; + wb_warp_num_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.warp_num; + wb_curr_PC_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.curr_PC; + wb_rd_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.rd; end end diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 57992793..73f955a9 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -763,18 +763,18 @@ module VX_bank #( end `endif -`SCOPE_ASSIGN(scope_bank_valid_st0, qual_valid_st0); -`SCOPE_ASSIGN(scope_bank_valid_st1, valid_st1e); -`SCOPE_ASSIGN(scope_bank_valid_st2, valid_st2); +`SCOPE_ASSIGN (scope_bank_valid_st0, qual_valid_st0); +`SCOPE_ASSIGN (scope_bank_valid_st1, valid_st1e); +`SCOPE_ASSIGN (scope_bank_valid_st2, valid_st2); -`SCOPE_ASSIGN(scope_bank_is_mrvq_st1, is_mrvq_st1e); -`SCOPE_ASSIGN(scope_bank_miss_st1, miss_st1e); -`SCOPE_ASSIGN(scope_bank_dirty_st1, dirty_st1e); -`SCOPE_ASSIGN(scope_bank_force_miss_st1, force_request_miss_st1e); -`SCOPE_ASSIGN(scope_bank_stall_pipe, stall_bank_pipe); +`SCOPE_ASSIGN (scope_bank_is_mrvq_st1, is_mrvq_st1e); +`SCOPE_ASSIGN (scope_bank_miss_st1, miss_st1e); +`SCOPE_ASSIGN (scope_bank_dirty_st1, dirty_st1e); +`SCOPE_ASSIGN (scope_bank_force_miss_st1, force_request_miss_st1e); +`SCOPE_ASSIGN (scope_bank_stall_pipe, stall_bank_pipe); -`SCOPE_ASSIGN(scope_bank_addr_st0, `LINE_TO_BYTE_ADDR(qual_addr_st0, BANK_ID)); -`SCOPE_ASSIGN(scope_bank_addr_st1, `LINE_TO_BYTE_ADDR(addr_st1e, BANK_ID)); -`SCOPE_ASSIGN(scope_bank_addr_st2, `LINE_TO_BYTE_ADDR(addr_st2, BANK_ID)); +`SCOPE_ASSIGN (scope_bank_addr_st0, `LINE_TO_BYTE_ADDR(qual_addr_st0, BANK_ID)); +`SCOPE_ASSIGN (scope_bank_addr_st1, `LINE_TO_BYTE_ADDR(addr_st1e, BANK_ID)); +`SCOPE_ASSIGN (scope_bank_addr_st2, `LINE_TO_BYTE_ADDR(addr_st2, BANK_ID)); endmodule diff --git a/hw/rtl/cache/VX_tag_data_store.v b/hw/rtl/cache/VX_tag_data_store.v index 72a62a7b..e0f356cc 100644 --- a/hw/rtl/cache/VX_tag_data_store.v +++ b/hw/rtl/cache/VX_tag_data_store.v @@ -44,10 +44,9 @@ module VX_tag_data_store #( wire do_write = (| write_enable); - integer i, j; always @(posedge clk) begin if (reset) begin - for (i = 0; i < `BANK_LINE_COUNT; i++) begin + for (integer i = 0; i < `BANK_LINE_COUNT; i++) begin valid[i] <= 0; dirty[i] <= 0; end @@ -71,10 +70,10 @@ module VX_tag_data_store #( valid[write_addr] <= 0; end - for (i = 0; i < `BANK_LINE_WORDS; i++) begin - for (j = 0; j < WORD_SIZE; j++) begin - if (write_enable[i][j]) begin - data[write_addr][i][j] <= write_data[i * `WORD_WIDTH + j * 8 +: 8]; + for (integer j = 0; j < `BANK_LINE_WORDS; j++) begin + for (integer i = 0; i < WORD_SIZE; i++) begin + if (write_enable[j][i]) begin + data[write_addr][j][i] <= write_data[j * `WORD_WIDTH + i * 8 +: 8]; end end end diff --git a/hw/rtl/fp_cores/VX_fp_fpga.v b/hw/rtl/fp_cores/VX_fp_fpga.v index e2142417..fdce1d67 100644 --- a/hw/rtl/fp_cores/VX_fp_fpga.v +++ b/hw/rtl/fp_cores/VX_fp_fpga.v @@ -5,10 +5,10 @@ module VX_fp_fpga ( input wire clk, input wire reset, - input wire in_valid, - output wire in_ready, + input wire valid_in, + output wire ready_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`FPU_BITS-1:0] op, input wire [`FRM_BITS-1:0] frm, @@ -21,21 +21,22 @@ module VX_fp_fpga ( output wire has_fflags, output fflags_t [`NUM_THREADS-1:0] fflags, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); localparam NUM_FPC = 12; localparam FPC_BITS = `LOG2UP(NUM_FPC); - wire [NUM_FPC-1:0] core_in_ready; - wire [NUM_FPC-1:0][`NUM_THREADS-1:0][31:0] core_result; + wire [NUM_FPC-1:0] per_core_ready_in; + wire [NUM_FPC-1:0][`NUM_THREADS-1:0][31:0] per_core_result; + wire [NUM_FPC-1:0][`ISTAG_BITS-1:0] per_core_tag_out; + wire [NUM_FPC-1:0] per_core_ready_out; + wire [NUM_FPC-1:0] per_core_valid_out; + wire fpnew_has_fflags; fflags_t fpnew_fflags; - wire [NUM_FPC-1:0][`ISTAG_BITS-1:0] core_out_tag; - wire [NUM_FPC-1:0] core_out_ready; - wire [NUM_FPC-1:0] core_out_valid; reg [FPC_BITS-1:0] core_select; reg fmadd_negate; @@ -66,172 +67,172 @@ module VX_fp_fpga ( VX_fp_noncomp fp_noncomp ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 0)), - .in_ready (core_in_ready[0]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 0)), + .ready_in (per_core_ready_in[0]), + .tag_in (tag_in), .op (op), .frm (frm), .dataa (dataa), .datab (datab), - .result (core_result[0]), + .result (per_core_result[0]), .has_fflags (fpnew_has_fflags), .fflags (fpnew_fflags), - .out_tag (core_out_tag[0]), - .out_ready (core_out_ready[0]), - .out_valid (core_out_valid[0]) + .tag_out (per_core_tag_out[0]), + .ready_out (per_core_ready_out[0]), + .valid_out (per_core_valid_out[0]) ); VX_fp_add fp_add ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 1)), - .in_ready (core_in_ready[1]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 1)), + .ready_in (per_core_ready_in[1]), + .tag_in (tag_in), .dataa (dataa), .datab (datab), - .result (core_result[1]), - .out_tag (core_out_tag[1]), - .out_ready (core_out_ready[1]), - .out_valid (core_out_valid[1]) + .result (per_core_result[1]), + .tag_out (per_core_tag_out[1]), + .ready_out (per_core_ready_out[1]), + .valid_out (per_core_valid_out[1]) ); VX_fp_sub fp_sub ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 2)), - .in_ready (core_in_ready[2]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 2)), + .ready_in (per_core_ready_in[2]), + .tag_in (tag_in), .dataa (dataa), .datab (datab), - .result (core_result[2]), - .out_tag (core_out_tag[2]), - .out_ready (core_out_ready[2]), - .out_valid (core_out_valid[2]) + .result (per_core_result[2]), + .tag_out (per_core_tag_out[2]), + .ready_out (per_core_ready_out[2]), + .valid_out (per_core_valid_out[2]) ); VX_fp_mul fp_mul ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 3)), - .in_ready (core_in_ready[3]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 3)), + .ready_in (per_core_ready_in[3]), + .tag_in (tag_in), .dataa (dataa), .datab (datab), - .result (core_result[3]), - .out_tag (core_out_tag[3]), - .out_ready (core_out_ready[3]), - .out_valid (core_out_valid[3]) + .result (per_core_result[3]), + .tag_out (per_core_tag_out[3]), + .ready_out (per_core_ready_out[3]), + .valid_out (per_core_valid_out[3]) ); VX_fp_madd fp_madd ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 4)), - .in_ready (core_in_ready[4]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 4)), + .ready_in (per_core_ready_in[4]), + .tag_in (tag_in), .negate (fmadd_negate), .dataa (dataa), .datab (datab), .datac (datac), - .result (core_result[4]), - .out_tag (core_out_tag[4]), - .out_ready (core_out_ready[4]), - .out_valid (core_out_valid[4]) + .result (per_core_result[4]), + .tag_out (per_core_tag_out[4]), + .ready_out (per_core_ready_out[4]), + .valid_out (per_core_valid_out[4]) ); VX_fp_msub fp_msub ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 5)), - .in_ready (core_in_ready[5]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 5)), + .ready_in (per_core_ready_in[5]), + .tag_in (tag_in), .negate (fmadd_negate), .dataa (dataa), .datab (datab), .datac (datac), - .result (core_result[5]), - .out_tag (core_out_tag[5]), - .out_ready (core_out_ready[5]), - .out_valid (core_out_valid[5]) + .result (per_core_result[5]), + .tag_out (per_core_tag_out[5]), + .ready_out (per_core_ready_out[5]), + .valid_out (per_core_valid_out[5]) ); VX_fp_div fp_div ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 6)), - .in_ready (core_in_ready[6]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 6)), + .ready_in (per_core_ready_in[6]), + .tag_in (tag_in), .dataa (dataa), .datab (datab), - .result (core_result[6]), - .out_tag (core_out_tag[6]), - .out_ready (core_out_ready[6]), - .out_valid (core_out_valid[6]) + .result (per_core_result[6]), + .tag_out (per_core_tag_out[6]), + .ready_out (per_core_ready_out[6]), + .valid_out (per_core_valid_out[6]) ); VX_fp_sqrt fp_sqrt ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 7)), - .in_ready (core_in_ready[7]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 7)), + .ready_in (per_core_ready_in[7]), + .tag_in (tag_in), .dataa (dataa), - .result (core_result[7]), - .out_tag (core_out_tag[7]), - .out_ready (core_out_ready[7]), - .out_valid (core_out_valid[7]) + .result (per_core_result[7]), + .tag_out (per_core_tag_out[7]), + .ready_out (per_core_ready_out[7]), + .valid_out (per_core_valid_out[7]) ); VX_fp_ftoi fp_ftoi ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 8)), - .in_ready (core_in_ready[8]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 8)), + .ready_in (per_core_ready_in[8]), + .tag_in (tag_in), .dataa (dataa), - .result (core_result[8]), - .out_tag (core_out_tag[8]), - .out_ready (core_out_ready[8]), - .out_valid (core_out_valid[8]) + .result (per_core_result[8]), + .tag_out (per_core_tag_out[8]), + .ready_out (per_core_ready_out[8]), + .valid_out (per_core_valid_out[8]) ); VX_fp_ftou fp_ftou ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 9)), - .in_ready (core_in_ready[9]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 9)), + .ready_in (per_core_ready_in[9]), + .tag_in (tag_in), .dataa (dataa), - .result (core_result[9]), - .out_tag (core_out_tag[9]), - .out_ready (core_out_ready[9]), - .out_valid (core_out_valid[9]) + .result (per_core_result[9]), + .tag_out (per_core_tag_out[9]), + .ready_out (per_core_ready_out[9]), + .valid_out (per_core_valid_out[9]) ); VX_fp_itof fp_itof ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 10)), - .in_ready (core_in_ready[10]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 10)), + .ready_in (per_core_ready_in[10]), + .tag_in (tag_in), .dataa (dataa), - .result (core_result[10]), - .out_tag (core_out_tag[10]), - .out_ready (core_out_ready[10]), - .out_valid (core_out_valid[10]) + .result (per_core_result[10]), + .tag_out (per_core_tag_out[10]), + .ready_out (per_core_ready_out[10]), + .valid_out (per_core_valid_out[10]) ); VX_fp_utof fp_utof ( .clk (clk), .reset (reset), - .in_valid (in_valid && (core_select == 11)), - .in_ready (core_in_ready[11]), - .in_tag (in_tag), + .valid_in (valid_in && (core_select == 11)), + .ready_in (per_core_ready_in[11]), + .tag_in (tag_in), .dataa (dataa), - .result (core_result[11]), - .out_tag (core_out_tag[11]), - .out_ready (core_out_ready[11]), - .out_valid (core_out_valid[11]) + .result (per_core_result[11]), + .tag_out (per_core_tag_out[11]), + .ready_out (per_core_ready_out[11]), + .valid_out (per_core_valid_out[11]) ); wire [FPC_BITS-1:0] fp_index; @@ -240,18 +241,18 @@ module VX_fp_fpga ( VX_priority_encoder #( .N(NUM_FPC) ) wb_select ( - .data_in (core_out_valid), + .data_in (per_core_valid_out), .data_out (fp_index), .valid_out (fp_valid) ); for (i = 0; i < NUM_FPC; i++) begin - assign core_out_ready[i] = out_ready && (i == fp_index); + assign per_core_ready_out[i] = ready_out && (i == fp_index); end wire tmp_valid = fp_valid; - wire [`ISTAG_BITS-1:0] tmp_tag = core_out_tag[fp_index]; - wire [`NUM_THREADS-1:0][31:0] tmp_result = core_result[fp_index]; + wire [`ISTAG_BITS-1:0] tmp_tag = per_core_tag_out[fp_index]; + wire [`NUM_THREADS-1:0][31:0] tmp_result = per_core_result[fp_index]; wire tmp_has_fflags = fpnew_has_fflags && (fp_index == 0); fflags_t [`NUM_THREADS-1:0] tmp_flags = fpnew_fflags; @@ -263,7 +264,7 @@ module VX_fp_fpga ( .stall (stall), .flush (1'b0), .in ({tmp_valid, tmp_tag, tmp_result, tmp_has_fflags, tmp_fflags}), - .out ({out_valid, out_tag, result, has_fflags, fflags}) + .out ({valid_out, tag_out, result, has_fflags, fflags}) ); endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/VX_fp_noncomp.v b/hw/rtl/fp_cores/VX_fp_noncomp.v index 4bcde8c4..4ae4c047 100644 --- a/hw/rtl/fp_cores/VX_fp_noncomp.v +++ b/hw/rtl/fp_cores/VX_fp_noncomp.v @@ -4,10 +4,10 @@ module VX_fp_noncomp ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`FPU_BITS-1:0] op, input wire [`FRM_BITS-1:0] frm, @@ -19,10 +19,10 @@ module VX_fp_noncomp ( output wire has_fflags, output fflags_t [`NUM_THREADS-1:0] fflags, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); localparam NEG_INF = 32'h00000001, NEG_NORM = 32'h00000002, @@ -226,8 +226,8 @@ module VX_fp_noncomp ( end end - wire stall = ~out_ready && out_valid; - assign in_ready = ~stall; + wire stall = ~ready_out && valid_out; + assign ready_in = ~stall; VX_generic_register #( .N(1 + `ISTAG_BITS + (`NUM_THREADS * 32) + 1 + (`NUM_THREADS * `FFG_BITS)) @@ -236,8 +236,8 @@ module VX_fp_noncomp ( .reset (reset), .stall (stall), .flush (1'b0), - .in ({tmp_valid, in_tag, tmp_result, tmp_has_fflags, tmp_fflags}), - .out ({out_valid, out_tag, result, has_fflags, fflags}) + .in ({tmp_valid, tag_in, tmp_result, tmp_has_fflags, tmp_fflags}), + .out ({valid_out, tag_out, result, has_fflags, fflags}) ); endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/VX_fpnew.v b/hw/rtl/fp_cores/VX_fpnew.v index 71668082..bcd27376 100644 --- a/hw/rtl/fp_cores/VX_fpnew.v +++ b/hw/rtl/fp_cores/VX_fpnew.v @@ -11,10 +11,10 @@ module VX_fpnew #( input wire clk, input wire reset, - input wire in_valid, - output wire in_ready, + input wire valid_in, + output wire ready_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`FPU_BITS-1:0] op, input wire [`FRM_BITS-1:0] frm, @@ -27,10 +27,10 @@ module VX_fpnew #( output wire has_fflags, output fflags_t [`NUM_THREADS-1:0] fflags, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); localparam UNIT_FMULADD = FMULADD ? fpnew_pkg::PARALLEL : fpnew_pkg::DISABLED; localparam UNIT_FDIVSQRT = FDIVSQRT ? fpnew_pkg::MERGED : fpnew_pkg::DISABLED; @@ -56,17 +56,17 @@ module VX_fpnew #( '{default: `LATENCY_FDIVSQRT}, // DIVSQRT '{default: `LATENCY_FNONCOMP}, // NONCOMP '{default: `LATENCY_FCONV}}, // CONV - UnitTypes:'{'{default: UNIT_FMULADD}, // ADDMUL + UnitTypes:'{'{default: UNIT_FMULADD}, // ADDMUL '{default: UNIT_FDIVSQRT}, // DIVSQRT '{default: UNIT_FNONCOMP}, // NONCOMP '{default: UNIT_FCONV}}, // CONV PipeConfig: fpnew_pkg::DISTRIBUTED }; - wire fpu_in_ready, fpu_in_valid; - wire fpu_out_ready, fpu_out_valid; + wire fpu_ready_in, fpu_valid_in; + wire fpu_ready_out, fpu_valid_out; - reg [`ISTAG_BITS-1:0] fpu_in_tag, fpu_out_tag; + reg [`ISTAG_BITS-1:0] fpu_tag_in, fpu_tag_out; reg [2:0][`NUM_THREADS-1:0][31:0] fpu_operands; @@ -77,13 +77,13 @@ module VX_fpnew #( wire [`NUM_THREADS-1:0][31:0] fpu_result; fpnew_pkg::status_t [0:`NUM_THREADS-1] fpu_status; - wire is_class_op_i, is_class_op_o; - assign is_class_op_i = (op == `FPU_CLASS); + wire is_class_op, is_class_op_out; + assign is_class_op = (op == `FPU_CLASS); reg [FOP_BITS-1:0] fpu_op; reg [`FRM_BITS-1:0] fpu_rnd; reg fpu_op_mod; - reg fpu_has_fflags, fpu_has_fflags_o; + reg fpu_has_fflags, fpu_has_fflags_out; always @(*) begin fpu_op = fpnew_pkg::SGNJ; @@ -150,15 +150,15 @@ module VX_fpnew #( .dst_fmt_i (fpnew_pkg::fp_format_e'(fpu_dst_fmt)), .int_fmt_i (fpnew_pkg::int_format_e'(fpu_int_fmt)), .vectorial_op_i (1'b0), - .tag_i ({fpu_in_tag, fpu_has_fflags, is_class_op_i}), - .in_valid_i (fpu_in_valid), - .in_ready_o (fpu_in_ready), + .tag_i ({fpu_tag_in, fpu_has_fflags, is_class_op}), + .in_valid_i (fpu_valid_in), + .in_ready_o (fpu_ready_in), .flush_i (reset), .result_o (fpu_result[0]), .status_o (fpu_status[0]), - .tag_o ({fpu_out_tag, fpu_has_fflags_o, is_class_op_o}), - .out_valid_o (fpu_out_valid), - .out_ready_i (fpu_out_ready), + .tag_o ({fpu_tag_out, fpu_has_fflags_out, is_class_op_out}), + .out_valid_o (fpu_valid_out), + .out_ready_i (fpu_ready_out), `UNUSED_PIN (busy_o) ); end else begin @@ -178,14 +178,14 @@ module VX_fpnew #( .int_fmt_i (fpnew_pkg::int_format_e'(fpu_int_fmt)), .vectorial_op_i (1'b0), .tag_i (1'b0), - .in_valid_i (fpu_in_valid), + .in_valid_i (fpu_valid_in), `UNUSED_PIN (in_ready_o), .flush_i (reset), .result_o (fpu_result[i]), .status_o (fpu_status[i]), `UNUSED_PIN (tag_o), `UNUSED_PIN (out_valid_o), - .out_ready_i (fpu_out_ready), + .out_ready_i (fpu_ready_out), `UNUSED_PIN (busy_o) ); end @@ -193,19 +193,19 @@ module VX_fpnew #( `ENABLE_TRACING - assign fpu_in_valid = in_valid; - assign in_ready = fpu_in_ready - || ~in_valid; // fix fpnews's in_ready containing in_valid; + assign fpu_valid_in = valid_in; + assign ready_in = fpu_ready_in + || ~valid_in; // fix - assign fpu_in_tag = in_tag; - assign out_tag = fpu_out_tag; + assign fpu_tag_in = tag_in; + assign tag_out = fpu_tag_out; assign result = fpu_result; - assign has_fflags = fpu_has_fflags_o; + assign has_fflags = fpu_has_fflags_out; assign fflags = fpu_status; - assign out_valid = fpu_out_valid; - assign fpu_out_ready = out_ready; + assign valid_out = fpu_valid_out; + assign fpu_ready_out = ready_out; endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/altera/VX_fp_add.v b/hw/rtl/fp_cores/altera/VX_fp_add.v index d06b80c2..e055adfa 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_add.v +++ b/hw/rtl/fp_cores/altera/VX_fp_add.v @@ -4,23 +4,23 @@ module VX_fp_add ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`NUM_THREADS-1:0][31:0] dataa, input wire [`NUM_THREADS-1:0][31:0] datab, output wire [`NUM_THREADS-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); - wire stall = ~out_ready && out_valid; + wire stall = ~ready_out && valid_out; wire enable = ~stall; - assign in_ready = enable; + assign ready_in = enable; genvar i; @@ -73,8 +73,8 @@ module VX_fp_add ( .clk(clk), .reset(reset), .enable(enable), - .in({in_tag, in_valid}), - .out({out_tag, out_valid}) + .in ({tag_in, valid_in}), + .out({tag_out, valid_out}) ); endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_div.v b/hw/rtl/fp_cores/altera/VX_fp_div.v index a08c1087..a0db0790 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_div.v +++ b/hw/rtl/fp_cores/altera/VX_fp_div.v @@ -4,23 +4,23 @@ module VX_fp_div ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`NUM_THREADS-1:0][31:0] dataa, input wire [`NUM_THREADS-1:0][31:0] datab, output wire [`NUM_THREADS-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); - wire stall = ~out_ready && out_valid; + wire stall = ~ready_out && valid_out; wire enable = ~stall; - assign in_ready = enable; + assign ready_in = enable; genvar i; @@ -42,8 +42,8 @@ module VX_fp_div ( .clk(clk), .reset(reset), .enable(enable), - .in({in_tag, in_valid}), - .out({out_tag, out_valid}) + .in ({tag_in, valid_in}), + .out({tag_out, valid_out}) ); endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_ftoi.v b/hw/rtl/fp_cores/altera/VX_fp_ftoi.v index b9e64db5..3036410a 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_ftoi.v +++ b/hw/rtl/fp_cores/altera/VX_fp_ftoi.v @@ -4,22 +4,22 @@ module VX_fp_ftoi ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`NUM_THREADS-1:0][31:0] dataa, output wire [`NUM_THREADS-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); - wire stall = ~out_ready && out_valid; + wire stall = ~ready_out && valid_out; wire enable = ~stall; - assign in_ready = enable; + assign ready_in = enable; genvar i; @@ -40,8 +40,8 @@ module VX_fp_ftoi ( .clk(clk), .reset(reset), .enable(enable), - .in({in_tag, in_valid}), - .out({out_tag, out_valid}) + .in ({tag_in, valid_in}), + .out({tag_out, valid_out}) ); endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_ftou.v b/hw/rtl/fp_cores/altera/VX_fp_ftou.v index 6044c2e5..461d45df 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_ftou.v +++ b/hw/rtl/fp_cores/altera/VX_fp_ftou.v @@ -4,22 +4,22 @@ module VX_fp_ftou ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`NUM_THREADS-1:0][31:0] dataa, output wire [`NUM_THREADS-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); - wire stall = ~out_ready && out_valid; + wire stall = ~ready_out && valid_out; wire enable = ~stall; - assign in_ready = enable; + assign ready_in = enable; genvar i; @@ -40,8 +40,8 @@ module VX_fp_ftou ( .clk(clk), .reset(reset), .enable(enable), - .in({in_tag, in_valid}), - .out({out_tag, out_valid}) + .in ({tag_in, valid_in}), + .out({tag_out, valid_out}) ); endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_itof.v b/hw/rtl/fp_cores/altera/VX_fp_itof.v index 4c8f3188..d67749ad 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_itof.v +++ b/hw/rtl/fp_cores/altera/VX_fp_itof.v @@ -4,22 +4,22 @@ module VX_fp_itof ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`NUM_THREADS-1:0][31:0] dataa, output wire [`NUM_THREADS-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); - wire stall = ~out_ready && out_valid; + wire stall = ~ready_out && valid_out; wire enable = ~stall; - assign in_ready = enable; + assign ready_in = enable; genvar i; @@ -40,8 +40,8 @@ module VX_fp_itof ( .clk(clk), .reset(reset), .enable(enable), - .in({in_tag, in_valid}), - .out({out_tag, out_valid}) + .in ({tag_in, valid_in}), + .out({tag_out, valid_out}) ); endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_madd.v b/hw/rtl/fp_cores/altera/VX_fp_madd.v index b6058e98..c7939cd7 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_madd.v +++ b/hw/rtl/fp_cores/altera/VX_fp_madd.v @@ -4,10 +4,10 @@ module VX_fp_madd ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`NUM_THREADS-1:0][31:0] dataa, input wire [`NUM_THREADS-1:0][31:0] datab, @@ -16,13 +16,13 @@ module VX_fp_madd ( input wire negate, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); wire enable0, enable1; - assign in_ready = enable0 && enable1; + assign ready_in = enable0 && enable1; wire [`NUM_THREADS-1:0][31:0] result_st0, result_st1; wire [`ISTAG_BITS-1:0] out_tag_st0, out_tag_st1; @@ -119,7 +119,7 @@ module VX_fp_madd ( .clk(clk), .reset(reset), .enable(enable0), - .in({in_tag, (in_valid && ~negate), (in_valid && negate)}), + .in ({tag_in, (valid_in && ~negate), (valid_in && negate)}), .out({out_tag_st0, out_valid_st0, in_valid_st0}) ); @@ -134,12 +134,12 @@ module VX_fp_madd ( .out({out_tag_st1, out_valid_st1}) ); - wire out_stall = ~out_ready && out_valid; + wire out_stall = ~ready_out && valid_out; assign enable0 = ~out_stall; assign enable1 = ~out_stall && ~(out_valid_st0 && out_valid_st1); // stall the negate stage if dual outputs assign result = out_valid_st0 ? result_st0 : result_st1; - assign out_tag = out_valid_st0 ? out_tag_st0 : out_tag_st1; - assign out_valid = out_valid_st0 || out_valid_st1; + assign tag_out = out_valid_st0 ? out_tag_st0 : out_tag_st1; + assign valid_out = out_valid_st0 || out_valid_st1; endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_msub.v b/hw/rtl/fp_cores/altera/VX_fp_msub.v index 83a499d2..211c1b34 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_msub.v +++ b/hw/rtl/fp_cores/altera/VX_fp_msub.v @@ -4,10 +4,10 @@ module VX_fp_msub ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`NUM_THREADS-1:0][31:0] dataa, input wire [`NUM_THREADS-1:0][31:0] datab, @@ -16,13 +16,13 @@ module VX_fp_msub ( input wire negate, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); wire enable0, enable1; - assign in_ready = enable0 && enable1; + assign ready_in = enable0 && enable1; wire [`NUM_THREADS-1:0][31:0] result_st0, result_st1; wire [`ISTAG_BITS-1:0] out_tag_st0, out_tag_st1; @@ -119,7 +119,7 @@ module VX_fp_msub ( .clk(clk), .reset(reset), .enable(enable0), - .in({in_tag, (in_valid && ~negate), (in_valid && negate)}), + .in ({tag_in, (valid_in && ~negate), (valid_in && negate)}), .out({out_tag_st0, out_valid_st0, in_valid_st0}) ); @@ -134,12 +134,12 @@ module VX_fp_msub ( .out({out_tag_st1, out_valid_st1}) ); - wire out_stall = ~out_ready && out_valid; + wire out_stall = ~ready_out && valid_out; assign enable0 = ~out_stall; assign enable1 = ~out_stall && ~(out_valid_st0 && out_valid_st1); // stall the negate stage if dual outputs assign result = out_valid_st0 ? result_st0 : result_st1; - assign out_tag = out_valid_st0 ? out_tag_st0 : out_tag_st1; - assign out_valid = out_valid_st0 || out_valid_st1; + assign tag_out = out_valid_st0 ? out_tag_st0 : out_tag_st1; + assign valid_out = out_valid_st0 || out_valid_st1; endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/altera/VX_fp_mul.v b/hw/rtl/fp_cores/altera/VX_fp_mul.v index 76709969..56633586 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_mul.v +++ b/hw/rtl/fp_cores/altera/VX_fp_mul.v @@ -4,23 +4,23 @@ module VX_fp_mul ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`NUM_THREADS-1:0][31:0] dataa, input wire [`NUM_THREADS-1:0][31:0] datab, output wire [`NUM_THREADS-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); - wire stall = ~out_ready && out_valid; + wire stall = ~ready_out && valid_out; wire enable = ~stall; - assign in_ready = enable; + assign ready_in = enable; genvar i; @@ -73,8 +73,8 @@ module VX_fp_mul ( .clk(clk), .reset(reset), .enable(enable), - .in({in_tag, in_valid}), - .out({out_tag, out_valid}) + .in ({tag_in, valid_in}), + .out({tag_out, valid_out}) ); endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/altera/VX_fp_sqrt.v b/hw/rtl/fp_cores/altera/VX_fp_sqrt.v index 0a57adc7..22649771 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_sqrt.v +++ b/hw/rtl/fp_cores/altera/VX_fp_sqrt.v @@ -4,22 +4,22 @@ module VX_fp_sqrt ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`NUM_THREADS-1:0][31:0] dataa, output wire [`NUM_THREADS-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); - wire stall = ~out_ready && out_valid; + wire stall = ~ready_out && valid_out; wire enable = ~stall; - assign in_ready = enable; + assign ready_in = enable; genvar i; @@ -40,8 +40,8 @@ module VX_fp_sqrt ( .clk(clk), .reset(reset), .enable(enable), - .in({in_tag, in_valid}), - .out({out_tag, out_valid}) + .in ({tag_in, valid_in}), + .out({tag_out, valid_out}) ); endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_sub.v b/hw/rtl/fp_cores/altera/VX_fp_sub.v index 986c7bf0..f88567da 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_sub.v +++ b/hw/rtl/fp_cores/altera/VX_fp_sub.v @@ -4,23 +4,23 @@ module VX_fp_sub ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`NUM_THREADS-1:0][31:0] dataa, input wire [`NUM_THREADS-1:0][31:0] datab, output wire [`NUM_THREADS-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); - wire stall = ~out_ready && out_valid; + wire stall = ~ready_out && valid_out; wire enable = ~stall; - assign in_ready = enable; + assign ready_in = enable; genvar i; @@ -73,8 +73,8 @@ module VX_fp_sub ( .clk(clk), .reset(reset), .enable(enable), - .in({in_tag, in_valid}), - .out({out_tag, out_valid}) + .in ({tag_in, valid_in}), + .out({tag_out, valid_out}) ); endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_utof.v b/hw/rtl/fp_cores/altera/VX_fp_utof.v index e4169097..601c0634 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_utof.v +++ b/hw/rtl/fp_cores/altera/VX_fp_utof.v @@ -4,22 +4,22 @@ module VX_fp_utof ( input wire clk, input wire reset, - output wire in_ready, - input wire in_valid, + output wire ready_in, + input wire valid_in, - input wire [`ISTAG_BITS-1:0] in_tag, + input wire [`ISTAG_BITS-1:0] tag_in, input wire [`NUM_THREADS-1:0][31:0] dataa, output wire [`NUM_THREADS-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] out_tag, + output wire [`ISTAG_BITS-1:0] tag_out, - input wire out_ready, - output wire out_valid + input wire ready_out, + output wire valid_out ); - wire stall = ~out_ready && out_valid; + wire stall = ~ready_out && valid_out; wire enable = ~stall; - assign in_ready = enable; + assign ready_in = enable; genvar i; @@ -40,8 +40,8 @@ module VX_fp_utof ( .clk(clk), .reset(reset), .enable(enable), - .in({in_tag, in_valid}), - .out({out_tag, out_valid}) + .in ({tag_in, valid_in}), + .out({tag_out, valid_out}) ); endmodule diff --git a/hw/rtl/interfaces/VX_csr_req_if.v b/hw/rtl/interfaces/VX_csr_req_if.v index b29cc649..1a727c7f 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.v +++ b/hw/rtl/interfaces/VX_csr_req_if.v @@ -7,6 +7,9 @@ interface VX_csr_req_if (); wire valid; wire [`ISTAG_BITS-1:0] issue_tag; +`DEBUG_BEGIN + wire [`NUM_THREADS-1:0] thread_mask; +`DEBUG_END wire [`NW_BITS-1:0] warp_num; wire [31:0] curr_PC; diff --git a/hw/rtl/interfaces/VX_fpu_req_if.v b/hw/rtl/interfaces/VX_fpu_req_if.v index dd03f271..2c1b299d 100644 --- a/hw/rtl/interfaces/VX_fpu_req_if.v +++ b/hw/rtl/interfaces/VX_fpu_req_if.v @@ -11,7 +11,11 @@ interface VX_fpu_req_if (); wire valid; wire [`ISTAG_BITS-1:0] issue_tag; +`DEBUG_BEGIN + wire [`NUM_THREADS-1:0] thread_mask; +`DEBUG_END wire [`NW_BITS-1:0] warp_num; + wire [31:0] curr_PC; wire [`FPU_BITS-1:0] fpu_op; wire [`FRM_BITS-1:0] frm; diff --git a/hw/rtl/interfaces/VX_gpr_read_if.v b/hw/rtl/interfaces/VX_gpr_read_if.v index 27310af2..9b24ce56 100644 --- a/hw/rtl/interfaces/VX_gpr_read_if.v +++ b/hw/rtl/interfaces/VX_gpr_read_if.v @@ -19,8 +19,7 @@ interface VX_gpr_read_if (); wire [`NUM_THREADS-1:0][31:0] rs2_data; wire [`NUM_THREADS-1:0][31:0] rs3_data; - wire in_ready; - wire out_ready; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_gpu_req_if.v b/hw/rtl/interfaces/VX_gpu_req_if.v index 38f36ab0..604f5903 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.v +++ b/hw/rtl/interfaces/VX_gpu_req_if.v @@ -9,7 +9,9 @@ interface VX_gpu_req_if(); wire [`ISTAG_BITS-1:0] issue_tag; wire [`NUM_THREADS-1:0] thread_mask; wire [`NW_BITS-1:0] warp_num; - +`DEBUG_BEGIN + wire [31:0] curr_PC; +`DEBUG_END wire [`GPU_BITS-1:0] gpu_op; wire [`NUM_THREADS-1:0][31:0] rs1_data; diff --git a/hw/rtl/interfaces/VX_mul_req_if.v b/hw/rtl/interfaces/VX_mul_req_if.v index 428edd94..44306bde 100644 --- a/hw/rtl/interfaces/VX_mul_req_if.v +++ b/hw/rtl/interfaces/VX_mul_req_if.v @@ -11,7 +11,11 @@ interface VX_mul_req_if (); wire valid; wire [`ISTAG_BITS-1:0] issue_tag; - +`DEBUG_BEGIN + wire [`NUM_THREADS-1:0] thread_mask; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] curr_PC; +`DEBUG_END wire [`MUL_BITS-1:0] mul_op; wire [`NUM_THREADS-1:0][31:0] rs1_data; diff --git a/hw/rtl/libs/VX_elastic_buffer.v b/hw/rtl/libs/VX_elastic_buffer.v new file mode 100644 index 00000000..c551fbc6 --- /dev/null +++ b/hw/rtl/libs/VX_elastic_buffer.v @@ -0,0 +1,38 @@ +`include "VX_platform.vh" + +module VX_elastic_buffer #( + parameter DATAW = 1, + parameter SIZE = 2, + parameter BUFFERED = 1 +) ( + input wire clk, + input wire reset, + input wire valid_in, + output wire ready_in, + input wire [DATAW-1:0] data_in, + output wire [DATAW-1:0] data_out, + input wire ready_out, + output wire valid_out +); + wire empty, full; + + VX_generic_queue #( + .DATAW (DATAW), + .SIZE (SIZE), + .BUFFERED (BUFFERED) + ) queue ( + .clk (clk), + .reset (reset), + .push (valid_in), + .pop (ready_out), + .data_in(data_in), + .data_out(data_out), + .empty (empty), + .full (full), + `UNUSED_PIN (size) + ); + + assign ready_in = ~full; + assign valid_out = ~empty; + +endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_generic_queue.v b/hw/rtl/libs/VX_generic_queue.v index 10421745..5ee61e6e 100644 --- a/hw/rtl/libs/VX_generic_queue.v +++ b/hw/rtl/libs/VX_generic_queue.v @@ -2,7 +2,7 @@ module VX_generic_queue #( parameter DATAW = 1, - parameter SIZE = 16, + parameter SIZE = 2, parameter BUFFERED = 1 ) ( input wire clk,