diff --git a/tests/regression/idle/.gitignore b/tests/regression/idle/.gitignore new file mode 100644 index 00000000..c24ab099 --- /dev/null +++ b/tests/regression/idle/.gitignore @@ -0,0 +1,5 @@ +*.bin +*.dump +*.elf +idle +.depend diff --git a/tests/regression/idle/Makefile b/tests/regression/idle/Makefile index 3a8ffb18..b5c78ec0 100644 --- a/tests/regression/idle/Makefile +++ b/tests/regression/idle/Makefile @@ -1,4 +1,4 @@ -PROJECT = sgemm_gemmini_dma +PROJECT = idle SRCS = main.cpp common.h diff --git a/tests/regression/idle/kernel.cpp b/tests/regression/idle/kernel.cpp index ccd9bcc5..48c5a1a7 100644 --- a/tests/regression/idle/kernel.cpp +++ b/tests/regression/idle/kernel.cpp @@ -7,7 +7,7 @@ #include "gemmini_mmio.h" #define NUM_CLUSTERS 1 -#define NUM_THREADS_IN_CLUSTER 256 +#define NUM_THREADS_IN_CLUSTER 512 #define HW_TID() ({uint32_t gtid; asm volatile ("csrr %0, mhartid" : "=r" (gtid)); gtid;}) @@ -22,9 +22,45 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { // reinterpret_cast(arg->addr_c)[0] = counter; // call barrier in a divergent branch, which will hang the core - if ((vx_thread_id() % NUM_THREADS) == 0) { - vx_barrier(0, NUM_WARPS); - } + asm volatile("li x1, 0xa0a0a0a0"); + asm volatile("li x2, 0xa0a0a0a0"); + asm volatile("li x3, 0xa0a0a0a0"); + asm volatile("li x4, 0xa0a0a0a0"); + asm volatile("li x5, 0xa0a0a0a0"); + asm volatile("li x6, 0xa0a0a0a0"); + asm volatile("li x7, 0xa0a0a0a0"); + asm volatile("li x8, 0xa0a0a0a0"); + asm volatile("li x9, 0xa0a0a0a0"); + asm volatile("li x10, 0xa0a0a0a0"); + asm volatile("li x11, 0xa0a0a0a0"); + asm volatile("li x12, 0xa0a0a0a0"); + asm volatile("li x13, 0xa0a0a0a0"); + asm volatile("li x14, 0xa0a0a0a0"); + asm volatile("li x15, 0xa0a0a0a0"); + asm volatile("li x16, 0xa0a0a0a0"); + asm volatile("li x17, 0xa0a0a0a0"); + asm volatile("li x18, 0xa0a0a0a0"); + asm volatile("li x19, 0xa0a0a0a0"); + asm volatile("li x20, 0xa0a0a0a0"); + asm volatile("li x21, 0xa0a0a0a0"); + asm volatile("li x22, 0xa0a0a0a0"); + asm volatile("li x23, 0xa0a0a0a0"); + asm volatile("li x24, 0xa0a0a0a0"); + asm volatile("li x25, 0xa0a0a0a0"); + asm volatile("li x26, 0xa0a0a0a0"); + asm volatile("li x27, 0xa0a0a0a0"); + asm volatile("li x28, 0xa0a0a0a0"); + asm volatile("li x29, 0xa0a0a0a0"); + asm volatile("li x30, 0xa0a0a0a0"); + asm volatile("li x31, 0xa0a0a0a0"); + asm volatile("csrr a0, 0xcc1"); + asm volatile("beqz a0, bar"); + asm volatile("vx_tmc zero"); + asm volatile("bar:"); + asm volatile("vx_bar zero, a0"); + // if ((vx_thread_id() % NUM_THREADS) == 0) { + // vx_barrier(0, NUM_WARPS); + // } vx_tmc(0); } @@ -35,7 +71,7 @@ int main() { // spawn a single warp in every core const uint32_t grid_size = NUM_THREADS * NUM_CORES; #ifdef RADIANCE - vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); + vx_spawn_tasks_cluster(NUM_THREADS_IN_CLUSTER, (vx_spawn_tasks_cb)kernel_body, arg); #else vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); #endif diff --git a/tests/regression/unaligned/Makefile b/tests/regression/unaligned/Makefile new file mode 100644 index 00000000..a314fc4b --- /dev/null +++ b/tests/regression/unaligned/Makefile @@ -0,0 +1,9 @@ +PROJECT = unaligned + +SRCS = main.cpp common.h + +VX_SRCS = kernel.cpp + +OPTS ?= -n16 + +include ../common.mk diff --git a/tests/regression/unaligned/common.h b/tests/regression/unaligned/common.h new file mode 100644 index 00000000..c16f6f04 --- /dev/null +++ b/tests/regression/unaligned/common.h @@ -0,0 +1,13 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#include + +#define KERNEL_ARG_DEV_MEM_ADDR 0x9fff0000 +#define DEV_SMEM_START_ADDR 0xff000000 + +typedef struct { + uint32_t placeholder; +} kernel_arg_t; + +#endif diff --git a/tests/regression/unaligned/kernel.cpp b/tests/regression/unaligned/kernel.cpp new file mode 100644 index 00000000..97258d07 --- /dev/null +++ b/tests/regression/unaligned/kernel.cpp @@ -0,0 +1,123 @@ +#include +#include +#include +#include +#include "common.h" + +#define NUM_THREADS_IN_CLUSTER 32 +#define NUM_CLUSTERS 1 + +#define rd_cycles_force(x) asm volatile ("csrr %0, mcycle" : "=r" (x)) +#define rd_cycles(x) rd_cycles_force(x) +#define HW_TID() ({uint32_t gtid; asm volatile ("csrr %0, mhartid" : "=r" (gtid)); gtid;}) +#define PRINT_BUF ((char *) (0xff020000UL)) +#define PRINTF(...) sprintf(PRINT_BUF, __VA_ARGS__) + +inline void threadblock_barrier(unsigned int barrier_id, unsigned int count) __attribute__((convergent)) { + vx_fence(); + vx_barrier(barrier_id, count); +} + +#define ADDR0 0xff008004UL +#define ADDR1 0xff009004UL +#define ADDR2 0xff00a004UL +#define ADDR3 0xff00b004UL + +void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) __attribute__((convergent)) { + size_t t = (size_t) (task_id * 4) % 32; + if (t == 0) { + for (int j = 0; j < 0x400; j += 0x100) { + for (int i = 0; i < 8; i++) { + *((volatile uint32_t *) (ADDR0 + j + i * 4)) = 0xbeef; + *((volatile uint32_t *) (ADDR1 + j + i * 4)) = 0xbeef; + } + } + } + threadblock_barrier(0, 1); + // for (int i = 0; i < 8; i++) { + if (HW_TID() % 8 < 5) { + // if (true) { + asm volatile("lower_block:"); + volatile uint32_t a = *((volatile uint32_t *) (ADDR0 + 0x000 + t)); + volatile uint32_t b = *((volatile uint32_t *) (ADDR0 + 0x100 + t)); + volatile uint32_t c = *((volatile uint32_t *) (ADDR0 + 0x200 + t)); + volatile uint32_t d = *((volatile uint32_t *) (ADDR0 + 0x300 + t)); + + volatile uint32_t u = *((volatile uint32_t *) (ADDR1 + 0x000 + t)); + volatile uint32_t v = *((volatile uint32_t *) (ADDR1 + 0x100 + t)); + volatile uint32_t w = *((volatile uint32_t *) (ADDR1 + 0x200 + t)); + volatile uint32_t x = *((volatile uint32_t *) (ADDR1 + 0x300 + t)); + + *((volatile uint32_t *) (ADDR2 + 0x000 + t)) = a; + *((volatile uint32_t *) (ADDR2 + 0x100 + t)) = b; + *((volatile uint32_t *) (ADDR2 + 0x200 + t)) = c; + *((volatile uint32_t *) (ADDR2 + 0x300 + t)) = d; + + *((volatile uint32_t *) (ADDR3 + 0x000 + t)) = u; + *((volatile uint32_t *) (ADDR3 + 0x100 + t)) = v; + *((volatile uint32_t *) (ADDR3 + 0x200 + t)) = w; + *((volatile uint32_t *) (ADDR3 + 0x300 + t)) = x; + } else { + asm volatile("upper_block:"); + volatile uint32_t a = *((volatile uint32_t *) (ADDR1 + 0x000 + t)); + volatile uint32_t b = *((volatile uint32_t *) (ADDR1 + 0x100 + t)); + volatile uint32_t c = *((volatile uint32_t *) (ADDR1 + 0x200 + t)); + volatile uint32_t d = *((volatile uint32_t *) (ADDR1 + 0x300 + t)); + + volatile uint32_t u = *((volatile uint32_t *) (ADDR0 + 0x000 + t)); + volatile uint32_t v = *((volatile uint32_t *) (ADDR0 + 0x100 + t)); + volatile uint32_t w = *((volatile uint32_t *) (ADDR0 + 0x200 + t)); + volatile uint32_t x = *((volatile uint32_t *) (ADDR0 + 0x300 + t)); + + // for (int y = 4; y < 8; y++) { + // if (task_id == y) { + // PRINTF("Task ID: %d, a: %x, b: %x, c: %x, d: %x\n", task_id, a, b, c, d); + // PRINTF("Task ID: %d, u: %x, v: %x, w: %x, x: %x\n", task_id, u, v, w, x); + // } + // } + // threadblock_barrier(1, 1); + + *((volatile uint32_t *) (ADDR3 + 0x000 + t)) = a; + *((volatile uint32_t *) (ADDR3 + 0x100 + t)) = b; + *((volatile uint32_t *) (ADDR3 + 0x200 + t)) = c; + *((volatile uint32_t *) (ADDR3 + 0x300 + t)) = d; + + *((volatile uint32_t *) (ADDR2 + 0x000 + t)) = u; + *((volatile uint32_t *) (ADDR2 + 0x100 + t)) = v; + *((volatile uint32_t *) (ADDR2 + 0x200 + t)) = w; + *((volatile uint32_t *) (ADDR2 + 0x300 + t)) = x; + } + // } + threadblock_barrier(2, 1); + PRINTF("."); + if (task_id == 0) { + bool correct = true; + PRINTF("\n"); + for (int j = 0; j < 0x400; j += 0x100) { + for (int i = 0; i < 8; i++) { + int v2 = *((volatile uint32_t *) (ADDR2 + i * 4 + j)); + if (v2 != 0xbeef) { + correct = false; + PRINTF("mismatch at %x, got %x\n", ADDR2 + i * 4 + j, v2); + } + int v3 = *((volatile uint32_t *) (ADDR3 + i * 4 + j)); + if (v3 != 0xbeef) { + correct = false; + PRINTF("mismatch at %x, got %x\n", ADDR3 + i * 4 + j, v3); + } + } + } + if (correct) { + PRINTF("test passed\n"); + } + } +} + +int main() __attribute__((convergent)) { + kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; + + const uint32_t num_threads_in_cluster = NUM_THREADS_IN_CLUSTER; + const uint32_t grid_size = num_threads_in_cluster * NUM_CLUSTERS; + vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); + return 0; +} diff --git a/tests/regression/unaligned/kernel.minimal.cpp b/tests/regression/unaligned/kernel.minimal.cpp new file mode 100644 index 00000000..1629f6ef --- /dev/null +++ b/tests/regression/unaligned/kernel.minimal.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include "common.h" + +#define HW_TID() ({uint32_t gtid; asm volatile ("csrr %0, mhartid" : "=r" (gtid)); gtid;}) + +inline void threadblock_barrier(unsigned int barrier_id, unsigned int count) __attribute__((convergent)) { + vx_fence(); + vx_barrier(barrier_id, count); +} + +#define ADDR0 0xff008004UL +#define ADDR1 0xff009004UL + +void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { + // size_t t = (size_t) (task_id * 4) % 32; + asm volatile("nop"); + for (int i = 0; i < 8; i++) { + if (i == 0) { + if ((HW_TID() & 0x7) < 2) { + asm volatile("lower_block:"); + volatile uint32_t a = *((volatile uint32_t *) (ADDR0)); + // *((volatile uint32_t *) (ADDR2)) = a; + volatile uint32_t b = a + 1; + } else { + asm volatile("upper_block:"); + volatile uint32_t a = *((volatile uint32_t *) (ADDR1)); + // *((volatile uint32_t *) (ADDR3)) = a; + volatile uint32_t b = a + 1; + } + } + volatile uint32_t a = *((volatile uint32_t *) (ADDR1)); + } + threadblock_barrier(2, 2); +} + +int main() { // __attribute__((convergent)) { + kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; + + vx_spawn_tasks_cluster(64, (vx_spawn_tasks_cb)kernel_body, arg); + return 0; +} diff --git a/tests/regression/unaligned/main.cpp b/tests/regression/unaligned/main.cpp new file mode 100644 index 00000000..a1b1d384 --- /dev/null +++ b/tests/regression/unaligned/main.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.bin"; +uint32_t count = 0; + +vx_device_h device = nullptr; +std::vector staging_buf; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." << std::endl; + std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h?")) != -1) { + switch (c) { + case 'n': + count = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_dev_close(device); + } +} + +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + if (count == 0) { + count = 1; + } + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + return 0; +} diff --git a/tests/regression/unaligned/unaligned b/tests/regression/unaligned/unaligned new file mode 100755 index 00000000..7fa0b6c0 Binary files /dev/null and b/tests/regression/unaligned/unaligned differ