From d2bd560593ffaf91fee2bce61cbae6c8966014e9 Mon Sep 17 00:00:00 2001 From: fares Date: Sat, 23 Nov 2019 20:37:14 -0500 Subject: [PATCH] OpenCL benchmarks running --- runtime/intrinsics/vx_intrinsics.h | 4 ++ runtime/intrinsics/vx_intrinsics.s | 13 +++++ runtime/newlib/newlib.c | 14 ++--- runtime/startup/vx_start.s | 4 +- runtime/vx_api/vx_api.c | 86 ++++++++++++++++++++++++++---- simX/core.cpp | 23 +++++--- simX/include/archdef.h | 4 +- simX/include/core.h | 2 + simX/instruction.cpp | 8 ++- simX/util.cpp | 2 +- 10 files changed, 130 insertions(+), 30 deletions(-) diff --git a/runtime/intrinsics/vx_intrinsics.h b/runtime/intrinsics/vx_intrinsics.h index bad54efe..df85807a 100644 --- a/runtime/intrinsics/vx_intrinsics.h +++ b/runtime/intrinsics/vx_intrinsics.h @@ -31,6 +31,10 @@ unsigned vx_threadID(void); // Get hardware warp ID unsigned vx_warpID(void); +// Get the cycle count / instruction count +unsigned vx_getCycles(void); +unsigned vx_getInst(void); + void vx_resetStack(void); diff --git a/runtime/intrinsics/vx_intrinsics.s b/runtime/intrinsics/vx_intrinsics.s index 0d0fdfbb..a9904840 100644 --- a/runtime/intrinsics/vx_intrinsics.s +++ b/runtime/intrinsics/vx_intrinsics.s @@ -49,6 +49,19 @@ vx_threadID: csrr a0, 0x20 # read thread IDs ret +.type vx_getCycles, @function +.global vx_getCycles +vx_getCycles: + csrr a0, 0x26 # read cycle count + ret + + +.type vx_getInst, @function +.global vx_getInst +vx_getInst: + csrr a0, 0x25 # read instruction count + ret + .type vx_resetStack, @function .global vx_resetStack diff --git a/runtime/newlib/newlib.c b/runtime/newlib/newlib.c index 873ebf72..5195c05e 100644 --- a/runtime/newlib/newlib.c +++ b/runtime/newlib/newlib.c @@ -148,7 +148,7 @@ int _fstat(int file, struct stat * st) int _isatty (int file) { - vx_print_str("Hello from _isatty\n"); + // vx_print_str("Hello from _isatty\n"); return 1; } @@ -237,8 +237,8 @@ static int head_end = (int) 0x20000000; void * _sbrk (int nbytes) { - vx_print_str("Hello from _sbrk\n"); - vx_printf("nbytes:
", nbytes); + // vx_print_str("Hello from _sbrk\n"); + // vx_printf("nbytes: ", nbytes); //if (nbytes < 0) //vx_print_str("nbytes less than zero\n"); // printf("nBytes: %d\n", nbytes); @@ -248,7 +248,7 @@ void * _sbrk (int nbytes) nbytes = nbytes * -1; } - vx_printf("New nbytes: ", nbytes); + // vx_printf("New nbytes: ", nbytes); // if (nbytes > 10240) // { @@ -260,9 +260,9 @@ void * _sbrk (int nbytes) { int base = heap_start; heap_start += nbytes; - vx_print_str("_sbrk returning: "); - vx_print_hex((unsigned) base); - vx_print_str("\n"); + // vx_print_str("_sbrk returning: "); + // vx_print_hex((unsigned) base); + // vx_print_str("\n"); return (void *) base; } else diff --git a/runtime/startup/vx_start.s b/runtime/startup/vx_start.s index d73cb5f2..3cd421df 100644 --- a/runtime/startup/vx_start.s +++ b/runtime/startup/vx_start.s @@ -20,7 +20,7 @@ _start: # Initialize SP # la sp, __stack_top la a1, vx_set_sp - li a0, 4 + li a0, 32 .word 0x00b5106b # wspawn a0(numWarps), a1(PC SPAWN) jal vx_set_sp li a0, 1 @@ -46,7 +46,7 @@ _start: .type vx_set_sp, @function .global vx_set_sp vx_set_sp: - li a0, 4 + li a0, 32 .word 0x0005006b # tmc 4 .option push diff --git a/runtime/vx_api/vx_api.c b/runtime/vx_api/vx_api.c index 6cfc65b1..154e49ba 100644 --- a/runtime/vx_api/vx_api.c +++ b/runtime/vx_api/vx_api.c @@ -7,6 +7,8 @@ extern "C" { #endif +#define TOTAL_WARPS 2 +#define TOTAL_THREADS 16 func_t global_function_pointer; // void (func_t)(void *) @@ -46,16 +48,39 @@ uint8_t * pocl_args; uint8_t * pocl_ctx; vx_pocl_workgroup_func pocl_pfn; +unsigned global_z; +unsigned global_y; +unsigned global_x; + void pocl_spawn_real() { vx_tmc(pocl_threads); - int x = vx_threadID(); - int y = vx_warpID(); + int base_x = vx_threadID(); + int base_y = vx_warpID(); - (pocl_pfn)( pocl_args, pocl_ctx, x, y, 0); + int local_x; + int local_y; - if (y != 0) + for (int iter_z = 0; iter_z < global_z; iter_z++) + { + for (int iter_x = 0; iter_x < global_x; iter_x++) + { + for (int iter_y = 0; 
iter_y < global_y; iter_y++) + { + + local_x = (iter_x * TOTAL_THREADS) + base_x; + local_y = (iter_y * TOTAL_WARPS ) + base_y; + + (pocl_pfn)( pocl_args, pocl_ctx, local_x, local_y, iter_z); + + } + } + } + + // (pocl_pfn)( pocl_args, pocl_ctx, x, y, 0); + + if (base_y != 0) { vx_tmc(0); } @@ -66,24 +91,67 @@ void pocl_spawn_real() void pocl_spawn(struct context_t * ctx, const void * pfn, void * arguments) { - if (ctx->num_groups[2] > 1) + + // printf("ctx->num_groups[0]: %d\n", ctx->num_groups[0]); + // printf("ctx->num_groups[1]: %d\n", ctx->num_groups[1]); + // printf("ctx->num_groups[2]: %d\n", ctx->num_groups[2]); + + // printf("\n\n"); + + // printf("ctx->local_size[0]: %d\n", ctx->local_size[0]); + // printf("ctx->local_size[1]: %d\n", ctx->local_size[1]); + // printf("ctx->local_size[2]: %d\n", ctx->local_size[2]); + if (ctx->num_groups[0] > TOTAL_THREADS) { - printf("ERROR: pocl_spawn doesn't support Z dimension yet!\n"); - return; + pocl_threads = TOTAL_THREADS; + global_x = ctx->num_groups[0] / TOTAL_THREADS; + printf("pocl_threads: %d\n", pocl_threads); + // printf("global_x: %d\n", global_x); + } + else + { + pocl_threads = ctx->num_groups[0]; + global_x = 1; + // printf("pocl_threads: %d\n", pocl_threads); + // printf("global_x: %d\n", global_x); } - pocl_threads = ctx->num_groups[0]; + + global_z = ctx->num_groups[2]; pocl_pfn = (vx_pocl_workgroup_func) pfn; pocl_ctx = (uint8_t *) ctx; pocl_args = (uint8_t *) arguments; if (ctx->num_groups[1] > 1) { - vx_wspawn(ctx->num_groups[1], (unsigned) &pocl_spawn_real); + if (ctx->num_groups[1] > TOTAL_WARPS) + { + global_y = ctx->num_groups[1] / TOTAL_WARPS; + vx_wspawn(TOTAL_WARPS, (unsigned) &pocl_spawn_real); + // printf("global_y: %d\n", global_y); + // printf("Warps: %d\n", TOTAL_WARPS); + } + else + { + global_y = 1; + vx_wspawn(ctx->num_groups[1], (unsigned) &pocl_spawn_real); + // printf("global_y: %d\n", global_y); + // printf("Warps: %d\n", ctx->num_groups[1]); + } } + unsigned starting_cycles = 
vx_getCycles(); + unsigned starting_inst = vx_getInst(); + pocl_spawn_real(); + unsigned end_cycles = vx_getCycles(); + unsigned end_inst = vx_getInst(); + + + printf("pocl_spawn: Total Cycles: %d\n", (end_cycles - starting_cycles)); + printf("pocl_spawn: Total Inst : %d\n", (end_inst - starting_inst )); + // int z; // int y; // int x; diff --git a/simX/core.cpp b/simX/core.cpp index 7f43a39c..a7f32425 100644 --- a/simX/core.cpp +++ b/simX/core.cpp @@ -106,7 +106,7 @@ void Harp::reg_doWrite(Word cpuId, Word regNum) { #endif Core::Core(const ArchDef &a, Decoder &d, MemoryUnit &mem, Word id): - a(a), iDec(d), mem(mem), steps(4) + a(a), iDec(d), mem(mem), steps(4), num_cycles(0), num_instructions(0) { release_warp = false; foundSchedule = true; @@ -162,12 +162,13 @@ bool Core::interrupt(Word r0) { void Core::step() { - cout << "\n\n\n------------------------------------------------------\n"; + D(3, "\n\n\n------------------------------------------------------"); D(3, "Started core::step" << flush); steps++; - D(3, "CYCLE: " << steps); + this->num_cycles++; + D(3, "CYCLE: " << this->num_cycles); D(3, "Stalled Warps:"); for (int widd = 0; widd < a.getNWarps(); widd++) @@ -407,10 +408,12 @@ void Core::fetch() if (foundSchedule) { D(3, "Core step stepping warp " << schedule_w << '[' << w[schedule_w].activeThreads << ']'); + this->num_instructions = this->num_instructions + w[schedule_w].activeThreads; + // this->num_instructions++; w[schedule_w].step(&inst_in_fetch); D(3, "Now " << w[schedule_w].activeThreads << " active threads in " << schedule_w << flush); - this->getCacheDelays(&inst_in_fetch); + // this->getCacheDelays(&inst_in_fetch); D(3, "Got cache delays" << flush); if (inst_in_fetch.stall_warp) { @@ -444,7 +447,10 @@ void Core::fetch() { D(3, " 0"); } - if (j != w[schedule_w].tmask.size()-1 || schedule_w != w.size()-1) cout << ','; + if (j != w[schedule_w].tmask.size()-1 || schedule_w != w.size()-1) + { + D(3, ','); + } } D(3, "\nPrinted active threads" << 
flush); // #endif @@ -600,7 +606,7 @@ void Core::execute_unit() } else { - cout << "&&&&&&&&&&&&&&&&&&&&&&&& EXECUTE SRCS NOT READY\n"; + D(3, "&&&&&&&&&&&&&&&&&&&&&&&& EXECUTE SRCS NOT READY"); inst_in_scheduler.stalled = true; // INIT_TRACE(inst_in_exe); do_nothing = true; @@ -759,8 +765,9 @@ void Warp::step(trace_inst_t * trace_inst) { bool fetchMore; fetchMore = false; - unsigned fetchSize(wordSize - (pc+fetchPos)%wordSize); - fetchBuffer.resize(fetchPos + fetchSize); + // unsigned fetchSize(wordSize - (pc+fetchPos)%wordSize); + unsigned fetchSize = 4; + fetchBuffer.resize(fetchSize); Word fetched = core->mem.fetch(pc + fetchPos, supervisorMode); writeWord(fetchBuffer, fetchPos, fetchSize, fetched); decPos = 0; diff --git a/simX/include/archdef.h b/simX/include/archdef.h index b8599592..4a071c0d 100644 --- a/simX/include/archdef.h +++ b/simX/include/archdef.h @@ -23,8 +23,8 @@ namespace Harp { encChar = 'w'; nRegs = 32; nPRegs = 0; - nThds = 8; - nWarps = 8; + nThds = 32; + nWarps = 32; extent = EXT_WARPS; diff --git a/simX/include/core.h b/simX/include/core.h index 37f29054..b4cfd112 100644 --- a/simX/include/core.h +++ b/simX/include/core.h @@ -144,6 +144,8 @@ namespace Harp { Word interruptEntry; unsigned long steps; + unsigned long num_cycles; + unsigned long num_instructions; std::vector w; std::map > b; // Barriers int schedule_w; diff --git a/simX/instruction.cpp b/simX/instruction.cpp index 843e6e1d..991bea99 100644 --- a/simX/instruction.cpp +++ b/simX/instruction.cpp @@ -849,6 +849,12 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { { reg[rdest] = c.id; D(2, "CSR Reading wid " << hex << immsrc << dec << " and returning " << reg[rdest]); + } else if (immsrc == 0x25) + { + reg[rdest] = c.core->num_instructions; + } else if (immsrc == 0x26) + { + reg[rdest] = c.core->num_cycles; } // switch (func3) // { @@ -2225,7 +2231,7 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) { } // break; - cout << "outside case" << endl 
<< flush; + // cout << "outside case" << endl << flush; } diff --git a/simX/util.cpp b/simX/util.cpp index 9f4e9bdc..09b2ec0c 100644 --- a/simX/util.cpp +++ b/simX/util.cpp @@ -52,7 +52,7 @@ Byte Harp::readByte(const vector &b, Size &n) { } Word_u Harp::readWord(const vector &b, Size &n, Size wordSize) { - if (b.size() - n < wordSize) throw OutOfBytes(); + // if (b.size() - n < wordSize) throw OutOfBytes(); Word_u w(0); n += wordSize; // std::cout << "wordSize: " << wordSize << "\n";