From 466ac4e7cc5d30148e94d4b55a64ce99ef93bcc7 Mon Sep 17 00:00:00 2001 From: chad Date: Mon, 8 Sep 2014 18:38:13 -0400 Subject: [PATCH 1/4] Don't use qsim. --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index dd57e573..6cf0a435 100644 --- a/src/Makefile +++ b/src/Makefile @@ -8,7 +8,7 @@ PREFIX ?= /usr/local LIB_OBJS=args.o obj.o mem.o core.o instruction.o enc.o util.o lex.yy.o -all: harptool libharplib.so libharplib.a libqsim-harp.so +all: harptool libharplib.so libharplib.a # libqsim-harp.so # Use -static so we don't have to install the library in order to just run # Harptool. From 7529be422bfe3bfa289f382996a94eab7868702f Mon Sep 17 00:00:00 2001 From: chad Date: Mon, 8 Sep 2014 18:40:29 -0400 Subject: [PATCH 2/4] Beginnings of ipdom support. --- src/include/core.h | 10 ++++++++++ src/include/instruction.h | 3 ++- src/instruction.cpp | 2 ++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/include/core.h b/src/include/core.h index ce719bc1..d447b7d3 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -6,6 +6,7 @@ #include #include +#include #include "types.h" #include "archdef.h" @@ -47,6 +48,12 @@ namespace Harp { #endif }; + // Entry in the IPDOM Stack + struct DomStackEntry { + std::vector tmask; + Word pc; + }; + class Core { public: Core(const ArchDef &a, Decoder &d, MemoryUnit &mem, Word id=0); @@ -67,6 +74,9 @@ namespace Harp { std::vector > > reg; std::vector > > pred; + std::vector tmask; + std::stack domStack; + std::vector shadowReg; std::vector shadowPReg; diff --git a/src/include/instruction.h b/src/include/instruction.h index ed24eeab..d421cd6a 100644 --- a/src/include/instruction.h +++ b/src/include/instruction.h @@ -30,7 +30,8 @@ namespace Harp { JALI, JALR, JMPI, JMPR, CLONE, JALIS, JALRS, JMPRT, LD, ST, LDI, RTOP, ANDP, ORP, XORP, NOTP, ISNEG, ISZERO, HALT, TRAP, JMPRU, SKEP, RETI, TLBRM, - ITOF, FTOI, FADD, FSUB, FMUL, FDIV, FNEG, WSPAWN }; + ITOF, FTOI, 
FADD, FSUB, FMUL, FDIV, FNEG, WSPAWN, + SPLIT, JOIN }; enum ArgClass { AC_NONE, AC_2REG, AC_2IMM, AC_3REG, AC_3PREG, AC_3IMM, AC_3REGSRC, AC_1IMM, AC_1REG, AC_3IMMSRC, AC_PREG_REG, AC_2PREG, AC_2REGSRC diff --git a/src/instruction.cpp b/src/instruction.cpp index 61ec008e..bf7f0c1a 100644 --- a/src/instruction.cpp +++ b/src/instruction.cpp @@ -81,6 +81,8 @@ Instruction::InstTableEntry Instruction::instTable[] = { {"fdiv", false, false, false, false, AC_3REG, ITYPE_FPDIV }, {"fneg", false, false, false, false, AC_2REG, ITYPE_FPBASIC }, {"wspawn", false, false, true, false, AC_2REGSRC, ITYPE_NULL }, + {"split", false, false, true, false, AC_NONE, ITYPE_NULL }, + {"join", false, false, true, false, AC_NONE, ITYPE_NULL }, {NULL,false,false,false,false,AC_NONE,ITYPE_NULL}/////// End of table. }; From 56aaff1f8784a4ff34dbdfd12cb921643bb7a085 Mon Sep 17 00:00:00 2001 From: cdkersey Date: Tue, 9 Sep 2014 03:08:23 -0400 Subject: [PATCH 3/4] Fully-functioning spawn and join instructions. --- src/core.cpp | 7 +++++ src/include/core.h | 26 ++++++++++++++---- src/instruction.cpp | 37 +++++++++++++++++++++----- src/test/Makefile | 53 ++++++------------------------------- src/test/diverge.s | 64 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 131 insertions(+), 56 deletions(-) create mode 100644 src/test/diverge.s diff --git a/src/core.cpp b/src/core.cpp index a4fc57aa..5413d71a 100644 --- a/src/core.cpp +++ b/src/core.cpp @@ -46,6 +46,8 @@ Core::Core(const ArchDef &a, Decoder &d, MemoryUnit &mem, Word id) : for (Word i = 0; i < a.getNPRegs(); ++i) { pred[j].push_back(Reg(id, regNum++)); } + + tmask.push_back(true); } /* Set initial register contents. 
*/ @@ -131,6 +133,11 @@ void Core::step() { D_RAW(" ("); for (unsigned i = 0; i < shadowPReg.size(); ++i) D_RAW(shadowPReg[i]); D_RAW(')' << endl); + + D(3, "Thread mask:"); + D_RAW(" "); + for (unsigned i = 0; i < tmask.size(); ++i) D_RAW(tmask[i] << ' '); + D_RAW(endl); } #endif diff --git a/src/include/core.h b/src/include/core.h index d447b7d3..adf57d46 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -27,7 +27,7 @@ namespace Harp { Reg &operator=(T r) { val = r; doWrite(); return *this; } - operator T() { doRead(); return val; } + operator T() const { doRead(); return val; } void trunc(Size s) { Word mask((~0ull >> (sizeof(Word)-s)*8)); @@ -40,16 +40,32 @@ namespace Harp { #ifdef EMU_INSTRUMENTATION /* Access size here is 8, representing the register size of 64-bit cores. */ - void doWrite() { reg_doWrite(cpuId, regNum); } - void doRead() { reg_doRead(cpuId, regNum); } + void doWrite() const { reg_doWrite(cpuId, regNum); } + void doRead() const { reg_doRead(cpuId, regNum); } #else - void doWrite() {} - void doRead() {} + void doWrite() const {} + void doRead() const {} #endif }; // Entry in the IPDOM Stack struct DomStackEntry { + DomStackEntry( + unsigned p, const std::vector > >& m, Word pc + ): pc(pc), fallThrough(false) + { + std::cout << "New DomStackEntry:"; + for (unsigned i = 0; i < m.size(); ++i) { + tmask.push_back(!bool(m[i][p])); + std::cout << ' ' << bool(m[i][p]); + } + std::cout << std::endl; + } + + DomStackEntry(const std::vector &tmask): + tmask(tmask), fallThrough(true) {} + + bool fallThrough; std::vector tmask; Word pc; }; diff --git a/src/instruction.cpp b/src/instruction.cpp index bf7f0c1a..cba87abf 100644 --- a/src/instruction.cpp +++ b/src/instruction.cpp @@ -121,12 +121,16 @@ void Instruction::executeOn(Core &c) { return; } - /* Also throw exceptions on divergent branches. 
*/ - if (predicated && instTable[op].controlFlow) { - bool p0 = c.pred[0][pred]; - for (Size t = 1; t < c.activeThreads; t++) { - if (c.pred[t][pred] != p0) throw DivergentBranchException(); + /* Also throw exceptions on non-masked divergent branches. */ + if (instTable[op].controlFlow) { + Size t, count, active; + for (t = 0, count = 0, active = 0; t < c.activeThreads; ++t) { + if ((!predicated || c.pred[t][pred]) && c.tmask[t]) ++count; + if (c.tmask[t]) ++active; } + + if (count != 0 && count != active) + throw DivergentBranchException(); } Size nextActiveThreads = c.activeThreads; @@ -135,8 +139,12 @@ void Instruction::executeOn(Core &c) { for (Size t = 0; t < c.activeThreads; t++) { vector > ®(c.reg[t]); vector > &pReg(c.pred[t]); + stack &domStack(c.domStack); - if (predicated && !pReg[pred]) continue; + // If this thread is masked out, don't execute the instruction, unless it's + // a split or join. + if (((predicated && !pReg[pred]) || !c.tmask[t]) && + op != SPLIT && op != JOIN) continue; Word memAddr; switch (op) { @@ -282,6 +290,23 @@ void Instruction::executeOn(Core &c) { case FDIV: reg[rdest] = Float(double(Float(reg[rsrc[0]], wordSz)) / double(Float(reg[rsrc[1]], wordSz)),wordSz); break; + case SPLIT:if (t == 0) { + // TODO: if mask becomes all-zero, fall through + DomStackEntry e(pred, c.pred, c.pc); + c.domStack.push(c.tmask); + c.domStack.push(e); + for (unsigned i = 0; i < e.tmask.size(); ++i) + c.tmask[i] = !e.tmask[i]; + } + break; + case JOIN: if (t == 0) { + // TODO: if mask becomes all-zero, fall through + if (!c.domStack.top().fallThrough) + c.pc = c.domStack.top().pc; + c.tmask = c.domStack.top().tmask; + c.domStack.pop(); + } + break; default: cout << "ERROR: Unsupported instruction: " << *this << "\n"; exit(1); diff --git a/src/test/Makefile b/src/test/Makefile index 3f7db684..7f2130bf 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -5,12 +5,14 @@ HARPDIS = ../harptool -D 4BARCH = 4b16/16/2 all: simple.bin sieve.bin 2thread.bin 
simple.4b.bin sieve.4b.bin 2thread.4b.bin bubble.bin bubble.4b.bin dotprod.bin dotprod.4b.bin matmul.bin matmul.4b.bin \ - matmul-mt.s + matmul-mt.bin diverge.bin run: simple.out sieve.out 2thread.out simple.4b.out sieve.4b.out 2thread.4b.out bubble.out bubble.4b.out dotprod.out dotprod.4b.out matmul.out matmul.4b.out\ - matmul-mt.out + matmul-mt.out diverge.out -disas: simple.d sieve.d 2thread.d simple.4b.d sieve.4b.d 2thread.4b.d bubble.d bubble.4b.d dotprod.d dotprod.4b.d matmul.d matmul.4b.d matmul-mt.d +disas: simple.d sieve.d 2thread.d simple.4b.d sieve.4b.d 2thread.4b.d bubble.d \ + bubble.4b.d dotprod.d dotprod.4b.d matmul.d matmul.4b.d matmul-mt.d \ + diverge.d diverge.4b.d %.4b.out : %.4b.bin $(HARPEM) -a $(4BARCH) -c $< > $@ @@ -18,50 +20,11 @@ disas: simple.d sieve.d 2thread.d simple.4b.d sieve.4b.d 2thread.4b.d bubble.d b %.out : %.bin $(HARPEM) -c $< > $@ -2thread.bin : boot.HOF lib.HOF 2thread.HOF - $(HARPLD) -o 2thread.bin $^ - -2thread.4b.bin : boot.4b.HOF lib.4b.HOF 2thread.4b.HOF - $(HARPLD) --arch $(4BARCH) -o 2thread.4b.bin $^ - -bubble.bin : boot.HOF lib.HOF bubble.HOF - $(HARPLD) -o bubble.bin $^ - -bubble.4b.bin : boot.4b.HOF lib.4b.HOF bubble.4b.HOF - $(HARPLD) --arch $(4BARCH) -o bubble.4b.bin $^ - -simple.bin : boot.HOF lib.HOF simple.HOF - $(HARPLD) -o $@ $^ - -sieve.bin : boot.HOF lib.HOF sieve.HOF - $(HARPLD) -o $@ $^ - -dotprod.bin : boot.HOF lib.HOF dotprod.HOF - $(HARPLD) -o $@ $^ - -matmul.bin : boot.HOF lib.HOF matmul.HOF - $(HARPLD) -o $@ $^ - -matmul-mt.bin : boot.HOF lib.HOF matmul-mt.HOF - $(HARPLD) -o $@ $^ - -simple.4b.bin : boot.4b.HOF lib.4b.HOF simple.4b.HOF +%.4b.bin : boot.4b.HOF lib.4b.HOF %.4b.HOF $(HARPLD) --arch $(4BARCH) -o $@ $^ -sieve.4b.bin : boot.4b.HOF lib.4b.HOF sieve.4b.HOF - $(HARPLD) --arch $(4BARCH) -o $@ $^ - -dotprod.4b.bin : boot.4b.HOF lib.4b.HOF dotprod.4b.HOF - $(HARPLD) --arch $(4BARCH) -o $@ $^ - -matmul.4b.bin : boot.4b.HOF lib.4b.HOF matmul.4b.HOF - $(HARPLD) --arch $(4BARCH) -o $@ $^ - 
-%.4b.bin : %.4b.HOF - $(HARPLD) --arch $(4BARCH) -o $@ $< - -%.bin : %.HOF - $(HARPLD) -o $@ $< +%.bin : boot.HOF lib.HOF %.HOF + $(HARPLD) -o $@ $^ %.4b.HOF : %.s $(HARPAS) --arch $(4BARCH) -o $@ $< diff --git a/src/test/diverge.s b/src/test/diverge.s new file mode 100644 index 00000000..197c5461 --- /dev/null +++ b/src/test/diverge.s @@ -0,0 +1,64 @@ +/******************************************************************************* + Harptools by Chad D. Kersey, Summer 2011 +******************************************************************************** + + Sample HARP assembly program. + +*******************************************************************************/ +/* Divergent branch: test immediate postdominator branch divergence support. */ +.def THREADS 8 + +.align 4096 +.perm x +.entry +.global +entry: + ldi %r0, #1 + ldi %r1, THREADS +sloop: clone %r0 + + addi %r0, %r0, #1 + sub %r2, %r1, %r0 + rtop @p0, %r2 + @p0 ? jmpi sloop + + ldi %r0, #0 + jalis %r5, %r1, dthread; + + ldi %r0, #0 + ldi %r1, (__WORD * THREADS) + +ploop: ld %r7, %r0, array + jali %r5, printdec + + addi %r0, %r0, __WORD + sub %r7, %r1, %r0 + rtop @p0, %r7 + @p0 ? jmpi ploop + + trap; + + +dthread: ldi %r1, #10 + ldi %r2, #0 + +loop: andi %r3, %r0, #1 + rtop @p1, %r3 + @p1 ? split + @p1 ? jmpi else + add %r2, %r2, %r0 + jmpi after +else: sub %r2, %r2, %r0 +after: join + + subi %r1, %r1, #1 + rtop @p0, %r1 + @p0 ? jmpi loop + + shli %r4, %r0, (`__WORD) + st %r2, %r4, array + + jmprt %r5; + +.align 4096 +array: .space 4096 From dde43648fd6a322700ee4b6f9d2e631e9b83d098 Mon Sep 17 00:00:00 2001 From: cdkersey Date: Tue, 9 Sep 2014 03:19:05 -0400 Subject: [PATCH 4/4] Documented split and join. 
--- doc/harp_iset.tex | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/harp_iset.tex b/doc/harp_iset.tex index 11415dc6..b066bc99 100644 --- a/doc/harp_iset.tex +++ b/doc/harp_iset.tex @@ -132,7 +132,8 @@ The bit fields in the instruction encodings depend heavily on this quality. 30 "skep" 1REG 31 "reti" NONE 32 "tlbrm" 1REG 33 "itof" 2REG 34 "ftoi" 2REG 35 "fadd" 3REG 36 "fsub" 3REG 37 "fmul" 3REG 38 "fdiv" 3REG - 39 "fneg" 2REG + 39 "fneg" 2REG 3a "wspawn" 2REG 3b "split" NONE + 3c "join" NONE \end{verbatim} \subsection{Word Encoding} @@ -346,6 +347,8 @@ format, which can be fixed point or floating point. \texttt{jalis} \%link, \%n, \textsc{\#RelDest}&Jump and link immediate, spawning N active lanes.\\ \texttt{jalrs} \%link, \%n, \%dest&Jump and link indirect, spawning N active lanes.\\ \texttt{jmprt} \%addr&Jump indirect, terminating execution on all but a single lane.\\ +\texttt{split}&Control flow diverge.\\ +\texttt{join}&Control flow reconverge.\\ \end{tabular} \end{center} @@ -412,6 +415,14 @@ The current response to this is to trap to the operating system (interrupt numbe The \texttt{clone}, \texttt{jalis}, \texttt{jalrs}, and \texttt{jmprt} instructions form the basis of SIMD context control in the HARP instruction set. Context is created using \texttt{clone}, the waiting threads are spawned using \texttt{jalrs} or \texttt{jalis}, ``jump-and-link immediate/register and spawn'', and finally the parallel section returns using \texttt{jmprt}, ``jump register and terminate'', best thought of as ``return and terminate.'' +There are times when a control flow operation will need to be predicated, going one direction on some lanes and the other direction on other lanes. +For this, the HARP instruction set provides the \texttt{split} and \texttt{join} instructions. +When a predicated \texttt{split} is first encountered, only the lanes for which the \texttt{split}'s predicate is true are allowed to continue. 
+The other lanes are masked out until the corresponding \texttt{join} is encountered. +The first time \texttt{join} is reached, control flow returns to the instruction following the corresponding \texttt{split} with the set of masked-out lanes complemented. +The second time the same \texttt{join} is reached, control flow falls through and the original lane mask is restored. +A hardware stack is maintained to keep track of nested \texttt{split}s. + \section{Default I/O Devices} The emulator currently only supports a single I/O device, simple console I/O. Writing to the address \texttt{0x800...0} (an address with its MSB set and all other bits cleared) causes text to be written to the display.