diff --git a/src/test/Makefile b/src/test/Makefile
index b3f77bc7..3f7db684 100644
--- a/src/test/Makefile
+++ b/src/test/Makefile
@@ -4,11 +4,13 @@ HARPEM  = ../harptool -E
 HARPDIS = ../harptool -D
 4BARCH  = 4b16/16/2
 
-all: simple.bin sieve.bin 2thread.bin simple.4b.bin sieve.4b.bin 2thread.4b.bin bubble.bin bubble.4b.bin dotprod.bin dotprod.4b.bin matmul.bin matmul.4b.bin
+all: simple.bin sieve.bin 2thread.bin simple.4b.bin sieve.4b.bin 2thread.4b.bin bubble.bin bubble.4b.bin dotprod.bin dotprod.4b.bin matmul.bin matmul.4b.bin \
+ matmul-mt.bin # build the linked image (not the .s source) like every other test
 
-run: simple.out sieve.out 2thread.out simple.4b.out sieve.4b.out 2thread.4b.out bubble.out bubble.4b.out dotprod.out dotprod.4b.out matmul.out matmul.4b.out
+run: simple.out sieve.out 2thread.out simple.4b.out sieve.4b.out 2thread.4b.out bubble.out bubble.4b.out dotprod.out dotprod.4b.out matmul.out matmul.4b.out \
+ matmul-mt.out
 
-disas: simple.d sieve.d 2thread.d simple.4b.d sieve.4b.d 2thread.4b.d bubble.d bubble.4b.d dotprod.d dotprod.4b.d matmul.d matmul.4b.d
+disas: simple.d sieve.d 2thread.d simple.4b.d sieve.4b.d 2thread.4b.d bubble.d bubble.4b.d dotprod.d dotprod.4b.d matmul.d matmul.4b.d matmul-mt.d
 
 %.4b.out : %.4b.bin
 	$(HARPEM) -a $(4BARCH) -c $< > $@
@@ -40,6 +42,9 @@ dotprod.bin : boot.HOF lib.HOF dotprod.HOF
 matmul.bin : boot.HOF lib.HOF matmul.HOF
 	$(HARPLD) -o $@ $^
 
+matmul-mt.bin : boot.HOF lib.HOF matmul-mt.HOF # same boot + lib link recipe as matmul.bin
+	$(HARPLD) -o $@ $^
+
 simple.4b.bin : boot.4b.HOF lib.4b.HOF simple.4b.HOF
 	$(HARPLD) --arch $(4BARCH) -o $@ $^
 
diff --git a/src/test/matmul-mt.s b/src/test/matmul-mt.s
index 643942ce..8caca0dd 100644
--- a/src/test/matmul-mt.s
+++ b/src/test/matmul-mt.s
@@ -41,7 +41,7 @@ ploop: ld %r7, %r1, #0;
 matgen: ldi %r2, #0;
         st %r5, %r2, retaddr;
         ldi %r2, #1;
-        shli %r2, %r2, (`__WORD + 1); /* Multiply r0 by 2*__WORD */
+        shl %r2, %r2, %r1; /* With the next shl: %r2 = 1 << (2 * %r1) */
         shl %r2, %r2, %r1;
         ori %r3, %r0, #0;
 
@@ -59,7 +59,8 @@ mgloop: jali %r5, randf;
 /* Write the matrix product of square matrix at (%r0) and (%r1) to (%r2). The
    size of these matrices is 2^Nx2^N, where N = %r3 */
 
-matmul: ldi %r4, #1;
+matmul: ori %r22, %r5, #0; /* Copy the return address out of %r5; the final jmpr uses %r22 */
+        ldi %r4, #1;
         ldi %r10, (`__WORD); /* ` is the log base 2 operator */
         shl %r4, %r4, %r3;
         add %r10, %r10, %r3;
@@ -67,9 +68,25 @@ matmul: ldi %r4, #1;
         shl %r14, %r14, %r10;
 
         divi %r17, %r14, THREADS; /* Spawn threads */
-sloop:  
+        ori %r18, %r0, #0; /* Remember the incoming %r0 (first matrix pointer) */
+        ori %r19, %r2, #0; /* Remember the incoming %r2 (destination pointer) */
+        ldi %r20, #0; /* Loop counter, compared against THREADS below */
+sloop:  add %r0, %r0, %r17; /* Advance both pointers by %r17 (= %r14 / THREADS) */
+        add %r2, %r2, %r17;
+        addi %r20, %r20, #1;
+        subi %r21, %r20, THREADS; /* Zero once %r20 reaches THREADS */
+        rtop @p0, %r21;
+        notp @p1, @p0;
+  @p1 ? clone %r20; /* NOTE(review): if rtop sets @p0 for nonzero %r21, this clone runs only when %r20 == THREADS; confirm @p1 rather than @p0 is intended */
+  @p0 ? jmpi sloop;
 
-        jmpr %r5;
+        ori %r0, %r18, #0; /* Restore the pointers saved before sloop */
+        ori %r2, %r19, #0;
+        clone %r20;
+
+         jalis %r5, matmulthd; /* Links through %r5; presumably why the entry code moved the return address to %r22 */
+
+        jmpr %r22; /* Return through the address saved at entry */
 
 /* One thread of matrix multiplication. Expected register values at start:
  *   %r0 - matrix a pointer (plus offset)
@@ -115,10 +132,11 @@ iloop:     ld %r7, %r11, #0;
 
            jmprt %r5;
 
-.align 4096
 .perm rw
-matrix_a: .space 64;
-matrix_b: .space 64;
+.align 4096
+/* Matrix buffers, placed after the .perm rw and .align 4096 directives. */
+matrix_a: .space 64
+matrix_b: .space 64
 matrix_r: .space 64
 
 retaddr: .word 0