integer reduction unit

This commit is contained in:
joshua
2024-03-06 01:39:17 -08:00
parent 5f2b10b8a6
commit beb3dce46d
7 changed files with 587 additions and 10 deletions

View File

@@ -115,7 +115,7 @@
///////////////////////////////////////////////////////////////////////////////
`define INST_OP_BITS 4
`define INST_MOD_BITS 3
`define INST_MOD_BITS 4
`define INST_FMT_BITS 2
///////////////////////////////////////////////////////////////////////////////
@@ -140,6 +140,7 @@
`define INST_ALU_IS_BR(mod) mod[0]
`define INST_ALU_IS_M(mod) mod[1]
`define INST_ALU_IS_W(mod) mod[2]
`define INST_ALU_IS_RED(mod) mod[3]
`define INST_BR_EQ 4'b0000
`define INST_BR_NE 4'b0010
@@ -176,6 +177,17 @@
`define INST_M_SIGNED_A(op) (op[1:0] != 1)
`define INST_M_IS_REM(op) op[1]
// Reduction op_type encodings.
// For ADD/MIN/MAX, bit 3 distinguishes the unsigned (U) variant from the
// signed one; the low bits select the operation.
`define INST_RED_ADD 4'b0000
`define INST_RED_ADDU 4'b1000
`define INST_RED_MIN 4'b0001
`define INST_RED_MINU 4'b1001
`define INST_RED_MAX 4'b0010
`define INST_RED_MAXU 4'b1010
// Bitwise reductions have a single encoding each.
`define INST_RED_AND 4'b0011
`define INST_RED_OR 4'b0100
`define INST_RED_XOR 4'b0101
// Width, in bits, of a reduction op_type value.
`define INST_RED_BITS 4
`define INST_FMT_B 3'b000
`define INST_FMT_H 3'b001
`define INST_FMT_W 3'b010

View File

@@ -33,7 +33,7 @@ module VX_alu_unit #(
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + `EXT_M_ENABLED;
localparam RSP_ARB_SIZE = 2 + `EXT_M_ENABLED;
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
VX_execute_if #(
@@ -60,12 +60,13 @@ module VX_alu_unit #(
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
wire is_muldiv_op;
wire is_reduce_op;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) int_execute_if();
assign int_execute_if.valid = execute_if[block_idx].valid && ~is_muldiv_op;
assign int_execute_if.valid = execute_if[block_idx].valid && ~is_muldiv_op && ~is_reduce_op;
assign int_execute_if.data = execute_if[block_idx].data;
VX_commit_if #(
@@ -86,6 +87,31 @@ module VX_alu_unit #(
.commit_if (int_commit_if)
);
assign is_reduce_op = `INST_ALU_IS_RED(execute_if[block_idx].data.op_mod);
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) red_execute_if();
assign red_execute_if.valid = execute_if[block_idx].valid && is_reduce_op;
assign red_execute_if.data = execute_if[block_idx].data;
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) red_commit_if();
`RESET_RELAY(red_reset, reset);
VX_reduce_unit #(
.CORE_ID(CORE_ID),
.NUM_LANES(NUM_LANES)
) reduce_unit (
.clk(clk),
.reset(red_reset),
.execute_if(red_execute_if),
.commit_if(red_commit_if)
);
`ifdef EXT_M_ENABLE
assign is_muldiv_op = `INST_ALU_IS_M(execute_if[block_idx].data.op_mod);
@@ -96,7 +122,7 @@ module VX_alu_unit #(
.NUM_LANES (NUM_LANES)
) mdv_execute_if();
assign mdv_execute_if.valid = execute_if[block_idx].valid && is_muldiv_op;
assign mdv_execute_if.valid = execute_if[block_idx].valid && is_muldiv_op && ~is_reduce_op;
assign mdv_execute_if.data = execute_if[block_idx].data;
VX_commit_if #(
@@ -113,12 +139,12 @@ module VX_alu_unit #(
.commit_if (mdv_commit_if)
);
assign execute_if[block_idx].ready = is_muldiv_op ? mdv_execute_if.ready : int_execute_if.ready;
assign execute_if[block_idx].ready = is_reduce_op ? red_execute_if.ready : (is_muldiv_op ? mdv_execute_if.ready : int_execute_if.ready);
`else
assign is_muldiv_op = 0;
assign execute_if[block_idx].ready = int_execute_if.ready;
assign execute_if[block_idx].ready = is_reduce_op ? red_execute_if.ready : int_execute_if.ready;
`endif
@@ -135,19 +161,22 @@ module VX_alu_unit #(
`ifdef EXT_M_ENABLE
mdv_commit_if.valid,
`endif
int_commit_if.valid
int_commit_if.valid,
red_commit_if.valid
}),
.ready_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.ready,
`endif
int_commit_if.ready
int_commit_if.ready,
red_commit_if.ready
}),
.data_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.data,
`endif
int_commit_if.data
int_commit_if.data,
red_commit_if.data
}),
.data_out (commit_block_if[block_idx].data),
.valid_out (commit_block_if[block_idx].valid),

View File

@@ -505,6 +505,34 @@ module VX_decode #(
default:;
endcase
end
// Reduction instructions: executed by the ALU with op_mod[3] set.
// Reads rs1 and writes rd; func7 selects the reduction operation.
`INST_EXT3: begin
ex_type = `EX_ALU;
op_mod[3] = 1;
`USED_IREG(rs1);
`USED_IREG(rd);
// func7[5:0] picks the operation; func7[6] picks the unsigned variant
// for add/min/max and is ignored for and/or/xor.
case (func7[5:0])
6'h0: begin
op_type = func7[6] ? `INST_RED_ADDU : `INST_RED_ADD;
end
6'h1: begin
op_type = func7[6] ? `INST_RED_MINU : `INST_RED_MIN;
end
6'h2: begin
op_type = func7[6] ? `INST_RED_MAXU : `INST_RED_MAX;
end
6'h3: begin
op_type = `INST_RED_AND;
end
6'h4: begin
op_type = `INST_RED_OR;
end
6'h5: begin
op_type = `INST_RED_XOR;
end
// NOTE(review): any other func7[5:0] leaves op_type at the value assigned
// before this case while ex_type/op_mod are still set - confirm that
// unsupported encodings are rejected elsewhere.
default:;
endcase
end
default:;
endcase
end

View File

@@ -0,0 +1,283 @@
`include "VX_define.vh"
`include "VX_platform.vh"
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
// Mask-aware reduction tree built by recursive self-instantiation.
//
// Parameters:
//   DATAW_IN  - width of each input element
//   DATAW_OUT - width of the reduced result (defaults to DATAW_IN)
//   N         - number of input elements
// Ports:
//   data_in   - N packed elements to reduce
//   mask      - one enable bit per element; a half whose mask bits are all
//               zero does not contribute to the result
//   op_type   - reduction operation (one of the INST_RED_* encodings)
//   data_out  - reduced value
module VX_reduce_ext #(
    parameter DATAW_IN  = 1,
    parameter DATAW_OUT = DATAW_IN,
    parameter N         = 1
) (
    input wire [N-1:0][DATAW_IN-1:0]  data_in,
    input wire [N-1:0]                mask,
    input wire [`INST_RED_BITS-1:0]   op_type,
    output wire [DATAW_OUT-1:0]       data_out
);
    if (N == 1) begin
        // Leaf of the tree: forward the single element; masking is resolved
        // by the parent node.
        `UNUSED_VAR(op_type)
        `UNUSED_VAR(mask)
        assign data_out = DATAW_OUT'(data_in[0]);
    end else begin
        // Split the inputs into a lower half (LO_N elements) and an upper
        // half (the remaining HI_N elements).
        localparam int LO_N = N / 2;
        localparam int HI_N = N - LO_N;

        wire [LO_N-1:0][DATAW_IN-1:0] lo_data;
        wire [HI_N-1:0][DATAW_IN-1:0] hi_data;
        wire [LO_N-1:0]               lo_mask;
        wire [HI_N-1:0]               hi_mask;
        wire [DATAW_OUT-1:0]          lo_result;
        wire [DATAW_OUT-1:0]          hi_result;

        assign lo_data = data_in[LO_N-1:0];
        assign hi_data = data_in[N-1:LO_N];
        assign lo_mask = mask[LO_N-1:0];
        assign hi_mask = mask[N-1:LO_N];

        // A half takes part in the reduction only if one of its elements is enabled.
        wire lo_active = (| lo_mask);
        wire hi_active = (| hi_mask);

        VX_reduce_ext #(
            .DATAW_IN  (DATAW_IN),
            .DATAW_OUT (DATAW_OUT),
            .N         (LO_N)
        ) reduce_A (
            .data_in  (lo_data),
            .mask     (lo_mask),
            .op_type  (op_type),
            .data_out (lo_result)
        );

        VX_reduce_ext #(
            .DATAW_IN  (DATAW_IN),
            .DATAW_OUT (DATAW_OUT),
            .N         (HI_N)
        ) reduce_B (
            .data_in  (hi_data),
            .mask     (hi_mask),
            .op_type  (op_type),
            .data_out (hi_result)
        );

        // Comparisons shared by the min/max variants.
        wire lo_lt_hi_signed   = ($signed(lo_result) < $signed(hi_result));
        wire lo_lt_hi_unsigned = (lo_result < hi_result);

        logic [DATAW_OUT-1:0] combined;

        always @(*) begin
            case (op_type)
            `INST_RED_ADD,
            `INST_RED_ADDU: combined = lo_result + hi_result;
            `INST_RED_MIN:  combined = lo_lt_hi_signed   ? lo_result : hi_result;
            `INST_RED_MINU: combined = lo_lt_hi_unsigned ? lo_result : hi_result;
            `INST_RED_MAX:  combined = lo_lt_hi_signed   ? hi_result : lo_result;
            `INST_RED_MAXU: combined = lo_lt_hi_unsigned ? hi_result : lo_result;
            `INST_RED_AND:  combined = lo_result & hi_result;
            `INST_RED_OR:   combined = lo_result | hi_result;
            `INST_RED_XOR:  combined = lo_result ^ hi_result;
            default:        combined = lo_result;
            endcase
        end

        // Only merge when both halves are active; otherwise pass the active
        // half through. With neither half active the upper result is
        // forwarded and its value is a don't-care.
        assign data_out = lo_active ? (hi_active ? combined : lo_result) : hi_result;
    end
endmodule
// Integer reduction unit.
//
// Reduces the rs1 operand across the active lanes of every packet of an
// instruction (a packet sequence is terminated by eop), then commits one
// response per accepted packet in which every lane carries the reduced value.
//
// Parameters:
//   CORE_ID   - unused here
//   NUM_LANES - lanes processed per packet
// Ports:
//   clk, reset - clock and synchronous reset
//   execute_if - incoming operands (slave side)
//   commit_if  - outgoing results (master side)
module VX_reduce_unit #(
    parameter CORE_ID   = 0,
    parameter NUM_LANES = 1
) (
    input wire clk,
    input wire reset,
    VX_execute_if.slave execute_if,
    VX_commit_if.master commit_if
);
    `UNUSED_PARAM(CORE_ID)

    localparam NUM_PACKETS = `NUM_THREADS / NUM_LANES;
    localparam PID_BITS    = `CLOG2(`NUM_THREADS / NUM_LANES);
    localparam PID_WIDTH   = `UP(PID_BITS);
    // The packet counter has PID_BITS+1 bits, as in the original declaration.
    localparam SIZE_WIDTH  = PID_BITS + 1;

    // FSM states:
    //   IDLE       - wait for the first packet; its lane reduction seeds the accumulator
    //   ACCUMULATE - fold every further packet into the accumulator until eop
    //   BROADCAST  - emit one commit per stored packet, carrying the accumulator
    //   FINISH     - release the last (eop) packet, which was held on execute_if
    //                so its uuid/wid/PC/wb/rd stayed available during BROADCAST
    localparam IDLE       = 2'b00;
    localparam ACCUMULATE = 2'b01;
    localparam BROADCAST  = 2'b10;
    localparam FINISH     = 2'b11;

    logic [1:0] state, state_n;

    logic [`XLEN-1:0] accumulator, accumulator_n, reduced_accumulator;
    wire [(NUM_LANES * `XLEN)-1:0] broadcasted_accumulator;
    assign broadcasted_accumulator = {NUM_LANES{accumulator}};

    wire eop;
    wire [NUM_LANES-1:0][`XLEN-1:0] data_in;
    wire [`XLEN-1:0] data_out;
    assign eop     = execute_if.data.eop;
    assign data_in = execute_if.data.rs1_data;

    logic execute_if_valid;
    logic execute_if_ready;
    logic commit_if_valid;
    logic commit_if_ready;
    assign execute_if_valid = execute_if.valid;
    assign execute_if.ready = execute_if_ready;

    // Push / pop strobes of the per-packet {tmask, pid, sop, eop} store.
    logic store_tmask_pid;
    logic read_tmask_pid;

    wire [PID_WIDTH-1:0] stored_pid;
    wire [NUM_LANES-1:0] stored_tmask;
    wire stored_sop;
    wire stored_eop;

    // Number of packets stored and not yet committed.
    logic [SIZE_WIDTH-1:0] size, size_n;

    always @(*) begin
        state_n          = state;
        accumulator_n    = accumulator;
        size_n           = size;
        execute_if_ready = 1'b0;
        commit_if_valid  = 1'b0;
        store_tmask_pid  = 1'b0;
        read_tmask_pid   = 1'b0;

        case (state)
        IDLE: begin
            if (execute_if_valid) begin
                accumulator_n   = data_out;
                store_tmask_pid = 1'b1;
                size_n          = size + SIZE_WIDTH'(1);
                if (eop) begin
                    // Single-packet instruction: keep the packet on the
                    // input (ready stays low) and go straight to broadcast.
                    state_n = BROADCAST;
                end else begin
                    execute_if_ready = 1'b1;
                    state_n = ACCUMULATE;
                end
            end
        end
        ACCUMULATE: begin
            // Accept every packet except the last one, which must stay on
            // the input until FINISH. eop is only meaningful while valid.
            execute_if_ready = ~(execute_if_valid && eop);
            if (execute_if_valid) begin
                accumulator_n   = reduced_accumulator;
                store_tmask_pid = 1'b1;
                size_n          = size + SIZE_WIDTH'(1);
                if (eop) begin
                    state_n = BROADCAST;
                end
            end
        end
        BROADCAST: begin
            commit_if_valid = 1'b1;
            if (commit_if_ready) begin
                // One response accepted: pop its stored tmask/pid entry.
                read_tmask_pid = 1'b1;
                size_n         = size - SIZE_WIDTH'(1);
            end
            // Leave only once the last stored packet has been committed.
            if (size_n == '0) begin
                state_n = FINISH;
            end
        end
        FINISH: begin
            execute_if_ready = 1'b1;
            if (execute_if_valid) begin
                state_n = IDLE;
            end
        end
        default:;
        endcase
    end

    always @(posedge clk) begin
        if (reset) begin
            accumulator <= '0;
            state       <= IDLE;
            size        <= '0;
        end else begin
            accumulator <= accumulator_n;
            state       <= state_n;
            size        <= size_n;
        end
    end

    // Reduce the active lanes of the current packet.
    VX_reduce_ext #(
        .DATAW_IN (`XLEN),
        .N        (NUM_LANES)
    ) reducer (
        .data_in  (data_in),
        .mask     (execute_if.data.tmask),
        .op_type  (execute_if.data.op_type),
        .data_out (data_out)
    );

    // Fold the current packet's result into the running accumulator.
    // NOTE(review): both inputs are always merged (mask 2'b11), which presumes
    // every accepted packet has at least one active lane - confirm against
    // the dispatcher.
    VX_reduce_ext #(
        .DATAW_IN (`XLEN),
        .N        (2)
    ) accumulator_reducer (
        .data_in  ({accumulator, data_out}),
        .mask     (2'b11),
        .op_type  (execute_if.data.op_type),
        .data_out (reduced_accumulator)
    );

    // Remembers {tmask, pid, sop, eop} of every accepted packet so that the
    // broadcast phase can replay them in order.
    VX_elastic_buffer #(
        .DATAW (NUM_LANES + PID_WIDTH + 1 + 1),
        .SIZE  (NUM_PACKETS)
    ) tmask_pid_store (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (store_tmask_pid),
        `UNUSED_PIN(ready_in),
        .data_in   ({execute_if.data.tmask, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
        .data_out  ({stored_tmask, stored_pid, stored_sop, stored_eop}),
        .ready_out (read_tmask_pid),
        `UNUSED_PIN(valid_out)
    );

    // Response buffer in front of the commit interface. Instruction-level
    // fields come from the eop packet still held on execute_if; per-packet
    // fields come from the store above.
    VX_elastic_buffer #(
        .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + (`XLEN * NUM_LANES) + PID_WIDTH + 1 + 1)
    ) output_buffer (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (commit_if_valid),
        .ready_in  (commit_if_ready),
        .data_in   ({execute_if.data.uuid, execute_if.data.wid, stored_tmask, execute_if.data.PC, execute_if.data.wb, execute_if.data.rd, broadcasted_accumulator, stored_pid, stored_sop, stored_eop}),
        .data_out  ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.wb, commit_if.data.rd, commit_if.data.data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
        .ready_out (commit_if.ready),
        .valid_out (commit_if.valid)
    );
endmodule

View File

@@ -1,19 +1,23 @@
all:
$(MAKE) -C conform
$(MAKE) -C hello
$(MAKE) -C fibonacci
$(MAKE) -C fibonacci
$(MAKE) -C reductions
run-simx:
$(MAKE) -C conform run-simx
$(MAKE) -C hello run-simx
$(MAKE) -C fibonacci run-simx
$(MAKE) -C reductions run-simx
run-rtlsim:
$(MAKE) -C conform run-rtlsim
$(MAKE) -C hello run-rtlsim
$(MAKE) -C fibonacci run-rtlsim
$(MAKE) -C reductions run-rtlsim
clean:
$(MAKE) -C conform clean
$(MAKE) -C hello clean
$(MAKE) -C fibonacci clean
$(MAKE) -C reductions clean

View File

@@ -0,0 +1,5 @@
# Build description for the "reductions" program, built from main.cpp.
PROJECT = reductions
SRCS = main.cpp
# NOTE(review): build and run rules presumably come from the shared
# makefile one directory up - confirm.
include ../common.mk

View File

@@ -0,0 +1,216 @@
// Major opcode used by the reduction instructions emitted below.
#define RISCV_CUSTOM2 0x5B
// funct7 values of the reduction instructions: the low six bits select the
// operation, bit 6 selects the unsigned variant of add/min/max.
#define ADD_FUNC7 0b0000000
#define ADDU_FUNC7 0b1000000
#define MIN_FUNC7 0b0000001
#define MINU_FUNC7 0b1000001
#define MAX_FUNC7 0b0000010
#define MAXU_FUNC7 0b1000010
#define AND_FUNC7 0b0000011
#define OR_FUNC7 0b0000100
#define XOR_FUNC7 0b0000101
/*
Decoder mapping that the funct7 values above must match (copied from the
RTL decode logic for reference):
6'h0: begin
op_type = func7[6] ? `INST_RED_ADDU : `INST_RED_ADD;
end
6'h1: begin
op_type = func7[6] ? `INST_RED_MINU : `INST_RED_MIN;
end
6'h2: begin
op_type = func7[6] ? `INST_RED_MAXU : `INST_RED_MAX;
end
6'h3: begin
op_type = `INST_RED_AND;
end
6'h4: begin
op_type = `INST_RED_OR;
end
6'h5: begin
op_type = `INST_RED_XOR;
end
*/
#include <vx_intrinsics.h>
#include <stdio.h>
#include <vx_print.h>
// Per-thread inputs of the add-reduce test, indexed by thread id.
int x[4] = {3, 7, 2, 5};
// Result slot written by each test_* function and printed by main.
int y = -1;
// Emits the ADD reduction instruction (RISCV_CUSTOM2 opcode, ADD_FUNC7)
// with `v` as rs1 and returns the value the instruction writes to rd.
inline int vx_add_reduce(int v) {
    int reduced;
    asm volatile (".insn r %2, 0, %3, %0, %1, x0"
                  : "=r"(reduced)
                  : "r"(v), "i"(RISCV_CUSTOM2), "i"(ADD_FUNC7));
    return reduced;
}
// Emits the signed MIN reduction instruction (RISCV_CUSTOM2 opcode,
// MIN_FUNC7) with `v` as rs1 and returns the value written to rd.
inline int vx_min_reduce(int v) {
    int reduced;
    asm volatile (".insn r %2, 0, %3, %0, %1, x0"
                  : "=r"(reduced)
                  : "r"(v), "i"(RISCV_CUSTOM2), "i"(MIN_FUNC7));
    return reduced;
}
// Emits the unsigned MIN reduction instruction (RISCV_CUSTOM2 opcode,
// MINU_FUNC7) with `v` as rs1 and returns the value written to rd.
inline unsigned vx_minu_reduce(unsigned v) {
    unsigned reduced;
    asm volatile (".insn r %2, 0, %3, %0, %1, x0"
                  : "=r"(reduced)
                  : "r"(v), "i"(RISCV_CUSTOM2), "i"(MINU_FUNC7));
    return reduced;
}
// Emits the signed MAX reduction instruction (RISCV_CUSTOM2 opcode,
// MAX_FUNC7) with `v` as rs1 and returns the value written to rd.
inline int vx_max_reduce(int v) {
    int reduced;
    asm volatile (".insn r %2, 0, %3, %0, %1, x0"
                  : "=r"(reduced)
                  : "r"(v), "i"(RISCV_CUSTOM2), "i"(MAX_FUNC7));
    return reduced;
}
// Emits the unsigned MAX reduction instruction (RISCV_CUSTOM2 opcode,
// MAXU_FUNC7) with `v` as rs1 and returns the value written to rd.
inline unsigned vx_maxu_reduce(unsigned v) {
    unsigned reduced;
    asm volatile (".insn r %2, 0, %3, %0, %1, x0"
                  : "=r"(reduced)
                  : "r"(v), "i"(RISCV_CUSTOM2), "i"(MAXU_FUNC7));
    return reduced;
}
// Emits the bitwise AND reduction instruction (RISCV_CUSTOM2 opcode,
// AND_FUNC7) with `v` as rs1 and returns the value written to rd.
inline unsigned vx_and_reduce(unsigned v) {
    unsigned reduced;
    asm volatile (".insn r %2, 0, %3, %0, %1, x0"
                  : "=r"(reduced)
                  : "r"(v), "i"(RISCV_CUSTOM2), "i"(AND_FUNC7));
    return reduced;
}
// Emits the bitwise OR reduction instruction (RISCV_CUSTOM2 opcode,
// OR_FUNC7) with `v` as rs1 and returns the value written to rd.
inline unsigned vx_or_reduce(unsigned v) {
    unsigned reduced;
    asm volatile (".insn r %2, 0, %3, %0, %1, x0"
                  : "=r"(reduced)
                  : "r"(v), "i"(RISCV_CUSTOM2), "i"(OR_FUNC7));
    return reduced;
}
// Emits the bitwise XOR reduction instruction (RISCV_CUSTOM2 opcode,
// XOR_FUNC7) with `v` as rs1 and returns the value written to rd.
inline unsigned vx_xor_reduce(unsigned v) {
    unsigned reduced;
    asm volatile (".insn r %2, 0, %3, %0, %1, x0"
                  : "=r"(reduced)
                  : "r"(v), "i"(RISCV_CUSTOM2), "i"(XOR_FUNC7));
    return reduced;
}
// Add-reduces x[thread id] and stores the result in y.
// NOTE(review): vx_tmc(-1) presumably enables every thread and vx_tmc(1)
// only the first one; x has 4 entries, so this assumes at most 4 threads -
// confirm.
void test_add_reduce() {
    vx_tmc(-1);
    const int lane_value = x[vx_thread_id()];
    const int total = vx_add_reduce(lane_value);
    vx_tmc(1);
    y = total;
}
// Per-thread inputs of the min/max tests; two entries have the top bit set,
// so the signed and unsigned variants produce different results.
unsigned unsigned_vector[4] = {(unsigned)-1, 0, (unsigned)-2, 5};
// Signed min-reduces unsigned_vector[thread id] (reinterpreted as int) and
// stores the result in y.
// NOTE(review): assumes at most 4 threads are enabled by vx_tmc(-1) - confirm.
void test_min_reduce() {
    vx_tmc(-1);
    const int lane_value = (int)unsigned_vector[vx_thread_id()];
    const int lowest = vx_min_reduce(lane_value);
    vx_tmc(1);
    y = lowest;
}
// Signed max-reduces unsigned_vector[thread id] (reinterpreted as int) and
// stores the result in y.
// NOTE(review): assumes at most 4 threads are enabled by vx_tmc(-1) - confirm.
void test_max_reduce() {
    vx_tmc(-1);
    const int lane_value = (int)unsigned_vector[vx_thread_id()];
    const int highest = vx_max_reduce(lane_value);
    vx_tmc(1);
    y = highest;
}
// Unsigned min-reduces unsigned_vector[thread id] and stores the result
// (converted to int) in y.
// NOTE(review): assumes at most 4 threads are enabled by vx_tmc(-1) - confirm.
void test_minu_reduce() {
    vx_tmc(-1);
    const unsigned lane_value = unsigned_vector[vx_thread_id()];
    const unsigned lowest = vx_minu_reduce(lane_value);
    vx_tmc(1);
    y = (int)lowest;
}
// Unsigned max-reduces unsigned_vector[thread id] and stores the result
// (converted to int) in y.
// NOTE(review): assumes at most 4 threads are enabled by vx_tmc(-1) - confirm.
void test_maxu_reduce() {
    vx_tmc(-1);
    const unsigned lane_value = unsigned_vector[vx_thread_id()];
    const unsigned highest = vx_maxu_reduce(lane_value);
    vx_tmc(1);
    y = (int)highest;
}
// Per-thread 32-bit patterns used as inputs of the and/or/xor tests.
unsigned bit_vectors[4] = {0b11010110000111001100010100100110, 0b10010100011010001010000000001110, 0b10001001010111110001110000000010, 0b00010011010100101101110111001111};
// AND-reduces bit_vectors[thread id] and stores the result (converted to
// int) in y.
// NOTE(review): assumes at most 4 threads are enabled by vx_tmc(-1) - confirm.
void test_and_reduce() {
    vx_tmc(-1);
    const unsigned lane_bits = bit_vectors[vx_thread_id()];
    const unsigned merged = vx_and_reduce(lane_bits);
    vx_tmc(1);
    y = (int)merged;
}
// OR-reduces bit_vectors[thread id] and stores the result (converted to
// int) in y.
// NOTE(review): assumes at most 4 threads are enabled by vx_tmc(-1) - confirm.
void test_or_reduce() {
    vx_tmc(-1);
    const unsigned lane_bits = bit_vectors[vx_thread_id()];
    const unsigned merged = vx_or_reduce(lane_bits);
    vx_tmc(1);
    y = (int)merged;
}
// XOR-reduces bit_vectors[thread id] and stores the result (converted to
// int) in y.
// NOTE(review): assumes at most 4 threads are enabled by vx_tmc(-1) - confirm.
void test_xor_reduce() {
    vx_tmc(-1);
    const unsigned lane_bits = bit_vectors[vx_thread_id()];
    const unsigned merged = vx_xor_reduce(lane_bits);
    vx_tmc(1);
    y = (int)merged;
}
// Runs every reduction test, prints the hardware result next to a value
// computed in software from the same inputs, and counts the mismatches.
// Signed results are printed as signed decimals, unsigned min/max as
// unsigned decimals and bitwise results in hexadecimal.
//
// Returns the number of mismatching reductions (0 when all results match).
int main()
{
    int errors = 0;
    int expected;
    unsigned expected_u;

    test_add_reduce();
    expected = x[0] + x[1] + x[2] + x[3];
    vx_printf("add reduce result: %d\n", y);
    vx_printf("expected: %d\n", expected);
    errors += (y != expected);

    test_min_reduce();
    expected = MIN((int)unsigned_vector[0], MIN((int)unsigned_vector[1], MIN((int)unsigned_vector[2], (int)unsigned_vector[3])));
    vx_printf("min reduce result: %d\n", y);
    vx_printf("expected: %d\n", expected);
    errors += (y != expected);

    test_max_reduce();
    expected = MAX((int)unsigned_vector[0], MAX((int)unsigned_vector[1], MAX((int)unsigned_vector[2], (int)unsigned_vector[3])));
    vx_printf("max reduce result: %d\n", y);
    vx_printf("expected: %d\n", expected);
    errors += (y != expected);

    // y is an int slot; unsigned results are converted back before use.
    test_minu_reduce();
    expected_u = MIN(unsigned_vector[0], MIN(unsigned_vector[1], MIN(unsigned_vector[2], unsigned_vector[3])));
    vx_printf("minu reduce result: %u\n", (unsigned)y);
    vx_printf("expected: %u\n", expected_u);
    errors += ((unsigned)y != expected_u);

    test_maxu_reduce();
    expected_u = MAX(unsigned_vector[0], MAX(unsigned_vector[1], MAX(unsigned_vector[2], unsigned_vector[3])));
    vx_printf("maxu reduce result: %u\n", (unsigned)y);
    vx_printf("expected: %u\n", expected_u);
    errors += ((unsigned)y != expected_u);

    test_and_reduce();
    expected_u = bit_vectors[0] & bit_vectors[1] & bit_vectors[2] & bit_vectors[3];
    vx_printf("and reduce result: 0x%x\n", (unsigned)y);
    vx_printf("expected: 0x%x\n", expected_u);
    errors += ((unsigned)y != expected_u);

    test_or_reduce();
    expected_u = bit_vectors[0] | bit_vectors[1] | bit_vectors[2] | bit_vectors[3];
    vx_printf("or reduce result: 0x%x\n", (unsigned)y);
    vx_printf("expected: 0x%x\n", expected_u);
    errors += ((unsigned)y != expected_u);

    test_xor_reduce();
    expected_u = bit_vectors[0] ^ bit_vectors[1] ^ bit_vectors[2] ^ bit_vectors[3];
    vx_printf("xor reduce result: 0x%x\n", (unsigned)y);
    vx_printf("expected: 0x%x\n", expected_u);
    errors += ((unsigned)y != expected_u);

    return errors;
}