fixes: texture unit memory accesses sometimes going to shared memory (smem), bilinear texture filtering; new: cache req_id

This commit is contained in:
Blaise Tine
2021-11-24 00:00:17 -05:00
parent 1501360f4b
commit 18762dffce
70 changed files with 3818 additions and 1727 deletions

View File

@@ -28,7 +28,7 @@ echo "begin texture tests..."
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0" CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0"
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1"
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-irainbow.png -orainbow_result.png -g1" CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-irainbow.png -orainbow_result.png -g2"
echo "coverage texture done!" echo "coverage texture done!"
} }

View File

@@ -116,9 +116,11 @@ public:
} }
int start() { int start() {
// ensure prior run completed
if (future_.valid()) { if (future_.valid()) {
future_.wait(); // ensure prior run completed future_.wait();
} }
// start new run
simulator_.attach_ram(&ram_); simulator_.attach_ram(&ram_);
future_ = std::async(std::launch::async, [&]{ future_ = std::async(std::launch::async, [&]{
simulator_.reset(); simulator_.reset();
@@ -135,7 +137,8 @@ public:
uint64_t timeout_sec = timeout / 1000; uint64_t timeout_sec = timeout / 1000;
std::chrono::seconds wait_time(1); std::chrono::seconds wait_time(1);
for (;;) { for (;;) {
auto status = future_.wait_for(wait_time); // wait for 1 sec and check status // wait for 1 sec and check status
auto status = future_.wait_for(wait_time);
if (status == std::future_status::ready if (status == std::future_status::ready
|| 0 == timeout_sec--) || 0 == timeout_sec--)
break; break;

View File

@@ -3,8 +3,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <assert.h> #include <assert.h>
#include <iostream> #include <iostream>
#include <thread> #include <future>
#include <mutex>
#include <chrono> #include <chrono>
#include <vortex.h> #include <vortex.h>
@@ -60,18 +59,14 @@ class vx_device {
public: public:
vx_device() vx_device()
: arch_("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS) : arch_("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS)
, is_done_(false)
, is_running_(false)
, mem_allocation_(ALLOC_BASE_ADDR)
, thread_(__thread_proc__, this)
, ram_(RAM_PAGE_SIZE) , ram_(RAM_PAGE_SIZE)
, mem_allocation_(ALLOC_BASE_ADDR)
{} {}
~vx_device() { ~vx_device() {
mutex_.lock(); if (future_.valid()) {
is_done_ = true; future_.wait();
mutex_.unlock(); }
thread_.join();
} }
int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) { int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) {
@@ -115,72 +110,41 @@ public:
} }
int start() { int start() {
mutex_.lock(); // ensure prior run completed
if (future_.valid()) {
future_.wait();
}
// start new run
SimPlatform::instance().flush(); SimPlatform::instance().flush();
processor_ = std::make_shared<Processor>(arch_); processor_ = std::make_shared<Processor>(arch_);
processor_->attach_ram(&ram_); processor_->attach_ram(&ram_);
is_running_ = true; future_ = std::async(std::launch::async, [&]{
mutex_.unlock(); processor_->run();
});
return 0; return 0;
} }
int wait(uint64_t timeout) { int wait(uint64_t timeout) {
if (!future_.valid())
return 0;
uint64_t timeout_sec = timeout / 1000; uint64_t timeout_sec = timeout / 1000;
std::chrono::seconds wait_time(1);
for (;;) { for (;;) {
mutex_.lock(); // wait for 1 sec and check status
bool is_running = is_running_; auto status = future_.wait_for(wait_time);
mutex_.unlock(); if (status == std::future_status::ready
|| 0 == timeout_sec--)
if (!is_running || 0 == timeout_sec--)
break; break;
std::this_thread::sleep_for(std::chrono::seconds(1));
} }
return 0; return 0;
} }
private: private:
void thread_proc() {
std::cout << "Device ready..." << std::flush << std::endl;
for (;;) {
mutex_.lock();
bool is_done = is_done_;
bool is_running = is_running_;
mutex_.unlock();
if (is_done)
break;
if (is_running) {
std::cout << "Device running..." << std::flush << std::endl;
processor_->run();
mutex_.lock();
is_running_ = false;
mutex_.unlock();
std::cout << "Device ready..." << std::flush << std::endl;
}
}
std::cout << "Device shutdown..." << std::flush << std::endl;
}
static void __thread_proc__(vx_device* device) {
device->thread_proc();
}
ArchDef arch_; ArchDef arch_;
Processor::Ptr processor_;
bool is_done_;
bool is_running_;
uint64_t mem_allocation_;
std::thread thread_;
RAM ram_; RAM ram_;
std::mutex mutex_; Processor::Ptr processor_;
uint64_t mem_allocation_;
std::future<void> future_;
}; };
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////

View File

@@ -236,18 +236,30 @@
////////// Texture Units ////////////////////////////////////////////////////// ////////// Texture Units //////////////////////////////////////////////////////
`define NUM_TEX_UNITS 2 `define NUM_TEX_UNITS 2
`define TEX_SUBPIXEL_BITS 8
`define CSR_TEX_STATES 7 `define TEX_DIM_BITS 15
`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES) `define TEX_LOD_MAX `TEX_DIM_BITS
`define TEX_LOD_BITS 4
`define CSR_TEX_ADDR(x) (`CSR_TEX_BEGIN(x) + 12'h00) `define TEX_FXD_BITS 32
`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01) `define TEX_FXD_FRAC (`TEX_DIM_BITS+`TEX_SUBPIXEL_BITS)
`define CSR_TEX_WRAP(x) (`CSR_TEX_BEGIN(x) + 12'h02)
`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h03) `define TEX_STATE_ADDR 0
`define CSR_TEX_MIPOFF(x) (`CSR_TEX_BEGIN(x) + 12'h04) `define TEX_STATE_WIDTH 1
`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h05) `define TEX_STATE_HEIGHT 2
`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h06) `define TEX_STATE_FORMAT 3
`define TEX_STATE_FILTER 4
`define TEX_STATE_WRAPU 5
`define TEX_STATE_WRAPV 6
`define TEX_STATE_MIPOFF(lod) (7+(lod))
`define NUM_TEX_STATES (7+`TEX_LOD_MAX)
`define CSR_TEX(unit,state) (12'hFD0 + ((unit) * `NUM_TEX_STATES) + (state))
`define CSR_TEX_UNIT(csr) (((csr) - 12'hFD0) / `NUM_TEX_STATES)
`define CSR_TEX_STATE(csr) (((csr) - 12'hFD0) % `NUM_TEX_STATES)
// Pipeline Queues //////////////////////////////////////////////////////////// // Pipeline Queues ////////////////////////////////////////////////////////////
@@ -266,6 +278,11 @@
`define FPUQ_SIZE 8 `define FPUQ_SIZE 8
`endif `endif
// Texture Unit Request Queue
`ifndef TEXQ_SIZE
`define TEXQ_SIZE (`NUM_WARPS * 2)
`endif
// Icache Configurable Knobs ////////////////////////////////////////////////// // Icache Configurable Knobs //////////////////////////////////////////////////
// Size of cache in bytes // Size of cache in bytes

View File

@@ -50,35 +50,40 @@ module VX_csr_data #(
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr; reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr;
always @(posedge clk) begin always @(posedge clk) begin
`ifdef EXT_F_ENABLE
if (reset) begin if (reset) begin
fcsr <= '0; fcsr <= '0;
end end else begin
if (fpu_to_csr_if.write_enable) begin `ifdef EXT_F_ENABLE
fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] if (fpu_to_csr_if.write_enable) begin
| fpu_to_csr_if.write_fflags; fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
end | fpu_to_csr_if.write_fflags;
`endif end
if (write_enable) begin `endif
case (write_addr) if (write_enable) begin
`CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0]; case (write_addr)
`CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0]; `CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0];
`CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0]; `CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0];
`CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0]; `CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0];
`CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0]; `CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0];
`CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0]; `CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0];
`CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0]; `CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0];
`CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0]; `CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0];
`CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0]; `CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0];
`CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0]; `CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0];
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0]; `CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0];
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0]; `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0];
default: begin `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0];
`ASSERT(write_addr >= `CSR_TEX_BEGIN(0) default: begin
&& write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES), `ifdef EXT_TEX_ENABLE
("%t: invalid CSR write address: %0h", $time, write_addr)); `ASSERT(write_addr >= `CSR_TEX(0,0)
end && write_addr < `CSR_TEX(`NUM_TEX_UNITS, 0),
endcase ("%t: invalid CSR write address: %0h", $time, write_addr));
`else
`ASSERT(~write_enable, ("%t: invalid CSR write address: %0h", $time, write_addr));
`endif
end
endcase
end
end end
end end
@@ -217,11 +222,16 @@ module VX_csr_data #(
`CSR_MIMPID : read_data_r = `IMPLEMENTATION_ID; `CSR_MIMPID : read_data_r = `IMPLEMENTATION_ID;
default: begin default: begin
if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32)) if ((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32))
|| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32) || (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32))) begin
|| (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))))) begin read_addr_valid_r = 1;
end else
`ifdef EXT_TEX_ENABLE
if (read_addr >= `CSR_TEX(0,0) && read_addr < `CSR_TEX(`NUM_TEX_UNITS,0)) begin
read_addr_valid_r = 1;
end else
`endif
read_addr_valid_r = 0; read_addr_valid_r = 0;
end
end end
endcase endcase
end end

View File

@@ -214,9 +214,9 @@ module VX_decode #(
case (u_12) case (u_12)
12'h000: op_type = `INST_OP_BITS'(`INST_BR_ECALL); 12'h000: op_type = `INST_OP_BITS'(`INST_BR_ECALL);
12'h001: op_type = `INST_OP_BITS'(`INST_BR_EBREAK); 12'h001: op_type = `INST_OP_BITS'(`INST_BR_EBREAK);
12'h002: op_type = `INST_OP_BITS'(`INST_BR_URET);
12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET);
12'h302: op_type = `INST_OP_BITS'(`INST_BR_MRET); 12'h302: op_type = `INST_OP_BITS'(`INST_BR_MRET);
12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET);
12'h7B2: op_type = `INST_OP_BITS'(`INST_BR_DRET);
default:; default:;
endcase endcase
op_mod = 1; op_mod = 1;
@@ -347,7 +347,7 @@ module VX_decode #(
endcase endcase
end end
`endif `endif
`INST_GPU: begin `INST_GPGPU: begin
ex_type = `EX_GPU; ex_type = `EX_GPU;
case (func3) case (func3)
3'h0: begin 3'h0: begin
@@ -374,9 +374,21 @@ module VX_decode #(
is_wstall = 1; is_wstall = 1;
`USED_IREG (rs1); `USED_IREG (rs1);
`USED_IREG (rs2); `USED_IREG (rs2);
end end
`ifdef EXT_TEX_ENABLE
3'h5: begin 3'h5: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'(`INST_LSU_LW);
op_mod = `INST_MOD_BITS'(2);
`USED_IREG (rs1);
end
default:;
endcase
end
`INST_GPU: begin
case (func3)
`ifdef EXT_TEX_ENABLE
3'h0: begin
ex_type = `EX_GPU;
op_type = `INST_OP_BITS'(`INST_GPU_TEX); op_type = `INST_OP_BITS'(`INST_GPU_TEX);
op_mod = `INST_MOD_BITS'(func2); op_mod = `INST_MOD_BITS'(func2);
use_rd = 1; use_rd = 1;
@@ -386,12 +398,6 @@ module VX_decode #(
`USED_IREG (rs3); `USED_IREG (rs3);
end end
`endif `endif
3'h6: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'(`INST_LSU_LW);
op_mod = `INST_MOD_BITS'(2);
`USED_IREG (rs1);
end
default:; default:;
endcase endcase
end end

View File

@@ -66,7 +66,8 @@
`define INST_FNMADD 7'b1001111 `define INST_FNMADD 7'b1001111
`define INST_FCI 7'b1010011 // float common instructions `define INST_FCI 7'b1010011 // float common instructions
`define INST_GPU 7'b1101011 `define INST_GPGPU 7'b1101011
`define INST_GPU 7'b1011011
`define INST_TEX 7'b0101011 `define INST_TEX 7'b0101011
@@ -117,9 +118,9 @@
`define INST_BR_JALR 4'b1001 `define INST_BR_JALR 4'b1001
`define INST_BR_ECALL 4'b1010 `define INST_BR_ECALL 4'b1010
`define INST_BR_EBREAK 4'b1011 `define INST_BR_EBREAK 4'b1011
`define INST_BR_MRET 4'b1100 `define INST_BR_URET 4'b1100
`define INST_BR_SRET 4'b1101 `define INST_BR_SRET 4'b1101
`define INST_BR_DRET 4'b1110 `define INST_BR_MRET 4'b1110
`define INST_BR_OTHER 4'b1111 `define INST_BR_OTHER 4'b1111
`define INST_BR_BITS 4 `define INST_BR_BITS 4
`define INST_BR_NEG(x) x[1] `define INST_BR_NEG(x) x[1]
@@ -185,14 +186,14 @@
`define INST_FPU_NMADD 4'hF `define INST_FPU_NMADD 4'hF
`define INST_FPU_BITS 4 `define INST_FPU_BITS 4
`define INST_GPU_TMC 3'h0 `define INST_GPU_TMC 4'h0
`define INST_GPU_WSPAWN 3'h1 `define INST_GPU_WSPAWN 4'h1
`define INST_GPU_SPLIT 3'h2 `define INST_GPU_SPLIT 4'h2
`define INST_GPU_JOIN 3'h3 `define INST_GPU_JOIN 4'h3
`define INST_GPU_BAR 3'h4 `define INST_GPU_BAR 4'h4
`define INST_GPU_PRED 3'h5 `define INST_GPU_PRED 4'h5
`define INST_GPU_TEX 3'h6 `define INST_GPU_TEX 4'h6
`define INST_GPU_BITS 3 `define INST_GPU_BITS 4
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@@ -237,11 +238,9 @@
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`ifdef DBG_CACHE_REQ_INFO // wid PC // cache request identifier
`define DBG_CACHE_REQ_MDATAW (`NW_BITS + 32) `define DBG_CACHE_REQ_IDW 48
`else `define DBG_CACHE_REQ_ID(type, ctr) {4'(type), {`DBG_CACHE_REQ_IDW-4{1'b0}}} + ctr
`define DBG_CACHE_REQ_MDATAW 0
`endif
// non-cacheable tag bits // non-cacheable tag bits
`define NC_TAG_BIT 1 `define NC_TAG_BIT 1
@@ -249,6 +248,9 @@
// texture tag bits // texture tag bits
`define TEX_TAG_BIT 1 `define TEX_TAG_BIT 1
// cache address type bits
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BIT + `SM_ENABLE)
////////////////////////// Icache Configurable Knobs ////////////////////////// ////////////////////////// Icache Configurable Knobs //////////////////////////
// Cache ID // Cache ID
@@ -264,7 +266,7 @@
`define ICACHE_CORE_TAG_ID_BITS `NW_BITS `define ICACHE_CORE_TAG_ID_BITS `NW_BITS
// Core request tag bits // Core request tag bits
`define ICACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICACHE_CORE_TAG_ID_BITS) `define ICACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_IDW + `ICACHE_CORE_TAG_ID_BITS)
// Memory request data bits // Memory request data bits
`define ICACHE_MEM_DATA_WIDTH (`ICACHE_LINE_SIZE * 8) `define ICACHE_MEM_DATA_WIDTH (`ICACHE_LINE_SIZE * 8)
@@ -289,17 +291,14 @@
// Core request tag bits // Core request tag bits
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE) `define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
`ifdef EXT_TEX_ENABLE `ifdef EXT_TEX_ENABLE
`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE) `define LSU_TAG_ID_BITS `MAX(`LSUQ_ADDR_BITS, 2)
`define TEX_TAG_ID_BITS (2) `define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_IDW + `LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS)
`define LSU_TEX_TAG_ID_BITS `MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS) `define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS + `TEX_TAG_BIT)
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TEX_TAG_ID_BITS + `TEX_TAG_BIT)
`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS)
`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS)
`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TEX_TAG_ID_BITS)
`else `else
`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE) `define LSU_TAG_ID_BITS `LSUQ_ADDR_BITS
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS)
`endif `endif
`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS) `define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_IDW + `DCACHE_CORE_TAG_ID_BITS)
// Memory request data bits // Memory request data bits
`define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8) `define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8)

View File

@@ -52,51 +52,29 @@ module VX_execute #(
VX_dcache_req_if #( VX_dcache_req_if #(
.NUM_REQS (`NUM_THREADS), .NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4), .WORD_SIZE (4),
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS) .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
) lsu_dcache_req_if(); ) lsu_dcache_req_if();
VX_dcache_rsp_if #( VX_dcache_rsp_if #(
.NUM_REQS (`NUM_THREADS), .NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4), .WORD_SIZE (4),
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS) .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
) lsu_dcache_rsp_if(); ) lsu_dcache_rsp_if();
VX_dcache_req_if #( VX_dcache_req_if #(
.NUM_REQS (`NUM_THREADS), .NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4), .WORD_SIZE (4),
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS) .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
) tex_dcache_req_if(); ) tex_dcache_req_if();
VX_dcache_rsp_if #( VX_dcache_rsp_if #(
.NUM_REQS (`NUM_THREADS), .NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4), .WORD_SIZE (4),
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS) .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
) tex_dcache_rsp_if(); ) tex_dcache_rsp_if();
VX_tex_csr_if tex_csr_if(); VX_tex_csr_if tex_csr_if();
wire [`NUM_THREADS-1:0][`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_in, lsu_tag_in;
wire [`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_out, lsu_tag_out;
`UNUSED_VAR (tex_tag_out)
`UNUSED_VAR (lsu_tag_out)
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign tex_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(tex_dcache_req_if.tag[i][`TEX_TAG_ID_BITS-1:0]);
assign lsu_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(lsu_dcache_req_if.tag[i][`LSU_TAG_ID_BITS-1:0]);
`ifdef DBG_CACHE_REQ_INFO
assign tex_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = tex_dcache_req_if.tag[i][`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS];
assign lsu_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = lsu_dcache_req_if.tag[i][`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS];
`endif
end
assign tex_dcache_rsp_if.tag[`TEX_TAG_ID_BITS-1:0] = tex_tag_out[`TEX_TAG_ID_BITS-1:0];
assign lsu_dcache_rsp_if.tag[`LSU_TAG_ID_BITS-1:0] = lsu_tag_out[`LSU_TAG_ID_BITS-1:0];
`ifdef DBG_CACHE_REQ_INFO
assign tex_dcache_rsp_if.tag[`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS] = tex_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS];
assign lsu_dcache_rsp_if.tag[`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS] = lsu_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS];
`endif
VX_cache_arb #( VX_cache_arb #(
.NUM_REQS (2), .NUM_REQS (2),
.LANES (`NUM_THREADS), .LANES (`NUM_THREADS),
@@ -113,7 +91,7 @@ module VX_execute #(
.req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}), .req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}),
.req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}), .req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}),
.req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}), .req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}),
.req_tag_in ({tex_tag_in, lsu_tag_in}), .req_tag_in ({tex_dcache_req_if.tag, lsu_dcache_req_if.tag}),
.req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}), .req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}),
// Dcache request // Dcache request
@@ -136,7 +114,7 @@ module VX_execute #(
.rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}), .rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}),
.rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}), .rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}),
.rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}), .rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}),
.rsp_tag_out ({tex_tag_out, lsu_tag_out}), .rsp_tag_out ({tex_dcache_rsp_if.tag, lsu_dcache_rsp_if.tag}),
.rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready}) .rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready})
); );

View File

@@ -24,10 +24,17 @@ module VX_icache_stage #(
localparam OUT_REG = 0; localparam OUT_REG = 0;
reg [`DBG_CACHE_REQ_IDW-1:0] req_id;
wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id;
wire [`NW_BITS-1:0] req_tag, rsp_tag;
`UNUSED_VAR (rsp_req_id)
wire icache_req_fire = icache_req_if.valid && icache_req_if.ready; wire icache_req_fire = icache_req_if.valid && icache_req_if.ready;
wire [`NW_BITS-1:0] req_tag = ifetch_req_if.wid; assign req_tag = ifetch_req_if.wid;
wire [`NW_BITS-1:0] rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0];
assign rsp_req_id = icache_rsp_if.tag[`NW_BITS +: `DBG_CACHE_REQ_IDW];
wire [31:0] rsp_PC; wire [31:0] rsp_PC;
wire [`NUM_THREADS-1:0] rsp_tmask; wire [`NUM_THREADS-1:0] rsp_tmask;
@@ -51,16 +58,21 @@ module VX_icache_stage #(
// Icache Request // Icache Request
assign icache_req_if.valid = ifetch_req_if.valid; assign icache_req_if.valid = ifetch_req_if.valid;
assign icache_req_if.addr = ifetch_req_if.PC[31:2]; assign icache_req_if.addr = ifetch_req_if.PC[31:2];
assign icache_req_if.tag = {req_id, req_tag};
always @(posedge clk) begin
if (reset) begin
req_id <= `DBG_CACHE_REQ_ID(0, 0);
end else begin
if (icache_req_fire) begin
req_id <= req_id + 1;
end
end
end
// Can accept new request? // Can accept new request?
assign ifetch_req_if.ready = icache_req_if.ready; assign ifetch_req_if.ready = icache_req_if.ready;
`ifdef DBG_CACHE_REQ_INFO
assign icache_req_if.tag = {ifetch_req_if.wid, ifetch_req_if.PC, req_tag};
`else
assign icache_req_if.tag = req_tag;
`endif
wire [`NW_BITS-1:0] rsp_wid = rsp_tag; wire [`NW_BITS-1:0] rsp_wid = rsp_tag;
wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid); wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid);
@@ -90,11 +102,11 @@ module VX_icache_stage #(
`ifdef DBG_TRACE_CORE_ICACHE `ifdef DBG_TRACE_CORE_ICACHE
always @(posedge clk) begin always @(posedge clk) begin
if (icache_req_if.valid && icache_req_if.ready) begin if (icache_req_fire) begin
dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC); dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h, req_id=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, req_id);
end end
if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin
dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data); dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, req_id=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, rsp_req_id, ifetch_rsp_if.data);
end end
end end
`endif `endif

View File

@@ -24,8 +24,6 @@ module VX_lsu_unit #(
localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE); localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE);
localparam ADDR_TYPEW = `NC_TAG_BIT + `SM_ENABLE;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) `STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) `STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
`STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter")) `STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter"))
@@ -44,7 +42,7 @@ module VX_lsu_unit #(
wire mbuf_empty; wire mbuf_empty;
wire [`NUM_THREADS-1:0][ADDR_TYPEW-1:0] lsu_addr_type, req_addr_type; wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type, req_addr_type;
wire [`NUM_THREADS-1:0][31:0] full_addr; wire [`NUM_THREADS-1:0][31:0] full_addr;
for (genvar i = 0; i < `NUM_THREADS; i++) begin for (genvar i = 0; i < `NUM_THREADS; i++) begin
@@ -83,7 +81,7 @@ module VX_lsu_unit #(
wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch; wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch;
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), .DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
.RESETW (1) .RESETW (1)
) req_pipe_reg ( ) req_pipe_reg (
.clk (clk), .clk (clk),
@@ -104,19 +102,22 @@ module VX_lsu_unit #(
wire rsp_is_dup; wire rsp_is_dup;
wire rsp_is_prefetch; wire rsp_is_prefetch;
`UNUSED_VAR (rsp_type)
`UNUSED_VAR (rsp_is_prefetch)
reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask; reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask;
wire [`NUM_THREADS-1:0] rsp_rem_mask_n; wire [`NUM_THREADS-1:0] rsp_rem_mask_n;
wire [`NUM_THREADS-1:0] rsp_tmask; wire [`NUM_THREADS-1:0] rsp_tmask;
reg [`DBG_CACHE_REQ_IDW-1:0] req_id;
wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id;
reg [`NUM_THREADS-1:0] req_sent_mask; reg [`NUM_THREADS-1:0] req_sent_mask;
reg is_req_start; reg is_req_start;
wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr; wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr;
wire mbuf_full; wire mbuf_full;
`UNUSED_VAR (rsp_type)
`UNUSED_VAR (rsp_is_prefetch)
`UNUSED_VAR (rsp_req_id)
wire [`NUM_THREADS-1:0][REQ_ASHIFT-1:0] req_offset, rsp_offset; wire [`NUM_THREADS-1:0][REQ_ASHIFT-1:0] req_offset, rsp_offset;
for (genvar i = 0; i < `NUM_THREADS; i++) begin for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign req_offset[i] = req_addr[i][1:0]; assign req_offset[i] = req_addr[i][1:0];
@@ -124,6 +125,8 @@ module VX_lsu_unit #(
wire [`NUM_THREADS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready; wire [`NUM_THREADS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready;
wire dcache_req_fire_any = (| dcache_req_fire);
wire dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready; wire dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;
wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1}; wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1};
@@ -135,7 +138,8 @@ module VX_lsu_unit #(
wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n); wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n);
assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS]; assign mbuf_raddr = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: `LSUQ_ADDR_BITS];
assign rsp_req_id = dcache_rsp_if.tag[(`CACHE_ADDR_TYPE_BITS + `LSU_TAG_ID_BITS) +: `DBG_CACHE_REQ_IDW];
`UNUSED_VAR (dcache_rsp_if.tag) `UNUSED_VAR (dcache_rsp_if.tag)
// do not writeback from software prefetch // do not writeback from software prefetch
@@ -214,7 +218,7 @@ module VX_lsu_unit #(
0: mem_req_byteen[req_offset[i]] = 1; 0: mem_req_byteen[req_offset[i]] = 1;
1: begin 1: begin
mem_req_byteen[req_offset[i]] = 1; mem_req_byteen[req_offset[i]] = 1;
mem_req_byteen[{req_addr[i][1], 1'b1}] = 1; mem_req_byteen[{req_offset[i][1], 1'b1}] = 1;
end end
default : mem_req_byteen = {4{1'b1}}; default : mem_req_byteen = {4{1'b1}};
endcase endcase
@@ -235,12 +239,17 @@ module VX_lsu_unit #(
assign dcache_req_if.addr[i] = req_addr[i][31:2]; assign dcache_req_if.addr[i] = req_addr[i][31:2];
assign dcache_req_if.byteen[i] = mem_req_byteen; assign dcache_req_if.byteen[i] = mem_req_byteen;
assign dcache_req_if.data[i] = mem_req_data; assign dcache_req_if.data[i] = mem_req_data;
assign dcache_req_if.tag[i] = {req_id, `LSU_TAG_ID_BITS'(req_tag), req_addr_type[i]};
end
`ifdef DBG_CACHE_REQ_INFO always @(posedge clk) begin
assign dcache_req_if.tag[i] = {req_wid, req_pc, req_tag, req_addr_type[i]}; if (reset) begin
`else req_id <= `DBG_CACHE_REQ_ID(1, 0);
assign dcache_req_if.tag[i] = {req_tag, req_addr_type[i]}; end else begin
`endif if (dcache_req_fire_any) begin
req_id <= req_id + 1;
end
end
end end
assign ready_in = req_dep_ready && dcache_req_ready; assign ready_in = req_dep_ready && dcache_req_ready;
@@ -339,22 +348,21 @@ module VX_lsu_unit #(
`endif `endif
`ifdef DBG_TRACE_CORE_DCACHE `ifdef DBG_TRACE_CORE_DCACHE
wire dcache_req_fire_any = (| dcache_req_fire);
always @(posedge clk) begin always @(posedge clk) begin
if (lsu_req_if.valid && fence_wait) begin if (lsu_req_if.valid && fence_wait) begin
dpi_trace("%d: *** D$%0d fence wait\n", $time, CORE_ID); dpi_trace("%d: *** D$%0d fence wait\n", $time, CORE_ID);
end end
if (dcache_req_fire_any) begin if (dcache_req_fire_any) begin
if (dcache_req_if.rw[0]) begin if (dcache_req_if.rw[0]) begin
dpi_trace("%d: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire); dpi_trace("%d: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire, req_id);
`TRACE_ARRAY1D(req_addr, `NUM_THREADS); `TRACE_ARRAY1D(req_addr, `NUM_THREADS);
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
dpi_trace(", data="); dpi_trace(", data=");
`TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS); `TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
dpi_trace("\n"); dpi_trace(", req_id=%0h\n", req_id);
end else begin end else begin
dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire); dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire, req_id);
`TRACE_ARRAY1D(req_addr, `NUM_THREADS); `TRACE_ARRAY1D(req_addr, `NUM_THREADS);
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
@@ -362,8 +370,8 @@ module VX_lsu_unit #(
end end
end end
if (dcache_rsp_fire) begin if (dcache_rsp_fire) begin
dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=", dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, rd=%0d, data=",
$time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd); $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, rsp_req_id, mbuf_raddr, rsp_rd);
`TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); `TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
dpi_trace(", is_dup=%b\n", rsp_is_dup); dpi_trace(", is_dup=%b\n", rsp_is_dup);
end end

View File

@@ -33,9 +33,6 @@ module VX_bank #(
// core request tag size // core request tag size
parameter CORE_TAG_WIDTH = 1, parameter CORE_TAG_WIDTH = 1,
// size of tag id in core request tag
parameter CORE_TAG_ID_BITS = 0,
// bank offset from beginning of index range // bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = 0, parameter BANK_ADDR_OFFSET = 0,
@@ -96,14 +93,9 @@ module VX_bank #(
input wire [`LINE_SELECT_BITS-1:0] flush_addr input wire [`LINE_SELECT_BITS-1:0] flush_addr
); );
`UNUSED_PARAM (CORE_TAG_ID_BITS)
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_UNUSED_BEGIN `IGNORE_UNUSED_BEGIN
wire [31:0] debug_pc_sel, debug_pc_st0, debug_pc_st1; wire [`DBG_CACHE_REQ_IDW-1:0] req_id_sel, req_id_st0, req_id_st1;
wire [`NW_BITS-1:0] debug_wid_sel, debug_wid_st0, debug_wid_st1;
`IGNORE_UNUSED_END `IGNORE_UNUSED_END
`endif
wire [NUM_PORTS-1:0] creq_pmask; wire [NUM_PORTS-1:0] creq_pmask;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] creq_wsel; wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] creq_wsel;
@@ -197,13 +189,7 @@ module VX_bank #(
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire creq_fire = creq_valid && creq_ready; wire creq_fire = creq_valid && creq_ready;
`ifdef DBG_CACHE_REQ_INFO assign req_id_sel = mshr_enable ? mshr_tag[0][`CACHE_REQ_ID_RNG] : creq_tag[0][`CACHE_REQ_ID_RNG];
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_wid_sel, debug_pc_sel} = mshr_enable ? mshr_tag[0][`CACHE_REQ_INFO_RNG] : creq_tag[0][`CACHE_REQ_INFO_RNG];
end else begin
assign {debug_wid_sel, debug_pc_sel} = 0;
end
`endif
wire [`CACHE_LINE_WIDTH-1:0] wdata_sel; wire [`CACHE_LINE_WIDTH-1:0] wdata_sel;
assign wdata_sel[(NUM_PORTS * `WORD_WIDTH)-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[(NUM_PORTS * `WORD_WIDTH)-1:0] : creq_data; assign wdata_sel[(NUM_PORTS * `WORD_WIDTH)-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[(NUM_PORTS * `WORD_WIDTH)-1:0] : creq_data;
@@ -237,13 +223,7 @@ module VX_bank #(
.data_out ({valid_st0, is_flush_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0}) .data_out ({valid_st0, is_flush_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0})
); );
`ifdef DBG_CACHE_REQ_INFO assign req_id_st0 = tag_st0[0][`CACHE_REQ_ID_RNG];
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_wid_st0, debug_pc_st0} = tag_st0[0][`CACHE_REQ_INFO_RNG];
end else begin
assign {debug_wid_st0, debug_pc_st0} = 0;
end
`endif
wire do_fill_st0 = valid_st0 && is_fill_st0; wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_flush_st0 = valid_st0 && is_flush_st0; wire do_flush_st0 = valid_st0 && is_flush_st0;
@@ -263,11 +243,9 @@ module VX_bank #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
`ifdef DBG_CACHE_REQ_INFO .req_id (req_id_st0),
.debug_pc (debug_pc_st0),
.debug_wid (debug_wid_st0), .stall (crsq_stall),
`endif
.stall (crsq_stall),
// read/Fill // read/Fill
.lookup (do_lookup_st0), .lookup (do_lookup_st0),
@@ -293,13 +271,7 @@ module VX_bank #(
.data_out ({valid_st1, is_mshr_st1, is_fill_st1, is_read_st1, is_write_st1, miss_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1}) .data_out ({valid_st1, is_mshr_st1, is_fill_st1, is_read_st1, is_write_st1, miss_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1})
); );
`ifdef DBG_CACHE_REQ_INFO assign req_id_st1 = tag_st1[0][`CACHE_REQ_ID_RNG];
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_wid_st1, debug_pc_st1} = tag_st1[0][`CACHE_REQ_INFO_RNG];
end else begin
assign {debug_wid_st1, debug_pc_st1} = 0;
end
`endif
wire do_read_st0 = valid_st0 && is_read_st0; wire do_read_st0 = valid_st0 && is_read_st0;
wire do_read_st1 = valid_st1 && is_read_st1; wire do_read_st1 = valid_st1 && is_read_st1;
@@ -323,10 +295,8 @@ module VX_bank #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
`ifdef DBG_CACHE_REQ_INFO .req_id (req_id_st1),
.debug_pc (debug_pc_st1),
.debug_wid (debug_wid_st1),
`endif
.stall (crsq_stall), .stall (crsq_stall),
.read (do_read_st1 || do_mshr_st1), .read (do_read_st1 || do_mshr_st1),
@@ -372,14 +342,9 @@ module VX_bank #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
`ifdef DBG_CACHE_REQ_INFO .deq_req_id (req_id_sel),
.deq_debug_pc (debug_pc_sel), .lkp_req_id (req_id_st0),
.deq_debug_wid (debug_wid_sel), .rel_req_id (req_id_st1),
.lkp_debug_pc (debug_pc_st0),
.lkp_debug_wid (debug_wid_st0),
.rel_debug_pc (debug_pc_st1),
.rel_debug_wid (debug_wid_st1),
`endif
// allocate // allocate
.allocate_valid (mshr_allocate), .allocate_valid (mshr_allocate),
@@ -525,22 +490,22 @@ module VX_bank #(
dpi_trace("%d: cache%0d:%0d fill-rsp: addr=%0h, id=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data); dpi_trace("%d: cache%0d:%0d fill-rsp: addr=%0h, id=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data);
end end
if (mshr_fire) begin if (mshr_fire) begin
dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, debug_wid_sel, debug_pc_sel); dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, req_id_sel);
end end
if (creq_fire) begin if (creq_fire) begin
if (creq_rw) if (creq_rw)
dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel); dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, req_id_sel);
else else
dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel); dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, req_id_sel);
end end
if (crsq_fire) begin if (crsq_fire) begin
dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1); dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, req_id_st1);
end end
if (mreq_push) begin if (mreq_push) begin
if (is_write_st1) if (is_write_st1)
dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1); dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, req_id_st1);
else else
dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, debug_wid_st1, debug_pc_st1); dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, req_id_st1);
end end
end end
`endif `endif

View File

@@ -580,8 +580,7 @@ module VX_cache #(
.MSHR_SIZE (MSHR_SIZE), .MSHR_SIZE (MSHR_SIZE),
.MREQ_SIZE (MREQ_SIZE), .MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE), .WRITE_ENABLE (WRITE_ENABLE),
.CORE_TAG_WIDTH (CORE_TAG_X_WIDTH), .CORE_TAG_WIDTH (CORE_TAG_X_WIDTH),
.CORE_TAG_ID_BITS (CORE_TAG_ID_X_BITS),
.BANK_ADDR_OFFSET (BANK_ADDR_OFFSET) .BANK_ADDR_OFFSET (BANK_ADDR_OFFSET)
) bank ( ) bank (
`SCOPE_BIND_VX_cache_bank(i) `SCOPE_BIND_VX_cache_bank(i)

View File

@@ -3,9 +3,8 @@
`include "VX_platform.vh" `include "VX_platform.vh"
`ifdef DBG_CACHE_REQ_INFO // cache request identifier
`include "VX_define.vh" `define DBG_CACHE_REQ_IDW 48
`endif
`define REQS_BITS `LOG2UP(NUM_REQS) `define REQS_BITS `LOG2UP(NUM_REQS)
@@ -52,7 +51,7 @@
`define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS] `define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS]
`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_MDATAW) `define CACHE_REQ_ID_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_IDW)
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////

View File

@@ -21,12 +21,9 @@ module VX_data_access #(
input wire clk, input wire clk,
input wire reset, input wire reset,
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_UNUSED_BEGIN `IGNORE_UNUSED_BEGIN
input wire[31:0] debug_pc, input wire[`DBG_CACHE_REQ_IDW-1:0] req_id,
input wire[`NW_BITS-1:0] debug_wid,
`IGNORE_UNUSED_END `IGNORE_UNUSED_END
`endif
input wire stall, input wire stall,
@@ -125,10 +122,10 @@ module VX_data_access #(
dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data); dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data);
end end
if (read && ~stall) begin if (read && ~stall) begin
dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, read_data); dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, req_id=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, line_addr, read_data);
end end
if (write && ~stall) begin if (write && ~stall) begin
dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, wid=%0d, PC=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, byteen, line_addr, write_data); dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, req_id=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, byteen, line_addr, write_data);
end end
end end
`endif `endif

View File

@@ -25,16 +25,11 @@ module VX_miss_resrv #(
input wire clk, input wire clk,
input wire reset, input wire reset,
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_UNUSED_BEGIN `IGNORE_UNUSED_BEGIN
input wire[31:0] deq_debug_pc, input wire[`DBG_CACHE_REQ_IDW-1:0] deq_req_id,
input wire[`NW_BITS-1:0] deq_debug_wid, input wire[`DBG_CACHE_REQ_IDW-1:0] lkp_req_id,
input wire[31:0] lkp_debug_pc, input wire[`DBG_CACHE_REQ_IDW-1:0] rel_req_id,
input wire[`NW_BITS-1:0] lkp_debug_wid,
input wire[31:0] rel_debug_pc,
input wire[`NW_BITS-1:0] rel_debug_wid,
`IGNORE_UNUSED_END `IGNORE_UNUSED_END
`endif
// allocate // allocate
input wire allocate_valid, input wire allocate_valid,
@@ -206,23 +201,22 @@ module VX_miss_resrv #(
always @(posedge clk) begin always @(posedge clk) begin
if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin
if (allocate_fire) if (allocate_fire)
dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_debug_wid, deq_debug_pc); `LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_req_id);
if (fill_valid) if (fill_valid)
dpi_trace("%d: cache%0d:%0d mshr-fill: addr=%0h, id=%0d, addr=%0h\n", $time, CACHE_ID, BANK_ID, dpi_trace("%d: cache%0d:%0d mshr-fill: addr=%0h, id=%0d, addr=%0h\n", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id, `LINE_TO_BYTE_ADDR(fill_addr, BANK_ID)); `LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id, `LINE_TO_BYTE_ADDR(fill_addr, BANK_ID));
if (dequeue_fire) if (dequeue_fire)
dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_debug_wid, deq_debug_pc); `LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_id);
if (lookup_replay) if (lookup_replay)
dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d\n", $time, CACHE_ID, BANK_ID, dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d\n", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id); `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id);
if (lookup_valid) if (lookup_valid)
dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b, req_id=%0h\n", $time, CACHE_ID, BANK_ID,
`LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_debug_wid, lkp_debug_pc); `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_req_id);
if (release_valid) if (release_valid)
dpi_trace("%d: cache%0d:%0d mshr-release id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, dpi_trace("%d: cache%0d:%0d mshr-release id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, release_id, rel_req_id);
release_id, rel_debug_wid, rel_debug_pc);
dpi_trace("%d: cache%0d:%0d mshr-table", $time, CACHE_ID, BANK_ID); dpi_trace("%d: cache%0d:%0d mshr-table", $time, CACHE_ID, BANK_ID);
for (integer i = 0; i < MSHR_SIZE; ++i) begin for (integer i = 0; i < MSHR_SIZE; ++i) begin
if (valid_table[i]) begin if (valid_table[i]) begin

View File

@@ -254,22 +254,19 @@ module VX_shared_mem #(
.ready_out (core_rsp_ready) .ready_out (core_rsp_ready)
); );
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_UNUSED_BEGIN `IGNORE_UNUSED_BEGIN
wire [NUM_BANKS-1:0][31:0] debug_pc_st0, debug_pc_st1; wire [NUM_BANKS-1:0][`DBG_CACHE_REQ_IDW-1:0] req_id_st0, req_id_st1;
wire [NUM_BANKS-1:0][`NW_BITS-1:0] debug_wid_st0, debug_wid_st1;
`IGNORE_UNUSED_END `IGNORE_UNUSED_END
for (genvar i = 0; i < NUM_BANKS; ++i) begin for (genvar i = 0; i < NUM_BANKS; ++i) begin
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_wid_st0[i], debug_pc_st0[i]} = per_bank_core_req_tag_unqual[i][`CACHE_REQ_INFO_RNG]; assign req_id_st0[i] = per_bank_core_req_tag_unqual[i][`CACHE_REQ_ID_RNG];
assign {debug_wid_st1[i], debug_pc_st1[i]} = per_bank_core_req_tag[i][`CACHE_REQ_INFO_RNG]; assign req_id_st1[i] = per_bank_core_req_tag[i][`CACHE_REQ_ID_RNG];
end else begin end else begin
assign {debug_wid_st0[i], debug_pc_st0[i]} = 0; assign req_id_st0[i] = 0;
assign {debug_wid_st1[i], debug_pc_st1[i]} = 0; assign req_id_st1[i] = 0;
end end
end end
`endif
`ifdef DBG_TRACE_CACHE_BANK `ifdef DBG_TRACE_CACHE_BANK
@@ -309,11 +306,11 @@ module VX_shared_mem #(
for (integer i = 0; i < NUM_BANKS; ++i) begin for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid_unqual[i]) begin if (per_bank_core_req_valid_unqual[i]) begin
if (per_bank_core_req_rw_unqual[i]) begin if (per_bank_core_req_rw_unqual[i]) begin
dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", dpi_trace("%d: smem%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, req_id=%0h\n",
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], debug_wid_st0[i], debug_pc_st0[i]); $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], req_id_st0[i]);
end else begin end else begin
dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h\n", dpi_trace("%d: smem%0d:%0d core-rd-req: addr=%0h, tag=%0h, req_id=%0h\n",
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], debug_wid_st0[i], debug_pc_st0[i]); $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], req_id_st0[i]);
end end
end end
end end
@@ -322,11 +319,11 @@ module VX_shared_mem #(
for (integer i = 0; i < NUM_BANKS; ++i) begin for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid[i]) begin if (per_bank_core_req_valid[i]) begin
if (per_bank_core_req_rw[i]) begin if (per_bank_core_req_rw[i]) begin
dpi_trace("%d: cache%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", dpi_trace("%d: smem%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, data=%0h, req_id=%0h\n",
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_req_data[i], debug_wid_st1[i], debug_pc_st1[i]); $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_data[i], req_id_st1[i]);
end else begin end else begin
dpi_trace("%d: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", dpi_trace("%d: smem%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, data=%0h, req_id=%0h\n",
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_rsp_data[i], debug_wid_st1[i], debug_pc_st1[i]); $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_rsp_data[i], req_id_st1[i]);
end end
end end
end end

View File

@@ -17,12 +17,9 @@ module VX_tag_access #(
input wire clk, input wire clk,
input wire reset, input wire reset,
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_UNUSED_BEGIN `IGNORE_UNUSED_BEGIN
input wire[31:0] debug_pc, input wire[`DBG_CACHE_REQ_IDW-1:0] req_id,
input wire[`NW_BITS-1:0] debug_wid,
`IGNORE_UNUSED_END `IGNORE_UNUSED_END
`endif
input wire stall, input wire stall,
@@ -71,9 +68,9 @@ module VX_tag_access #(
end end
if (lookup && ~stall) begin if (lookup && ~stall) begin
if (tag_match) begin if (tag_match) begin
dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag); dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, req_id=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, line_addr, line_tag);
end else begin end else begin
dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag, read_tag); dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, req_id=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, line_addr, line_tag, read_tag);
end end
end end
end end

View File

@@ -12,13 +12,13 @@ module VX_tex_addr #(
input wire req_valid, input wire req_valid,
input wire [NUM_REQS-1:0] req_tmask, input wire [NUM_REQS-1:0] req_tmask,
input wire [1:0][NUM_REQS-1:0][31:0] req_coords, input wire [1:0][NUM_REQS-1:0][`TEX_FXD_BITS-1:0] req_coords,
input wire [`TEX_FORMAT_BITS-1:0] req_format, input wire [`TEX_FORMAT_BITS-1:0] req_format,
input wire [`TEX_FILTER_BITS-1:0] req_filter, input wire [`TEX_FILTER_BITS-1:0] req_filter,
input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps, input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps,
input wire [`TEX_ADDR_BITS-1:0] req_baseaddr, input wire [`TEX_ADDR_BITS-1:0] req_baseaddr,
input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoff, input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoff,
input wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] req_logdims, input wire [NUM_REQS-1:0][1:0][`TEX_LOD_BITS-1:0] req_logdims,
input wire [REQ_INFOW-1:0] req_info, input wire [REQ_INFOW-1:0] req_info,
output wire req_ready, output wire req_ready,
@@ -27,31 +27,33 @@ module VX_tex_addr #(
output wire rsp_valid, output wire rsp_valid,
output wire [NUM_REQS-1:0] rsp_tmask, output wire [NUM_REQS-1:0] rsp_tmask,
output wire [`TEX_FILTER_BITS-1:0] rsp_filter, output wire [`TEX_FILTER_BITS-1:0] rsp_filter,
output wire [`TEX_STRIDE_BITS-1:0] rsp_stride, output wire [`TEX_LGSTRIDE_BITS-1:0] rsp_lgstride,
output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr, output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr,
output wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends, output wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends,
output wire [REQ_INFOW-1:0] rsp_info, output wire [REQ_INFOW-1:0] rsp_info,
input wire rsp_ready input wire rsp_ready
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_PARAM (CORE_ID)
localparam PITCH_BITS = `MAX(`TEX_DIM_BITS, `TEX_STRIDE_BITS) + 1; localparam SHIFT_BITS = $clog2(`TEX_FXD_FRAC+1);
localparam SCALED_U_W = `FIXED_INT + `TEX_STRIDE_BITS; localparam PITCH_BITS = `MAX(`TEX_LOD_BITS, `TEX_LGSTRIDE_BITS) + 1;
localparam SCALED_X_W = (2 * `FIXED_INT); localparam SCALED_X_W = `TEX_DIM_BITS + `TEX_BLEND_FRAC;
localparam SCALED_V_W = SCALED_X_W + `TEX_STRIDE_BITS; localparam OFFSET_U_W = `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX;
localparam OFFSET_V_W = `TEX_DIM_BITS + `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX;
wire valid_s0; wire valid_s0;
wire [NUM_REQS-1:0] tmask_s0; wire [NUM_REQS-1:0] tmask_s0;
wire [`TEX_FILTER_BITS-1:0] filter_s0; wire [`TEX_FILTER_BITS-1:0] filter_s0;
wire [REQ_INFOW-1:0] req_info_s0; wire [REQ_INFOW-1:0] req_info_s0;
wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_lo, clamped_lo_s0; wire [NUM_REQS-1:0][1:0][`TEX_FXD_FRAC-1:0] clamped_lo, clamped_lo_s0;
wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_hi, clamped_hi_s0; wire [NUM_REQS-1:0][1:0][`TEX_FXD_FRAC-1:0] clamped_hi, clamped_hi_s0;
wire [`TEX_STRIDE_BITS-1:0] log_stride, log_stride_s0; wire [NUM_REQS-1:0][1:0][SHIFT_BITS-1:0] dim_shift, dim_shift_s0;
wire [`TEX_LGSTRIDE_BITS-1:0] log_stride, log_stride_s0;
wire [NUM_REQS-1:0][31:0] mip_addr, mip_addr_s0; wire [NUM_REQS-1:0][31:0] mip_addr, mip_addr_s0;
wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] log_dims_s0;
wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0; wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0;
wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0;
wire stall_out; wire stall_out;
// stride // stride
@@ -67,9 +69,9 @@ module VX_tex_addr #(
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar j = 0; j < 2; ++j) begin for (genvar j = 0; j < 2; ++j) begin
wire [`FIXED_FRAC-1:0] delta = (`FIXED_HALF >> req_logdims[i][j]); wire [`TEX_FXD_FRAC-1:0] delta = (`TEX_FXD_HALF >> req_logdims[i][j]);
wire [31:0] coord_lo = req_filter ? (req_coords[j][i] - 32'(delta)) : req_coords[j][i]; wire [`TEX_FXD_BITS-1:0] coord_lo = req_filter ? (req_coords[j][i] - `TEX_FXD_BITS'(delta)) : req_coords[j][i];
wire [31:0] coord_hi = req_filter ? (req_coords[j][i] + 32'(delta)) : req_coords[j][i]; wire [`TEX_FXD_BITS-1:0] coord_hi = req_filter ? (req_coords[j][i] + `TEX_FXD_BITS'(delta)) : req_coords[j][i];
VX_tex_wrap #( VX_tex_wrap #(
.CORE_ID (CORE_ID) .CORE_ID (CORE_ID)
@@ -86,66 +88,72 @@ module VX_tex_addr #(
.coord_i (coord_hi), .coord_i (coord_hi),
.coord_o (clamped_hi[i][j]) .coord_o (clamped_hi[i][j])
); );
assign dim_shift[i][j] = (`TEX_FXD_FRAC - `TEX_BLEND_FRAC - req_logdims[i][j]);
end end
assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride); assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride);
assign mip_addr[i] = req_baseaddr + 32'(req_mipoff[i]); assign mip_addr[i] = req_baseaddr + 32'(req_mipoff[i]);
end end
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)), .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * SHIFT_BITS + 32 + 2 * 2 * `TEX_FXD_FRAC)),
.RESETW (1) .RESETW (1)
) pipe_reg0 ( ) pipe_reg0 (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (~stall_out), .enable (~stall_out),
.data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, req_logdims, mip_addr, clamped_lo, clamped_hi}), .data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, dim_shift, mip_addr, clamped_lo, clamped_hi}),
.data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0}) .data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, dim_shift_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0})
); );
// addresses generation // addresses generation
wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_lo; wire [NUM_REQS-1:0][1:0][SCALED_X_W-1:0] scaled_lo;
wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_hi; wire [NUM_REQS-1:0][1:0][SCALED_X_W-1:0] scaled_hi;
wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] blends; wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_lo;
wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_hi;
wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_lo;
wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_hi;
wire [NUM_REQS-1:0][31:0] base_addr_lo;
wire [NUM_REQS-1:0][31:0] base_addr_hi;
wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] blends;
wire [NUM_REQS-1:0][3:0][31:0] addr; wire [NUM_REQS-1:0][3:0][31:0] addr;
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar j = 0; j < 2; ++j) begin for (genvar j = 0; j < 2; ++j) begin
assign scaled_lo[i][j] = scale_to_dim(clamped_lo_s0[i][j], log_dims_s0[i][j]); assign scaled_lo[i][j] = SCALED_X_W'(clamped_lo_s0[i][j] >> dim_shift_s0[i][j]);
assign scaled_hi[i][j] = scale_to_dim(clamped_hi_s0[i][j], log_dims_s0[i][j]); assign scaled_hi[i][j] = SCALED_X_W'(clamped_hi_s0[i][j] >> dim_shift_s0[i][j]);
assign blends[i][j] = filter_s0 ? clamped_lo_s0[i][j][`BLEND_FRAC-1:0] : `BLEND_FRAC'(0); assign blends[i][j] = filter_s0 ? scaled_lo[i][j][`TEX_BLEND_FRAC-1:0] : `TEX_BLEND_FRAC'(0);
end end
end end
`UNUSED_VAR (log_pitch_s0)
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
wire [SCALED_U_W-1:0] offset_u_lo = SCALED_U_W'(scaled_lo[i][0]) << log_stride_s0; assign offset_u_lo[i] = OFFSET_U_W'(scaled_lo[i][0][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_stride_s0;
wire [SCALED_U_W-1:0] offset_u_hi = SCALED_U_W'(scaled_hi[i][0]) << log_stride_s0; assign offset_u_hi[i] = OFFSET_U_W'(scaled_hi[i][0][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_stride_s0;
wire [SCALED_V_W-1:0] offset_v_lo = SCALED_V_W'(scaled_lo[i][1]) << log_pitch_s0[i]; assign offset_v_lo[i] = OFFSET_V_W'(scaled_lo[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i];
wire [SCALED_V_W-1:0] offset_v_hi = SCALED_V_W'(scaled_hi[i][1]) << log_pitch_s0[i]; assign offset_v_hi[i] = OFFSET_V_W'(scaled_hi[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i];
wire [31:0] base_addr_lo = mip_addr_s0[i] + 32'(offset_v_lo); assign base_addr_lo[i] = mip_addr_s0[i] + 32'(offset_v_lo[i]);
wire [31:0] base_addr_hi = mip_addr_s0[i] + 32'(offset_v_hi); assign base_addr_hi[i] = mip_addr_s0[i] + 32'(offset_v_hi[i]);
assign addr[i][0] = base_addr_lo + 32'(offset_u_lo); assign addr[i][0] = base_addr_lo[i] + 32'(offset_u_lo[i]);
assign addr[i][1] = base_addr_lo + 32'(offset_u_hi); assign addr[i][1] = base_addr_lo[i] + 32'(offset_u_hi[i]);
assign addr[i][2] = base_addr_hi + 32'(offset_u_lo); assign addr[i][2] = base_addr_hi[i] + 32'(offset_u_lo[i]);
assign addr[i][3] = base_addr_hi + 32'(offset_u_hi); assign addr[i][3] = base_addr_hi[i] + 32'(offset_u_hi[i]);
end end
assign stall_out = rsp_valid && ~rsp_ready; assign stall_out = rsp_valid && ~rsp_ready;
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `BLEND_FRAC) + REQ_INFOW), .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `TEX_BLEND_FRAC) + REQ_INFOW),
.RESETW (1) .RESETW (1)
) pipe_reg1 ( ) pipe_reg1 (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (~stall_out), .enable (~stall_out),
.data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, addr, blends, req_info_s0}), .data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, addr, blends, req_info_s0}),
.data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_stride, rsp_addr, rsp_blends, rsp_info}) .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_lgstride, rsp_addr, rsp_blends, rsp_info})
); );
assign req_ready = ~stall_out; assign req_ready = ~stall_out;
@@ -157,22 +165,47 @@ module VX_tex_addr #(
assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];
always @(posedge clk) begin always @(posedge clk) begin
if (req_valid && ~stall_out) begin
dpi_trace("%d: *** log_pitch=", $time);
`TRACE_ARRAY1D(log_pitch, NUM_REQS);
dpi_trace(", mip_addr=");
`TRACE_ARRAY1D(mip_addr, NUM_REQS);
dpi_trace(", req_logdims=");
`TRACE_ARRAY2D(req_logdims, 2, NUM_REQS);
dpi_trace(", clamped_lo=");
`TRACE_ARRAY2D(clamped_lo, 2, NUM_REQS);
dpi_trace(", clamped_hi=");
`TRACE_ARRAY2D(clamped_hi, 2, NUM_REQS);
dpi_trace("\n");
end
if (valid_s0 && ~stall_out) begin
dpi_trace("%d: *** scaled_lo=", $time);
`TRACE_ARRAY2D(scaled_lo, 2, NUM_REQS);
dpi_trace(", scaled_hi=");
`TRACE_ARRAY2D(scaled_hi, 2, NUM_REQS);
dpi_trace(", offset_u_lo=");
`TRACE_ARRAY1D(offset_u_lo, NUM_REQS);
dpi_trace(", offset_u_hi=");
`TRACE_ARRAY1D(offset_u_hi, NUM_REQS);
dpi_trace(", offset_v_lo=");
`TRACE_ARRAY1D(offset_v_lo, NUM_REQS);
dpi_trace(", offset_v_hi=");
`TRACE_ARRAY1D(offset_v_hi, NUM_REQS);
dpi_trace(", base_addr_lo=");
`TRACE_ARRAY1D(base_addr_lo, NUM_REQS);
dpi_trace(", base_addr_hi=");
`TRACE_ARRAY1D(base_addr_hi, NUM_REQS);
dpi_trace("\n");
end
if (rsp_valid && rsp_ready) begin if (rsp_valid && rsp_ready) begin
dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, tride=%0d, addr=", dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, lgstride=%0d, addr=",
$time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_stride); $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_lgstride);
`TRACE_ARRAY2D(rsp_addr, 4, NUM_REQS); `TRACE_ARRAY2D(rsp_addr, 4, NUM_REQS);
dpi_trace("\n"); dpi_trace("\n");
end end
end end
`endif `endif
function logic [`FIXED_INT-1:0] scale_to_dim (input logic [`FIXED_FRAC-1:0] src,
input logic [`TEX_DIM_BITS-1:0] dim);
`IGNORE_WARNINGS_BEGIN
logic [`FIXED_BITS-1:0] out;
`IGNORE_WARNINGS_END
out = `FIXED_BITS'(src) << dim;
return out[`FIXED_FRAC +: `FIXED_INT];
endfunction
endmodule endmodule

View File

@@ -3,31 +3,26 @@
`include "VX_define.vh" `include "VX_define.vh"
`define FIXED_BITS 32 `define TEX_FXD_INT (`TEX_FXD_BITS - `TEX_FXD_FRAC)
`define FIXED_FRAC 20 `define TEX_FXD_ONE (2 ** `TEX_FXD_FRAC)
`define FIXED_INT (`FIXED_BITS - `FIXED_FRAC) `define TEX_FXD_HALF (`TEX_FXD_ONE >> 1)
`define FIXED_ONE (2 ** `FIXED_FRAC) `define TEX_FXD_MASK (`TEX_FXD_ONE - 1)
`define FIXED_HALF (`FIXED_ONE >> 1)
`define FIXED_MASK (`FIXED_ONE - 1)
`define TEX_ADDR_BITS 32 `define TEX_ADDR_BITS 32
`define TEX_FORMAT_BITS 3 `define TEX_FORMAT_BITS 3
`define TEX_WRAP_BITS 2 `define TEX_WRAP_BITS 2
`define TEX_DIM_BITS 4
`define TEX_FILTER_BITS 1 `define TEX_FILTER_BITS 1
`define TEX_MIPOFF_BITS (2*`TEX_DIM_BITS+1)
`define TEX_MIPOFF_BITS (2*12+1) `define TEX_LGSTRIDE_MAX 2
`define TEX_STRIDE_BITS 2 `define TEX_LGSTRIDE_BITS 2
`define TEX_LOD_BITS 4
`define TEX_MIP_BITS (`NTEX_BITS + `TEX_LOD_BITS)
`define TEX_WRAP_CLAMP 0 `define TEX_WRAP_CLAMP 0
`define TEX_WRAP_REPEAT 1 `define TEX_WRAP_REPEAT 1
`define TEX_WRAP_MIRROR 2 `define TEX_WRAP_MIRROR 2
`define BLEND_FRAC 8 `define TEX_BLEND_FRAC 8
`define BLEND_ONE (2 ** `BLEND_FRAC) `define TEX_BLEND_ONE (2 ** `TEX_BLEND_FRAC)
`define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0) `define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0)
`define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1) `define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1)

View File

@@ -15,7 +15,7 @@ module VX_tex_mem #(
input wire req_valid, input wire req_valid,
input wire [NUM_REQS-1:0] req_tmask, input wire [NUM_REQS-1:0] req_tmask,
input wire [`TEX_FILTER_BITS-1:0] req_filter, input wire [`TEX_FILTER_BITS-1:0] req_filter,
input wire [`TEX_STRIDE_BITS-1:0] req_stride, input wire [`TEX_LGSTRIDE_BITS-1:0] req_lgstride,
input wire [NUM_REQS-1:0][3:0][31:0] req_addr, input wire [NUM_REQS-1:0][3:0][31:0] req_addr,
input wire [REQ_INFOW-1:0] req_info, input wire [REQ_INFOW-1:0] req_info,
output wire req_ready, output wire req_ready,
@@ -63,23 +63,23 @@ module VX_tex_mem #(
wire [NUM_REQS-1:0] q_req_tmask; wire [NUM_REQS-1:0] q_req_tmask;
wire [`TEX_FILTER_BITS-1:0] q_req_filter; wire [`TEX_FILTER_BITS-1:0] q_req_filter;
wire [REQ_INFOW-1:0] q_req_info; wire [REQ_INFOW-1:0] q_req_info;
wire [`TEX_STRIDE_BITS-1:0] q_req_stride; wire [`TEX_LGSTRIDE_BITS-1:0] q_req_lgstride;
wire [3:0][NUM_REQS-1:0][1:0] q_align_offs; wire [3:0][NUM_REQS-1:0][1:0] q_align_offs;
wire [3:0] q_dup_reqs; wire [3:0] q_dup_reqs;
assign reqq_push = req_valid && req_ready; assign reqq_push = req_valid && req_ready;
VX_fifo_queue #( VX_fifo_queue #(
.DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (4 * NUM_REQS * 2) + 4), .DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (4 * NUM_REQS * 2) + 4),
.SIZE (`LSUQ_SIZE), .SIZE (`TEXQ_SIZE),
.OUT_REG (1) .OUT_REG (1)
) req_queue ( ) req_queue (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.push (reqq_push), .push (reqq_push),
.pop (reqq_pop), .pop (reqq_pop),
.data_in ({req_addr_w, req_tmask, req_info, req_filter, req_stride, align_offs, dup_reqs}), .data_in ({req_addr_w, req_tmask, req_info, req_filter, req_lgstride, align_offs, dup_reqs}),
.data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_stride, q_align_offs, q_dup_reqs}), .data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_lgstride, q_align_offs, q_dup_reqs}),
.empty (reqq_empty), .empty (reqq_empty),
.full (reqq_full), .full (reqq_full),
`UNUSED_PIN (alm_full), `UNUSED_PIN (alm_full),
@@ -96,8 +96,12 @@ module VX_tex_mem #(
wire sent_all_ready, last_texel_sent; wire sent_all_ready, last_texel_sent;
wire req_texel_dup; wire req_texel_dup;
wire [NUM_REQS-1:0][29:0] req_texel_addr; wire [NUM_REQS-1:0][29:0] req_texel_addr;
reg [`DBG_CACHE_REQ_IDW-1:0] req_id;
wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id;
reg [1:0] req_texel_idx; reg [1:0] req_texel_idx;
reg req_texels_done; reg req_texels_done;
`UNUSED_VAR (rsp_req_id)
always @(posedge clk) begin always @(posedge clk) begin
if (reset || last_texel_sent) begin if (reset || last_texel_sent) begin
@@ -146,14 +150,19 @@ module VX_tex_mem #(
assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask; assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask;
assign dcache_req_if.rw = {NUM_REQS{1'b0}}; assign dcache_req_if.rw = {NUM_REQS{1'b0}};
assign dcache_req_if.addr = req_texel_addr; assign dcache_req_if.addr = req_texel_addr;
assign dcache_req_if.byteen = {NUM_REQS{4'b1111}}; assign dcache_req_if.byteen = {NUM_REQS{4'b0}};
assign dcache_req_if.data = 'x; assign dcache_req_if.data = 'x;
assign dcache_req_if.tag = {NUM_REQS{req_id, `LSU_TAG_ID_BITS'(req_texel_idx), `CACHE_ADDR_TYPE_BITS'(0)}};
`ifdef DBG_CACHE_REQ_INFO always @(posedge clk) begin
assign dcache_req_if.tag = {NUM_REQS{q_req_info[`DBG_CACHE_REQ_MDATAW-1:0], req_texel_idx}}; if (reset) begin
`else req_id <= `DBG_CACHE_REQ_ID(2, 0);
assign dcache_req_if.tag = {NUM_REQS{req_texel_idx}}; end else begin
`endif if (dcache_req_fire_any) begin
req_id <= req_id + 1;
end
end
end
// Dcache Response // Dcache Response
@@ -162,14 +171,17 @@ module VX_tex_mem #(
reg [NUM_REQS-1:0][31:0] rsp_data_qual; reg [NUM_REQS-1:0][31:0] rsp_data_qual;
reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init; reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init;
wire [RSP_CTR_W-1:0] rsp_rem_ctr_n; wire [RSP_CTR_W-1:0] rsp_rem_ctr_n;
wire [NUM_REQS-1:0][1:0] rsp_align_offs;
wire dcache_rsp_fire; wire dcache_rsp_fire;
wire [1:0] rsp_texel_idx; wire [1:0] rsp_texel_idx;
wire rsp_texel_dup; wire rsp_texel_dup;
assign rsp_texel_idx = dcache_rsp_if.tag[1:0]; assign rsp_texel_idx = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: 2];
assign rsp_req_id = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS + `LSU_TAG_ID_BITS +: `DBG_CACHE_REQ_IDW];
`UNUSED_VAR (dcache_rsp_if.tag) `UNUSED_VAR (dcache_rsp_if.tag)
assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx]; assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx];
assign rsp_align_offs = q_align_offs[rsp_texel_idx];
assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready; assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;
@@ -180,12 +192,12 @@ module VX_tex_mem #(
reg [31:0] rsp_data_shifted; reg [31:0] rsp_data_shifted;
always @(*) begin always @(*) begin
rsp_data_shifted[31:16] = src_data[31:16]; rsp_data_shifted[31:16] = src_data[31:16];
rsp_data_shifted[15:0] = q_align_offs[rsp_texel_idx][i][1] ? src_data[31:16] : src_data[15:0]; rsp_data_shifted[15:0] = rsp_align_offs[i][1] ? src_data[31:16] : src_data[15:0];
rsp_data_shifted[7:0] = q_align_offs[rsp_texel_idx][i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0]; rsp_data_shifted[7:0] = rsp_align_offs[i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0];
end end
always @(*) begin always @(*) begin
case (q_req_stride) case (q_req_lgstride)
0: rsp_data_qual[i] = 32'(rsp_data_shifted[7:0]); 0: rsp_data_qual[i] = 32'(rsp_data_shifted[7:0]);
1: rsp_data_qual[i] = 32'(rsp_data_shifted[15:0]); 1: rsp_data_qual[i] = 32'(rsp_data_shifted[15:0]);
default: rsp_data_qual[i] = rsp_data_shifted; default: rsp_data_qual[i] = rsp_data_shifted;
@@ -266,20 +278,20 @@ module VX_tex_mem #(
always @(posedge clk) begin always @(posedge clk) begin
if (dcache_req_fire_any) begin if (dcache_req_fire_any) begin
dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=", dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, addr=",
$time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_texel_idx); $time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_id, req_texel_idx);
`TRACE_ARRAY1D(req_texel_addr, NUM_REQS); `TRACE_ARRAY1D(req_texel_addr, NUM_REQS);
dpi_trace(", is_dup=%b\n", req_texel_dup); dpi_trace(", is_dup=%b\n", req_texel_dup);
end end
if (dcache_rsp_fire) begin if (dcache_rsp_fire) begin
dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=", dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, data=",
$time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_texel_idx); $time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_req_id, rsp_texel_idx);
`TRACE_ARRAY1D(dcache_rsp_if.data, NUM_REQS); `TRACE_ARRAY1D(dcache_rsp_if.data, NUM_REQS);
dpi_trace("\n"); dpi_trace("\n");
end end
if (req_valid && req_ready) begin if (req_valid && req_ready) begin
dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, stride=%0d, addr=", dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, lgstride=%0d, addr=",
$time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_stride); $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_lgstride);
`TRACE_ARRAY2D(req_addr, 4, NUM_REQS); `TRACE_ARRAY2D(req_addr, 4, NUM_REQS);
dpi_trace("\n"); dpi_trace("\n");
end end

View File

@@ -12,7 +12,7 @@ module VX_tex_sampler #(
input wire req_valid, input wire req_valid,
input wire [NUM_REQS-1:0] req_tmask, input wire [NUM_REQS-1:0] req_tmask,
input wire [`TEX_FORMAT_BITS-1:0] req_format, input wire [`TEX_FORMAT_BITS-1:0] req_format,
input wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] req_blends, input wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] req_blends,
input wire [NUM_REQS-1:0][3:0][31:0] req_data, input wire [NUM_REQS-1:0][3:0][31:0] req_data,
input wire [REQ_INFOW-1:0] req_info, input wire [REQ_INFOW-1:0] req_info,
output wire req_ready, output wire req_ready,
@@ -32,7 +32,7 @@ module VX_tex_sampler #(
wire [REQ_INFOW-1:0] req_info_s0; wire [REQ_INFOW-1:0] req_info_s0;
wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh; wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh;
wire [NUM_REQS-1:0][31:0] texel_ul_s0, texel_uh_s0; wire [NUM_REQS-1:0][31:0] texel_ul_s0, texel_uh_s0;
wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] blend_v, blend_v_s0; wire [NUM_REQS-1:0][`TEX_BLEND_FRAC-1:0] blend_v, blend_v_s0;
wire [NUM_REQS-1:0][31:0] texel_v; wire [NUM_REQS-1:0][31:0] texel_v;
wire stall_out; wire stall_out;
@@ -52,7 +52,7 @@ module VX_tex_sampler #(
end end
wire [7:0] beta = req_blends[i][0]; wire [7:0] beta = req_blends[i][0];
wire [8:0] alpha = `BLEND_ONE - beta; wire [8:0] alpha = `TEX_BLEND_ONE - beta;
VX_tex_lerp #( VX_tex_lerp #(
) tex_lerp_ul ( ) tex_lerp_ul (
@@ -76,7 +76,7 @@ module VX_tex_sampler #(
end end
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `BLEND_FRAC) + (2 * NUM_REQS * 32)), .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `TEX_BLEND_FRAC) + (2 * NUM_REQS * 32)),
.RESETW (1) .RESETW (1)
) pipe_reg0 ( ) pipe_reg0 (
.clk (clk), .clk (clk),
@@ -88,7 +88,7 @@ module VX_tex_sampler #(
for (genvar i = 0; i < NUM_REQS; i++) begin for (genvar i = 0; i < NUM_REQS; i++) begin
wire [7:0] beta = blend_v_s0[i]; wire [7:0] beta = blend_v_s0[i];
wire [8:0] alpha = `BLEND_ONE - beta; wire [8:0] alpha = `TEX_BLEND_ONE - beta;
VX_tex_lerp #( VX_tex_lerp #(
) tex_lerp_v ( ) tex_lerp_v (

View File

@@ -4,11 +4,11 @@ module VX_tex_stride #(
parameter CORE_ID = 0 parameter CORE_ID = 0
) ( ) (
input wire [`TEX_FORMAT_BITS-1:0] format, input wire [`TEX_FORMAT_BITS-1:0] format,
output wire [`TEX_STRIDE_BITS-1:0] log_stride output wire [`TEX_LGSTRIDE_BITS-1:0] log_stride
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_PARAM (CORE_ID)
reg [`TEX_STRIDE_BITS-1:0] log_stride_r; reg [`TEX_LGSTRIDE_BITS-1:0] log_stride_r;
always @(*) begin always @(*) begin
case (format) case (format)

View File

@@ -20,13 +20,13 @@ module VX_tex_unit #(
localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32; localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32;
localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S; localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S;
localparam REQ_INFOW_M = (2 * `NUM_THREADS * `BLEND_FRAC) + REQ_INFOW_A; localparam REQ_INFOW_M = (2 * `NUM_THREADS * `TEX_BLEND_FRAC) + REQ_INFOW_A;
reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][`TEX_LOD_MAX+1-1:0];
reg [1:0][`TEX_DIM_BITS-1:0] tex_dims [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; reg [1:0][`TEX_LOD_BITS-1:0] tex_logdims [`NUM_TEX_UNITS-1:0];
reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0];
reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0]; reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0];
reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0]; reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0];
reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0];
reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0]; reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0];
// CSRs programming // CSRs programming
@@ -35,38 +35,46 @@ module VX_tex_unit #(
`UNUSED_VAR (csrs_dirty) `UNUSED_VAR (csrs_dirty)
for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin
wire [`TEX_LOD_BITS-1:0] mip_level = tex_csr_if.write_data[28 +: `TEX_LOD_BITS];
always @(posedge clk) begin always @(posedge clk) begin
if (tex_csr_if.write_enable) begin if (tex_csr_if.write_enable) begin
case (tex_csr_if.write_addr) case (tex_csr_if.write_addr)
`CSR_TEX_ADDR(i) : begin `CSR_TEX(i, `TEX_STATE_ADDR) : begin
tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0];
csrs_dirty[i] <= 1; csrs_dirty[i] <= 1;
end end
`CSR_TEX_FORMAT(i) : begin `CSR_TEX(i, `TEX_STATE_FORMAT) : begin
tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0];
csrs_dirty[i] <= 1; csrs_dirty[i] <= 1;
end end
`CSR_TEX_WRAP(i) : begin `CSR_TEX(i, `TEX_STATE_WRAPU) : begin
tex_wraps[i][0] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS]; tex_wraps[i][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0];
tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS];
csrs_dirty[i] <= 1; csrs_dirty[i] <= 1;
end end
`CSR_TEX_FILTER(i) : begin `CSR_TEX(i, `TEX_STATE_WRAPV) : begin
tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0];
csrs_dirty[i] <= 1;
end
`CSR_TEX(i, `TEX_STATE_FILTER) : begin
tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0];
csrs_dirty[i] <= 1; csrs_dirty[i] <= 1;
end end
`CSR_TEX_MIPOFF(i) : begin `CSR_TEX(i, `TEX_STATE_WIDTH) : begin
tex_mipoff[i][mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; tex_logdims[i][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0];
csrs_dirty[i] <= 1; csrs_dirty[i] <= 1;
end end
`CSR_TEX_WIDTH(i) : begin `CSR_TEX(i, `TEX_STATE_HEIGHT) : begin
tex_dims[i][mip_level][0] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; tex_logdims[i][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0];
csrs_dirty[i] <= 1; csrs_dirty[i] <= 1;
end end
`CSR_TEX_HEIGHT(i) : begin default: begin
tex_dims[i][mip_level][1] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; for (integer j = 0; j <= `TEX_LOD_MAX; ++j) begin
csrs_dirty[i] <= 1; `IGNORE_WARNINGS_BEGIN
if (tex_csr_if.write_addr == `CSR_ADDR_BITS'(`CSR_TEX(i, `TEX_STATE_MIPOFF(j)))) begin
`IGNORE_WARNINGS_END
tex_mipoff[i][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0];
csrs_dirty[i] <= 1;
end
end
end end
endcase endcase
end end
@@ -78,14 +86,15 @@ module VX_tex_unit #(
// mipmap attributes // mipmap attributes
wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff; wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff;
wire [`NUM_THREADS-1:0][1:0][`TEX_DIM_BITS-1:0] sel_dims; wire [`NUM_THREADS-1:0][1:0][`TEX_LOD_BITS-1:0] sel_logdims;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin for (genvar i = 0; i < `NUM_THREADS; ++i) begin
wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0]; wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0];
wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][20+:`TEX_LOD_BITS]; wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][`TEX_LOD_BITS-1:0];
assign sel_mipoff[i] = tex_mipoff[unit][mip_level]; assign sel_mipoff[i] = tex_mipoff[unit][mip_level];
assign sel_dims[i] = tex_dims[unit][mip_level]; assign sel_logdims[i][0] = (tex_logdims[unit][0] - mip_level);
assign sel_logdims[i][1] = (tex_logdims[unit][1] - mip_level);
end end
// address generation // address generation
@@ -93,8 +102,8 @@ module VX_tex_unit #(
wire mem_req_valid; wire mem_req_valid;
wire [`NUM_THREADS-1:0] mem_req_tmask; wire [`NUM_THREADS-1:0] mem_req_tmask;
wire [`TEX_FILTER_BITS-1:0] mem_req_filter; wire [`TEX_FILTER_BITS-1:0] mem_req_filter;
wire [`TEX_STRIDE_BITS-1:0] mem_req_stride; wire [`TEX_LGSTRIDE_BITS-1:0] mem_req_lgstride;
wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] mem_req_blends; wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] mem_req_blends;
wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr; wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr;
wire [REQ_INFOW_A-1:0] mem_req_info; wire [REQ_INFOW_A-1:0] mem_req_info;
wire mem_req_ready; wire mem_req_ready;
@@ -113,16 +122,16 @@ module VX_tex_unit #(
.req_format (tex_format[tex_req_if.unit]), .req_format (tex_format[tex_req_if.unit]),
.req_filter (tex_filter[tex_req_if.unit]), .req_filter (tex_filter[tex_req_if.unit]),
.req_wraps (tex_wraps[tex_req_if.unit]), .req_wraps (tex_wraps[tex_req_if.unit]),
.req_baseaddr (tex_baddr[tex_req_if.unit]), .req_baseaddr(tex_baddr[tex_req_if.unit]),
.req_mipoff (sel_mipoff), .req_mipoff (sel_mipoff),
.req_logdims (sel_dims), .req_logdims(sel_logdims),
.req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), .req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}),
.req_ready (tex_req_if.ready), .req_ready (tex_req_if.ready),
.rsp_valid (mem_req_valid), .rsp_valid (mem_req_valid),
.rsp_tmask (mem_req_tmask), .rsp_tmask (mem_req_tmask),
.rsp_filter (mem_req_filter), .rsp_filter (mem_req_filter),
.rsp_stride (mem_req_stride), .rsp_lgstride(mem_req_lgstride),
.rsp_addr (mem_req_addr), .rsp_addr (mem_req_addr),
.rsp_blends (mem_req_blends), .rsp_blends (mem_req_blends),
.rsp_info (mem_req_info), .rsp_info (mem_req_info),
@@ -142,8 +151,8 @@ module VX_tex_unit #(
.REQ_INFOW (REQ_INFOW_M), .REQ_INFOW (REQ_INFOW_M),
.NUM_REQS (`NUM_THREADS) .NUM_REQS (`NUM_THREADS)
) tex_mem ( ) tex_mem (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
// memory interface // memory interface
.dcache_req_if (dcache_req_if), .dcache_req_if (dcache_req_if),
@@ -153,7 +162,7 @@ module VX_tex_unit #(
.req_valid (mem_req_valid), .req_valid (mem_req_valid),
.req_tmask (mem_req_tmask), .req_tmask (mem_req_tmask),
.req_filter(mem_req_filter), .req_filter(mem_req_filter),
.req_stride(mem_req_stride), .req_lgstride(mem_req_lgstride),
.req_addr (mem_req_addr), .req_addr (mem_req_addr),
.req_info ({mem_req_blends, mem_req_info}), .req_info ({mem_req_blends, mem_req_info}),
.req_ready (mem_req_ready), .req_ready (mem_req_ready),
@@ -168,7 +177,7 @@ module VX_tex_unit #(
// apply sampler // apply sampler
wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends; wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends;
wire [`TEX_FORMAT_BITS-1:0] rsp_format; wire [`TEX_FORMAT_BITS-1:0] rsp_format;
wire [REQ_INFOW_S-1:0] rsp_info; wire [REQ_INFOW_S-1:0] rsp_info;
@@ -205,13 +214,12 @@ module VX_tex_unit #(
for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin
if (csrs_dirty[i]) begin if (csrs_dirty[i]) begin
dpi_trace("%d: core%0d-tex-csr: tex%0d_addr=%0h\n", $time, CORE_ID, i, tex_baddr[i]); dpi_trace("%d: core%0d-tex-csr: tex%0d_addr=%0h\n", $time, CORE_ID, i, tex_baddr[i]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_logwidth=%0h\n", $time, CORE_ID, i, tex_logdims[i][0]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_logheight=%0h\n", $time, CORE_ID, i, tex_logdims[i][1]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_format=%0h\n", $time, CORE_ID, i, tex_format[i]); dpi_trace("%d: core%0d-tex-csr: tex%0d_format=%0h\n", $time, CORE_ID, i, tex_format[i]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_u=%0h\n", $time, CORE_ID, i, tex_wraps[i][0]); dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_u=%0h\n", $time, CORE_ID, i, tex_wraps[i][0]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_v=%0h\n", $time, CORE_ID, i, tex_wraps[i][1]); dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_v=%0h\n", $time, CORE_ID, i, tex_wraps[i][1]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_filter=%0h\n", $time, CORE_ID, i, tex_filter[i]); dpi_trace("%d: core%0d-tex-csr: tex%0d_filter=%0h\n", $time, CORE_ID, i, tex_filter[i]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_mipoff[0]=%0h\n", $time, CORE_ID, i, tex_mipoff[i][0]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_width[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][0]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_height[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][1]);
end end
end end

View File

@@ -4,19 +4,19 @@ module VX_tex_wrap #(
parameter CORE_ID = 0 parameter CORE_ID = 0
) ( ) (
input wire [`TEX_WRAP_BITS-1:0] wrap_i, input wire [`TEX_WRAP_BITS-1:0] wrap_i,
input wire [31:0] coord_i, input wire [`TEX_FXD_BITS-1:0] coord_i,
output wire [`FIXED_FRAC-1:0] coord_o output wire [`TEX_FXD_FRAC-1:0] coord_o
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_PARAM (CORE_ID)
reg [`FIXED_FRAC-1:0] coord_r; reg [`TEX_FXD_FRAC-1:0] coord_r;
wire [`FIXED_FRAC-1:0] clamp; wire [`TEX_FXD_FRAC-1:0] clamp;
VX_tex_sat #( VX_tex_sat #(
.IN_W (32), .IN_W (`TEX_FXD_BITS),
.OUT_W (`FIXED_FRAC) .OUT_W (`TEX_FXD_FRAC)
) sat_fx ( ) sat_fx (
.data_in (coord_i), .data_in (coord_i),
.data_out (clamp) .data_out (clamp)
@@ -27,9 +27,9 @@ module VX_tex_wrap #(
`TEX_WRAP_CLAMP: `TEX_WRAP_CLAMP:
coord_r = clamp; coord_r = clamp;
`TEX_WRAP_MIRROR: `TEX_WRAP_MIRROR:
coord_r = coord_i[`FIXED_FRAC-1:0] ^ {`FIXED_FRAC{coord_i[`FIXED_FRAC]}}; coord_r = coord_i[`TEX_FXD_FRAC-1:0] ^ {`TEX_FXD_FRAC{coord_i[`TEX_FXD_FRAC]}};
default: //`TEX_WRAP_REPEAT default: //`TEX_WRAP_REPEAT
coord_r = coord_i[`FIXED_FRAC-1:0]; coord_r = coord_i[`TEX_FXD_FRAC-1:0];
endcase endcase
end end

View File

@@ -23,7 +23,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_FLAGS += $(DBG_TRACE_FLAGS) DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG2 := -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) CONFIG2 := -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)

View File

@@ -10,7 +10,7 @@ CFLAGS += -I./include -I../hw
PROJECT = libvortexrt PROJECT = libvortexrt
SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c
OBJS := $(addsuffix .o, $(notdir $(SRCS))) OBJS := $(addsuffix .o, $(notdir $(SRCS)))

View File

@@ -5,62 +5,7 @@
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
#ifdef __ASSEMBLY__
#define __ASM_STR(x) x
#else
#define __ASM_STR(x) #x
#endif
#define vx_csr_swap(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
#define vx_csr_read(csr) ({ \
register unsigned __v; \
__asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \
__v; \
})
#define vx_csr_write(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
#define vx_csr_read_set(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
#define vx_csr_set(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
#define vx_csr_read_clear(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
#define vx_csr_clear(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
// Texture load
#define vx_tex(unit, u, v, l) ({ \
unsigned __r; \
unsigned __u = u; \
unsigned __v = v; \
unsigned __l = l; \
__asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \
__r; \
})
#ifdef __ASSEMBLY__ #ifdef __ASSEMBLY__
#define __ASM_STR(x) x #define __ASM_STR(x) x
@@ -68,72 +13,77 @@ extern "C" {
#define __ASM_STR(x) #x #define __ASM_STR(x) #x
#endif #endif
#define vx_csr_swap(csr, val) ({ \ #define csr_read(csr) ({ \
unsigned __v = (unsigned )(val); \ unsigned __r; \
__asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ __asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \
__v; \
})
#define vx_csr_read(csr) ({ \
register unsigned __v; \
__asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \
__v; \
})
#define vx_csr_write(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
#define vx_csr_read_set(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
#define vx_csr_set(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
#define vx_csr_read_clear(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
#define vx_csr_clear(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
// Texture load
#define vx_tex(unit, u, v, l) ({ \
unsigned __r; \
unsigned __u = u; \
unsigned __v = v; \
unsigned __l = l; \
__asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \
__r; \ __r; \
}) })
// Lerp instruction #define csr_write(csr, val) ({ \
#define vx_lerp(a, b, s) ({ \ unsigned __v = (unsigned)(val); \
unsigned __r; \ if (__builtin_constant_p(val) && __v < 32) \
unsigned __a = a; \ __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \
unsigned __b = b; \ else \
unsigned __s = s; \ __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \
__asm__ __volatile__ (".insn r4 0x6b, 7, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r : "r"(__a), "r"(__b), "r"(__s)); \ })
#define csr_swap(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
#define csr_read_set(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
#define csr_set(csr, val) ({ \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \
})
#define csr_read_clear(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
#define csr_clear(csr, val) ({ \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \
})
// Texture load
#define vx_tex(unit, u, v, lod) ({ \
unsigned __r; \
__asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(unit), "r"(u), "r"(v), "r"(lod)); \
__r; \ __r; \
}) })
// Conditional move // Conditional move
#define vx_cmov(c, t, f) ({ \ #define vx_cmov(c, t, f) ({ \
unsigned __r; \ unsigned __r; \
unsigned __c = c; \ __asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r : "r"(c), "r"(t), "r"(f)); \
unsigned __t = t; \
unsigned __f = f; \
__asm__ __volatile__ (".insn r4 0x6b, 6, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r : "r"(__c), "r"(__t), "r"(__f)); \
__r; \ __r; \
}) })
@@ -171,7 +121,7 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
// Prefetch // Prefetch
inline void vx_prefetch(unsigned addr) { inline void vx_prefetch(unsigned addr) {
asm volatile (".insn s 0x6b, 6, x0, 0(%0)" :: "r"(addr) ); asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
} }
// Return active warp's thread id // Return active warp's thread id

890
runtime/src/tinyprintf.c Normal file
View File

@@ -0,0 +1,890 @@
///////////////////////////////////////////////////////////////////////////////
// \author (c) Marco Paland (info@paland.com)
// 2014-2019, PALANDesign Hannover, Germany
//
// \license The MIT License (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// \brief Tiny printf, sprintf and (v)snprintf implementation, optimized for speed on
// embedded systems with very limited resources. These routines are thread
// safe and reentrant!
// Use this instead of the bloated standard/newlib printf because those use
// malloc for printf (and may not be thread safe).
//
///////////////////////////////////////////////////////////////////////////////
#include <stdbool.h>
#include <stdint.h>
#include "tinyprintf.h"
#include "vx_print.h"
// define this globally (e.g. gcc -DPRINTF_INCLUDE_CONFIG_H ...) to include the
// printf_config.h header file
// default: undefined
#ifdef PRINTF_INCLUDE_CONFIG_H
#include "printf_config.h"
#endif
// 'ntoa' conversion buffer size, this must be big enough to hold one converted
// numeric number including padded zeros (dynamically created on stack)
// default: 32 byte
#ifndef PRINTF_NTOA_BUFFER_SIZE
#define PRINTF_NTOA_BUFFER_SIZE 32U
#endif
// 'ftoa' conversion buffer size, this must be big enough to hold one converted
// float number including padded zeros (dynamically created on stack)
// default: 32 byte
#ifndef PRINTF_FTOA_BUFFER_SIZE
#define PRINTF_FTOA_BUFFER_SIZE 32U
#endif
// support for the floating point type (%f)
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_FLOAT
#define PRINTF_SUPPORT_FLOAT
#endif
// support for exponential floating point notation (%e/%g)
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_EXPONENTIAL
#define PRINTF_SUPPORT_EXPONENTIAL
#endif
// define the default floating point precision
// default: 6 digits
#ifndef PRINTF_DEFAULT_FLOAT_PRECISION
#define PRINTF_DEFAULT_FLOAT_PRECISION 6U
#endif
// define the largest float suitable to print with %f
// default: 1e9
#ifndef PRINTF_MAX_FLOAT
#define PRINTF_MAX_FLOAT 1e9
#endif
// support for the long long types (%llu or %p)
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_LONG_LONG
#define PRINTF_SUPPORT_LONG_LONG
#endif
// support for the ptrdiff_t type (%t)
// ptrdiff_t is normally defined in <stddef.h> as long or long long type
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_PTRDIFF_T
#define PRINTF_SUPPORT_PTRDIFF_T
#endif
///////////////////////////////////////////////////////////////////////////////
// internal flag definitions
#define FLAGS_ZEROPAD (1U << 0U)
#define FLAGS_LEFT (1U << 1U)
#define FLAGS_PLUS (1U << 2U)
#define FLAGS_SPACE (1U << 3U)
#define FLAGS_HASH (1U << 4U)
#define FLAGS_UPPERCASE (1U << 5U)
#define FLAGS_CHAR (1U << 6U)
#define FLAGS_SHORT (1U << 7U)
#define FLAGS_LONG (1U << 8U)
#define FLAGS_LONG_LONG (1U << 9U)
#define FLAGS_PRECISION (1U << 10U)
#define FLAGS_ADAPT_EXP (1U << 11U)
// import float.h for DBL_MAX
#if defined(PRINTF_SUPPORT_FLOAT)
#include <float.h>
#endif
// output function type
// Common signature of every output sink in this file: 'character' is the char to emit,
// 'buffer' is an opaque destination handed through unchanged (a char array for _out_buffer,
// an out_fct_wrap_type for _out_fct), 'idx' is the output position of the character and
// 'maxlen' is the capacity limit that _out_buffer enforces.
typedef void (*out_fct_type)(char character, void* buffer, size_t idx, size_t maxlen);
// wrapper (used as buffer) for output function type
// An instance is passed through the 'buffer' argument of _out_fct, which then
// calls fct(character, arg) for every non-NUL character.
typedef struct {
void (*fct)(char character, void* arg);  // callback receiving one output character at a time
void* arg;  // opaque pointer forwarded to 'fct' unchanged
} out_fct_wrap_type;
// Buffer sink: stores 'character' at position 'idx' of the caller's char array.
// Positions at or beyond 'maxlen' are silently dropped, so the array is never overrun.
static inline void _out_buffer(char character, void* buffer, size_t idx, size_t maxlen)
{
  if (idx >= maxlen) {
    return;
  }
  char* const dst = (char*)buffer;
  dst[idx] = character;
}
// Null sink: discards every character. _vsnprintf switches to it when no buffer is supplied.
static inline void _out_null(char character, void* buffer, size_t idx, size_t maxlen)
{
  // nothing is written; each parameter is referenced once to silence unused-parameter warnings
  (void)maxlen;
  (void)idx;
  (void)buffer;
  (void)character;
}
// Console sink: forwards each character to vx_putchar().
// The buffer, position and limit arguments are ignored; a NUL character is never forwarded.
static inline void _out_char(char character, void* buffer, size_t idx, size_t maxlen)
{
  (void)buffer;
  (void)idx;
  (void)maxlen;
  if (character == '\0') {
    return;
  }
  vx_putchar(character);
}
// internal output function wrapper
static inline void _out_fct(char character, void* buffer, size_t idx, size_t maxlen)
{
(void)idx; (void)maxlen;
if (character) {
// buffer is the output fct pointer
((out_fct_wrap_type*)buffer)->fct(character, ((out_fct_wrap_type*)buffer)->arg);
}
}
// internal secure strlen
// \param str     Start of the character sequence to measure
// \param maxsize Maximum number of characters that may be examined
// \return The length of the string (excluding the terminating 0) limited by 'maxsize'
static inline unsigned int _strnlen_s(const char* str, size_t maxsize)
{
  const char* s = str;
  // Check the remaining budget BEFORE dereferencing: a buffer of exactly 'maxsize'
  // bytes that carries no terminator must not be read past its last byte.
  while (maxsize-- && *s) {
    ++s;
  }
  return (unsigned int)(s - str);
}
// Tells whether 'ch' is one of the ASCII decimal digits '0'..'9'.
// \return true for a digit, false for any other character
static inline bool _is_digit(char ch)
{
  const bool outside_range = (ch < '0') || (ch > '9');
  return !outside_range;
}
// Parses the run of decimal digits at **str into an unsigned int.
// On return *str points at the first character that is not a digit.
// No overflow check is performed; the accumulation is plain unsigned arithmetic.
static unsigned int _atoi(const char** str)
{
  unsigned int result = 0U;
  const char* cursor = *str;
  for (; _is_digit(*cursor); ++cursor) {
    result = result * 10U + (unsigned int)(*cursor - '0');
  }
  *str = cursor;
  return result;
}
// Emits the 'len' characters of 'buf' in reverse order through 'out', adding space padding.
// Leading spaces are written only when neither FLAGS_LEFT nor FLAGS_ZEROPAD is set;
// trailing spaces are written when FLAGS_LEFT is set. Both pad up to 'width' characters.
// \return the output index following the last character written
static size_t _out_rev(out_fct_type out, char* buffer, size_t idx, size_t maxlen, const char* buf, size_t len, unsigned int width, unsigned int flags)
{
  const size_t first = idx;
  const bool left_align = (flags & FLAGS_LEFT) != 0U;

  // right alignment without zero padding: spaces go in front
  if (!left_align && !(flags & FLAGS_ZEROPAD)) {
    size_t pad = len;
    while (pad < width) {
      out(' ', buffer, idx++, maxlen);
      pad++;
    }
  }

  // the conversion routines build their digits back to front, so walk 'buf' from its end
  for (size_t remaining = len; remaining > 0U; ) {
    --remaining;
    out(buf[remaining], buffer, idx++, maxlen);
  }

  // left alignment: fill the rest of the field with spaces
  if (left_align) {
    for (; idx - first < width; ++idx) {
      out(' ', buffer, idx, maxlen);
    }
  }

  return idx;
}
// internal itoa format
// Finishes an integer conversion. 'buf' holds 'len' digits in reverse order (least
// significant first); this routine appends zero padding, the base prefix and the sign
// to that reversed string and then emits it through _out_rev.
// \param out      output sink
// \param buffer   destination handed to the sink
// \param idx      current output index
// \param maxlen   capacity limit handed to the sink
// \param buf      scratch buffer of PRINTF_NTOA_BUFFER_SIZE chars holding the reversed digits
// \param len      number of chars already in 'buf'
// \param negative true when a '-' sign has to be emitted
// \param base     numeric base; 16 and 2 get an 'x'/'X' or 'b' prefix letter under FLAGS_HASH
// \param prec     minimum number of digits
// \param width    minimum field width
// \param flags    FLAGS_* bit mask
// \return the output index after the formatted number
static size_t _ntoa_format(out_fct_type out, char* buffer, size_t idx, size_t maxlen, char* buf, size_t len, bool negative, unsigned int base, unsigned int prec, unsigned int width, unsigned int flags)
{
// pad leading zeros
if (!(flags & FLAGS_LEFT)) {
// with zero padding, a sign character takes one column of the field
if (width && (flags & FLAGS_ZEROPAD) && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
width--;
}
// zeros required by the precision
while ((len < prec) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = '0';
}
// zeros required by the '0' flag to fill the field width
while ((flags & FLAGS_ZEROPAD) && (len < width) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = '0';
}
}
// handle hash
if (flags & FLAGS_HASH) {
// when the zeros above filled the field exactly, drop one of them (two for hex) again --
// presumably so that the prefix appended below fits inside the requested width
if (!(flags & FLAGS_PRECISION) && len && ((len == prec) || (len == width))) {
len--;
if (len && (base == 16U)) {
len--;
}
}
// 'buf' is reversed, so the prefix letter is appended before the leading '0'
if ((base == 16U) && !(flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'x';
}
else if ((base == 16U) && (flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'X';
}
else if ((base == 2U) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'b';
}
// every hash-prefixed base (including octal) gets a leading '0'
if (len < PRINTF_NTOA_BUFFER_SIZE) {
buf[len++] = '0';
}
}
// sign character; skipped entirely when the scratch buffer is already full
if (len < PRINTF_NTOA_BUFFER_SIZE) {
if (negative) {
buf[len++] = '-';
}
else if (flags & FLAGS_PLUS) {
buf[len++] = '+'; // ignore the space if the '+' exists
}
else if (flags & FLAGS_SPACE) {
buf[len++] = ' ';
}
}
// emit the reversed string together with any space padding
return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
}
// Converts an unsigned long magnitude to text in the given base and hands the
// reversed digit string to _ntoa_format for padding, prefix and sign handling.
// 'negative' tells _ntoa_format whether a '-' has to be written.
// \return the output index after the formatted number
static size_t _ntoa_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long value, bool negative, unsigned long base, unsigned int prec, unsigned int width, unsigned int flags)
{
  char digits[PRINTF_NTOA_BUFFER_SIZE];
  size_t count = 0U;

  // a zero value never gets a base prefix
  if (value == 0UL) {
    flags &= ~FLAGS_HASH;
  }

  // an explicit precision combined with a zero value produces no digits at all
  const bool emit_digits = (value != 0UL) || !(flags & FLAGS_PRECISION);
  if (emit_digits) {
    const char alpha = (flags & FLAGS_UPPERCASE) ? 'A' : 'a';
    do {
      const char d = (char)(value % base);
      value /= base;
      if (d < 10) {
        digits[count] = (char)('0' + d);
      }
      else {
        digits[count] = (char)(alpha + d - 10);
      }
      ++count;
    } while ((value != 0UL) && (count < PRINTF_NTOA_BUFFER_SIZE));
  }

  return _ntoa_format(out, buffer, idx, maxlen, digits, count, negative, (unsigned int)base, prec, width, flags);
}
// 'long long' counterpart of _ntoa_long: converts an unsigned long long magnitude to
// reversed digits in the given base and lets _ntoa_format finish the formatting.
// Only compiled when PRINTF_SUPPORT_LONG_LONG is defined.
#if defined(PRINTF_SUPPORT_LONG_LONG)
static size_t _ntoa_long_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long long value, bool negative, unsigned long long base, unsigned int prec, unsigned int width, unsigned int flags)
{
  char scratch[PRINTF_NTOA_BUFFER_SIZE];
  size_t used = 0U;

  // a zero value never gets a base prefix
  if (value == 0ULL) {
    flags &= ~FLAGS_HASH;
  }

  // digits are skipped only for the "explicit precision and zero value" combination
  if ((value != 0ULL) || !(flags & FLAGS_PRECISION)) {
    const char letter_base = (flags & FLAGS_UPPERCASE) ? 'A' : 'a';
    do {
      const char d = (char)(value % base);
      value /= base;
      scratch[used++] = (d < 10) ? (char)('0' + d) : (char)(letter_base + d - 10);
    } while ((value != 0ULL) && (used < PRINTF_NTOA_BUFFER_SIZE));
  }

  return _ntoa_format(out, buffer, idx, maxlen, scratch, used, negative, (unsigned int)base, prec, width, flags);
}
#endif // PRINTF_SUPPORT_LONG_LONG
#if defined(PRINTF_SUPPORT_FLOAT)
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
// forward declaration so that _ftoa can switch to exp notation for values > PRINTF_MAX_FLOAT
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags);
#endif
// internal ftoa for fixed decimal floating point
// Formats 'value' as [sign]whole[.fraction] into a reversed scratch buffer and emits it
// through _out_rev.
// \param out    output sink
// \param buffer destination handed to the sink
// \param idx    current output index
// \param maxlen capacity limit handed to the sink
// \param value  number to print
// \param prec   digits after the decimal point (used only when FLAGS_PRECISION is set,
//               otherwise PRINTF_DEFAULT_FLOAT_PRECISION applies)
// \param width  minimum field width
// \param flags  FLAGS_* bit mask
// \return the output index after the formatted number
static size_t _ftoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
{
  char buf[PRINTF_FTOA_BUFFER_SIZE];
  size_t len = 0U;
  double diff = 0.0;

  // powers of 10
  static const double pow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 };

  // test for special values (the literals are spelled backwards because _out_rev reverses them)
  if (value != value)
    return _out_rev(out, buffer, idx, maxlen, "nan", 3, width, flags);
  if (value < -DBL_MAX)
    return _out_rev(out, buffer, idx, maxlen, "fni-", 4, width, flags);
  if (value > DBL_MAX)
    return _out_rev(out, buffer, idx, maxlen, (flags & FLAGS_PLUS) ? "fni+" : "fni", (flags & FLAGS_PLUS) ? 4U : 3U, width, flags);

  // test for very large values
  // standard printf behavior is to print EVERY whole number digit -- which could be 100s of characters overflowing your buffers == bad
  if ((value > PRINTF_MAX_FLOAT) || (value < -PRINTF_MAX_FLOAT)) {
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
    return _etoa(out, buffer, idx, maxlen, value, prec, width, flags);
#else
    // nothing can be printed; hand back the unchanged index so the caller's output
    // position is preserved (returning 0 would rewind it to the start of the buffer)
    return idx;
#endif
  }

  // test for negative
  bool negative = false;
  if (value < 0) {
    negative = true;
    value = 0 - value;
  }

  // set default precision, if not set explicitly
  if (!(flags & FLAGS_PRECISION)) {
    prec = PRINTF_DEFAULT_FLOAT_PRECISION;
  }
  // limit precision to 9, cause a prec >= 10 can lead to overflow errors
  // (the surplus digits are emitted as trailing zeros)
  while ((len < PRINTF_FTOA_BUFFER_SIZE) && (prec > 9U)) {
    buf[len++] = '0';
    prec--;
  }

  // split into whole part and fraction scaled to 'prec' digits
  int whole = (int)value;
  double tmp = (value - whole) * pow10[prec];
  unsigned long frac = (unsigned long)tmp;
  diff = tmp - frac;

  if (diff > 0.5) {
    ++frac;
    // handle rollover, e.g. case 0.99 with prec 1 is 1.0
    if (frac >= pow10[prec]) {
      frac = 0;
      ++whole;
    }
  }
  else if (diff < 0.5) {
    // below the halfway point: keep the truncated fraction
  }
  else if ((frac == 0U) || (frac & 1U)) {
    // if halfway, round up if odd OR if last digit is 0
    ++frac;
  }

  if (prec == 0U) {
    diff = value - (double)whole;
    if ((!(diff < 0.5) || (diff > 0.5)) && (whole & 1)) {
      // exactly 0.5 and ODD, then round up
      // 1.5 -> 2, but 2.5 -> 2
      ++whole;
    }
  }
  else {
    unsigned int count = prec;
    // now do fractional part, as an unsigned number
    while (len < PRINTF_FTOA_BUFFER_SIZE) {
      --count;
      buf[len++] = (char)(48U + (frac % 10U));
      if (!(frac /= 10U)) {
        break;
      }
    }
    // add extra 0s
    while ((len < PRINTF_FTOA_BUFFER_SIZE) && (count-- > 0U)) {
      buf[len++] = '0';
    }
    if (len < PRINTF_FTOA_BUFFER_SIZE) {
      // add decimal
      buf[len++] = '.';
    }
  }

  // do whole part, number is reversed
  while (len < PRINTF_FTOA_BUFFER_SIZE) {
    buf[len++] = (char)(48 + (whole % 10));
    if (!(whole /= 10)) {
      break;
    }
  }

  // pad leading zeros
  if (!(flags & FLAGS_LEFT) && (flags & FLAGS_ZEROPAD)) {
    // a sign character takes one column of the field
    if (width && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
      width--;
    }
    while ((len < width) && (len < PRINTF_FTOA_BUFFER_SIZE)) {
      buf[len++] = '0';
    }
  }

  // sign character; skipped when the scratch buffer is already full
  if (len < PRINTF_FTOA_BUFFER_SIZE) {
    if (negative) {
      buf[len++] = '-';
    }
    else if (flags & FLAGS_PLUS) {
      buf[len++] = '+';  // ignore the space if the '+' exists
    }
    else if (flags & FLAGS_SPACE) {
      buf[len++] = ' ';
    }
  }

  return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
}
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
// internal ftoa variant for exponential floating-point type, contributed by Martijn Jasperse <m.jasperse@gmail.com>
// Prints 'value' as mantissa plus "e+XX"/"E+XX" exponent. With FLAGS_ADAPT_EXP set
// (the %g/%G conversions) values in [1e-4, 1e6) and exact zero are printed in fixed
// notation instead, with 'prec' interpreted as a number of significant figures.
// \param out    output sink
// \param buffer destination handed to the sink
// \param idx    current output index
// \param maxlen capacity limit handed to the sink
// \param value  number to print
// \param prec   precision (used only when FLAGS_PRECISION is set)
// \param width  minimum field width of mantissa and exponent together
// \param flags  FLAGS_* bit mask
// \return the output index after the formatted number
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
{
  // check for NaN and special values
  if ((value != value) || (value > DBL_MAX) || (value < -DBL_MAX)) {
    return _ftoa(out, buffer, idx, maxlen, value, prec, width, flags);
  }

  // determine the sign
  const bool negative = value < 0;
  if (negative) {
    value = -value;
  }

  // default precision
  if (!(flags & FLAGS_PRECISION)) {
    prec = PRINTF_DEFAULT_FLOAT_PRECISION;
  }

  // An exact zero has no meaningful binary exponent: feeding it to the log10
  // approximation below would yield an exponent of -308. Its decimal exponent is 0.
  const bool is_zero = (value == 0.0);

  // determine the decimal exponent
  // based on the algorithm by David Gay (https://www.ampl.com/netlib/fp/dtoa.c)
  union {
    uint64_t U;
    double   F;
  } conv;
  int expval = 0;
  conv.F = 1.0;  // scale factor 10^expval; stays 1 for zero

  if (!is_zero) {
    conv.F = value;
    int exp2 = (int)((conv.U >> 52U) & 0x07FFU) - 1023;           // effectively log2
    conv.U = (conv.U & ((1ULL << 52U) - 1U)) | (1023ULL << 52U);  // drop the exponent so conv.F is now in [1,2)
    // now approximate log10 from the log2 integer part and an expansion of ln around 1.5
    expval = (int)(0.1760912590558 + exp2 * 0.301029995663981 + (conv.F - 1.5) * 0.289529654602168);
    // now we want to compute 10^expval but we want to be sure it won't overflow
    exp2 = (int)(expval * 3.321928094887362 + 0.5);
    const double z  = expval * 2.302585092994046 - exp2 * 0.6931471805599453;
    const double z2 = z * z;
    conv.U = (uint64_t)(exp2 + 1023) << 52U;
    // compute exp(z) using continued fractions, see https://en.wikipedia.org/wiki/Exponential_function#Continued_fractions_for_ex
    conv.F *= 1 + 2 * z / (2 - z + (z2 / (6 + (z2 / (10 + z2 / 14)))));
    // correct for rounding errors
    if (value < conv.F) {
      expval--;
      conv.F /= 10;
    }
  }

  // the exponent format is "%+03d" and largest value is "307", so set aside 4-5 characters
  unsigned int minwidth = ((expval < 100) && (expval > -100)) ? 4U : 5U;

  // in "%g" mode, "prec" is the number of *significant figures* not decimals
  if (flags & FLAGS_ADAPT_EXP) {
    // do we want to fall-back to "%f" mode? (zero always does)
    if (is_zero || ((value >= 1e-4) && (value < 1e6))) {
      if ((int)prec > expval) {
        prec = (unsigned)((int)prec - expval - 1);
      }
      else {
        prec = 0;
      }
      flags |= FLAGS_PRECISION;   // make sure _ftoa respects precision
      // no characters in exponent
      minwidth = 0U;
      expval   = 0;
    }
    else {
      // we use one sigfig for the whole part
      if ((prec > 0) && (flags & FLAGS_PRECISION)) {
        --prec;
      }
    }
  }

  // will everything fit?
  unsigned int fwidth = width;
  if (width > minwidth) {
    // we didn't fall-back so subtract the characters required for the exponent
    fwidth -= minwidth;
  } else {
    // not enough characters, so go back to default sizing
    fwidth = 0U;
  }
  if ((flags & FLAGS_LEFT) && minwidth) {
    // if we're padding on the right, DON'T pad the floating part
    fwidth = 0U;
  }

  // rescale the float value
  if (expval) {
    value /= conv.F;
  }

  // output the floating part
  const size_t start_idx = idx;
  idx = _ftoa(out, buffer, idx, maxlen, negative ? -value : value, prec, fwidth, flags & ~FLAGS_ADAPT_EXP);

  // output the exponent part
  if (minwidth) {
    // output the exponential symbol
    out((flags & FLAGS_UPPERCASE) ? 'E' : 'e', buffer, idx++, maxlen);
    // output the exponent value
    idx = _ntoa_long(out, buffer, idx, maxlen, (expval < 0) ? -expval : expval, expval < 0, 10, 0, minwidth-1, FLAGS_ZEROPAD | FLAGS_PLUS);
    // might need to right-pad spaces
    if (flags & FLAGS_LEFT) {
      while (idx - start_idx < width) out(' ', buffer, idx++, maxlen);
    }
  }
  return idx;
}
#endif  // PRINTF_SUPPORT_EXPONENTIAL
#endif // PRINTF_SUPPORT_FLOAT
// internal vsnprintf
// Core formatter shared by all tiny_* entry points. Walks 'format', copies literal
// characters to the sink and expands every %[flags][width][.precision][length]specifier
// with the next argument(s) from 'va'.
// \param out    output sink; replaced by _out_null when 'buffer' is NULL
// \param buffer destination handed to the sink
// \param maxlen capacity limit handed to the sink (including the terminating 0)
// \param format format string
// \param va     argument list
// \return number of characters produced, not counting the terminating 0
static int _vsnprintf(out_fct_type out, char* buffer, const size_t maxlen, const char* format, va_list va) {
  unsigned int flags, width, precision, n;
  size_t idx = 0U;

  if (!buffer) {
    // use null output function
    out = _out_null;
  }

  while (*format)
  {
    // format specifier?  %[flags][width][.precision][length]
    if (*format != '%') {
      // no
      out(*format, buffer, idx++, maxlen);
      format++;
      continue;
    }
    else {
      // yes, evaluate it
      format++;
    }

    // evaluate flags
    flags = 0U;
    do {
      switch (*format) {
        case '0': flags |= FLAGS_ZEROPAD; format++; n = 1U; break;
        case '-': flags |= FLAGS_LEFT;    format++; n = 1U; break;
        case '+': flags |= FLAGS_PLUS;    format++; n = 1U; break;
        case ' ': flags |= FLAGS_SPACE;   format++; n = 1U; break;
        case '#': flags |= FLAGS_HASH;    format++; n = 1U; break;
        default :                                   n = 0U; break;
      }
    } while (n);

    // evaluate width field
    width = 0U;
    if (_is_digit(*format)) {
      width = _atoi(&format);
    }
    else if (*format == '*') {
      const int w = va_arg(va, int);
      if (w < 0) {
        flags |= FLAGS_LEFT;    // reverse padding
        width = (unsigned int)-w;
      }
      else {
        width = (unsigned int)w;
      }
      format++;
    }

    // evaluate precision field
    precision = 0U;
    if (*format == '.') {
      flags |= FLAGS_PRECISION;
      format++;
      if (_is_digit(*format)) {
        precision = _atoi(&format);
      }
      else if (*format == '*') {
        const int prec = (int)va_arg(va, int);
        precision = prec > 0 ? (unsigned int)prec : 0U;
        format++;
      }
    }

    // evaluate length field
    switch (*format) {
      case 'l' :
        flags |= FLAGS_LONG;
        format++;
        if (*format == 'l') {
          flags |= FLAGS_LONG_LONG;
          format++;
        }
        break;
      case 'h' :
        flags |= FLAGS_SHORT;
        format++;
        if (*format == 'h') {
          flags |= FLAGS_CHAR;
          format++;
        }
        break;
#if defined(PRINTF_SUPPORT_PTRDIFF_T)
      case 't' :
        flags |= (sizeof(ptrdiff_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
        format++;
        break;
#endif
      case 'j' :
        flags |= (sizeof(intmax_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
        format++;
        break;
      case 'z' :
        flags |= (sizeof(size_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
        format++;
        break;
      default :
        break;
    }

    // evaluate specifier
    switch (*format) {
      case 'd' :
      case 'i' :
      case 'u' :
      case 'x' :
      case 'X' :
      case 'o' :
      case 'b' : {
        // set the base
        unsigned int base;
        if (*format == 'x' || *format == 'X') {
          base = 16U;
        }
        else if (*format == 'o') {
          base =  8U;
        }
        else if (*format == 'b') {
          base =  2U;
        }
        else {
          base = 10U;
          flags &= ~FLAGS_HASH;   // no hash for dec format
        }
        // uppercase
        if (*format == 'X') {
          flags |= FLAGS_UPPERCASE;
        }

        // no plus or space flag for u, x, X, o, b
        if ((*format != 'i') && (*format != 'd')) {
          flags &= ~(FLAGS_PLUS | FLAGS_SPACE);
        }

        // ignore '0' flag when precision is given
        if (flags & FLAGS_PRECISION) {
          flags &= ~FLAGS_ZEROPAD;
        }

        // convert the integer
        if ((*format == 'i') || (*format == 'd')) {
          // signed
          // The magnitude is computed in unsigned arithmetic: negating the most negative
          // value as a signed quantity would overflow, which is undefined behaviour.
          if (flags & FLAGS_LONG_LONG) {
#if defined(PRINTF_SUPPORT_LONG_LONG)
            const long long value = va_arg(va, long long);
            const unsigned long long magnitude = (value < 0) ? (0ULL - (unsigned long long)value) : (unsigned long long)value;
            idx = _ntoa_long_long(out, buffer, idx, maxlen, magnitude, value < 0, base, precision, width, flags);
#endif
          }
          else if (flags & FLAGS_LONG) {
            const long value = va_arg(va, long);
            const unsigned long magnitude = (value < 0) ? (0UL - (unsigned long)value) : (unsigned long)value;
            idx = _ntoa_long(out, buffer, idx, maxlen, magnitude, value < 0, base, precision, width, flags);
          }
          else {
            const int value = (flags & FLAGS_CHAR) ? (char)va_arg(va, int) : (flags & FLAGS_SHORT) ? (short int)va_arg(va, int) : va_arg(va, int);
            const unsigned int magnitude = (value < 0) ? (0U - (unsigned int)value) : (unsigned int)value;
            idx = _ntoa_long(out, buffer, idx, maxlen, magnitude, value < 0, base, precision, width, flags);
          }
        }
        else {
          // unsigned
          if (flags & FLAGS_LONG_LONG) {
#if defined(PRINTF_SUPPORT_LONG_LONG)
            idx = _ntoa_long_long(out, buffer, idx, maxlen, va_arg(va, unsigned long long), false, base, precision, width, flags);
#endif
          }
          else if (flags & FLAGS_LONG) {
            idx = _ntoa_long(out, buffer, idx, maxlen, va_arg(va, unsigned long), false, base, precision, width, flags);
          }
          else {
            const unsigned int value = (flags & FLAGS_CHAR) ? (unsigned char)va_arg(va, unsigned int) : (flags & FLAGS_SHORT) ? (unsigned short int)va_arg(va, unsigned int) : va_arg(va, unsigned int);
            idx = _ntoa_long(out, buffer, idx, maxlen, value, false, base, precision, width, flags);
          }
        }
        format++;
        break;
      }
#if defined(PRINTF_SUPPORT_FLOAT)
      case 'f' :
      case 'F' :
        if (*format == 'F') flags |= FLAGS_UPPERCASE;
        idx = _ftoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
        format++;
        break;
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
      case 'e':
      case 'E':
      case 'g':
      case 'G':
        if ((*format == 'g')||(*format == 'G')) flags |= FLAGS_ADAPT_EXP;
        if ((*format == 'E')||(*format == 'G')) flags |= FLAGS_UPPERCASE;
        idx = _etoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
        format++;
        break;
#endif  // PRINTF_SUPPORT_EXPONENTIAL
#endif  // PRINTF_SUPPORT_FLOAT
      case 'c' : {
        unsigned int l = 1U;
        // pre padding
        if (!(flags & FLAGS_LEFT)) {
          while (l++ < width) {
            out(' ', buffer, idx++, maxlen);
          }
        }
        // char output
        out((char)va_arg(va, int), buffer, idx++, maxlen);
        // post padding
        if (flags & FLAGS_LEFT) {
          while (l++ < width) {
            out(' ', buffer, idx++, maxlen);
          }
        }
        format++;
        break;
      }

      case 's' : {
        const char* p = va_arg(va, char*);
        // With an explicit precision at most 'precision' characters may be examined
        // (this includes a precision of 0), so an array without terminator is not over-read.
        unsigned int l = _strnlen_s(p, (flags & FLAGS_PRECISION) ? precision : (size_t)-1);
        // pre padding
        if (flags & FLAGS_PRECISION) {
          l = (l < precision ? l : precision);
        }
        if (!(flags & FLAGS_LEFT)) {
          while (l++ < width) {
            out(' ', buffer, idx++, maxlen);
          }
        }
        // string output; the precision budget is checked BEFORE the character is read
        while ((!(flags & FLAGS_PRECISION) || precision--) && (*p != 0)) {
          out(*(p++), buffer, idx++, maxlen);
        }
        // post padding
        if (flags & FLAGS_LEFT) {
          while (l++ < width) {
            out(' ', buffer, idx++, maxlen);
          }
        }
        format++;
        break;
      }

      case 'p' : {
        // pointers are printed as zero-padded uppercase hex, two digits per byte
        width = sizeof(void*) * 2U;
        flags |= FLAGS_ZEROPAD | FLAGS_UPPERCASE;
#if defined(PRINTF_SUPPORT_LONG_LONG)
        const bool is_ll = sizeof(uintptr_t) == sizeof(long long);
        if (is_ll) {
          idx = _ntoa_long_long(out, buffer, idx, maxlen, (uintptr_t)va_arg(va, void*), false, 16U, precision, width, flags);
        }
        else {
#endif
          idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)((uintptr_t)va_arg(va, void*)), false, 16U, precision, width, flags);
#if defined(PRINTF_SUPPORT_LONG_LONG)
        }
#endif
        format++;
        break;
      }

      case '%' :
        out('%', buffer, idx++, maxlen);
        format++;
        break;

      default :
        // unknown specifier: echo the character itself
        out(*format, buffer, idx++, maxlen);
        format++;
        break;
    }
  }

  // termination (clamped to the last valid position when the output was truncated)
  out((char)0, buffer, idx < maxlen ? idx : maxlen - 1U, maxlen);

  // return written chars without terminating \0
  return (int)idx;
}
// printf replacement: formats the variadic arguments and sends every character to the
// _out_char sink. Returns the value reported by _vsnprintf.
int tiny_printf(const char* format, ...) {
  char placeholder[1];  // non-NULL dummy so that _vsnprintf keeps the _out_char sink
  va_list args;
  va_start(args, format);
  const int written = _vsnprintf(_out_char, placeholder, (size_t)-1, format, args);
  va_end(args);
  return written;
}
// sprintf replacement: formats into 'buffer' with no size limit ((size_t)-1 is passed
// as capacity), so the caller must provide a buffer that is large enough.
int tiny_sprintf(char* buffer, const char* format, ...) {
  va_list args;
  int written;
  va_start(args, format);
  written = _vsnprintf(_out_buffer, buffer, (size_t)-1, format, args);
  va_end(args);
  return written;
}
// snprintf replacement: formats into 'buffer', storing at most 'count' characters
// including the terminating 0. Returns the value reported by _vsnprintf.
int tiny_snprintf(char* buffer, size_t count, const char* format, ...) {
  va_list args;
  int written;
  va_start(args, format);
  written = _vsnprintf(_out_buffer, buffer, count, format, args);
  va_end(args);
  return written;
}
// vprintf replacement: like tiny_printf, but takes an already started va_list.
int tiny_vprintf(const char* format, va_list va) {
  char placeholder[1];  // non-NULL dummy so that _vsnprintf keeps the _out_char sink
  const int written = _vsnprintf(_out_char, placeholder, (size_t)-1, format, va);
  return written;
}
// vsnprintf replacement: like tiny_snprintf, but takes an already started va_list.
int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va) {
  const int written = _vsnprintf(_out_buffer, buffer, count, format, va);
  return written;
}

86
runtime/src/tinyprintf.h Normal file
View File

@@ -0,0 +1,86 @@
///////////////////////////////////////////////////////////////////////////////
// \author (c) Marco Paland (info@paland.com)
// 2014-2019, PALANDesign Hannover, Germany
//
// \license The MIT License (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// \brief Tiny printf, sprintf and snprintf implementation, optimized for speed on
// embedded systems with very limited resources.
// Use this instead of bloated standard/newlib printf.
// These routines are thread safe and reentrant.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef _TINYPRINTF_H_
#define _TINYPRINTF_H_
#include <stdarg.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* Tiny printf implementation
* Output is emitted one character at a time through vx_putchar(), which must be provided elsewhere in the runtime
* To avoid conflicts with the regular printf() API, every function carries a tiny_ prefix
* (tiny_printf(), tiny_snprintf(), ...) instead of overriding the standard names
* \param format A string that specifies the format of the output
* \return The number of characters that are written into the array, not counting the terminating null character
*/
int tiny_printf(const char* format, ...);
/**
* Tiny sprintf implementation
* Due to security reasons (buffer overflow) YOU SHOULD CONSIDER USING (V)SNPRINTF INSTEAD!
* \param buffer A pointer to the buffer where to store the formatted string. MUST be big enough to store the output!
* \param format A string that specifies the format of the output
* \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
*/
int tiny_sprintf(char* buffer, const char* format, ...);
/**
* Tiny snprintf/vsnprintf implementation
* \param buffer A pointer to the buffer where to store the formatted string
* \param count The maximum number of characters to store in the buffer, including a terminating null character
* \param format A string that specifies the format of the output
* \param va A value identifying a variable arguments list
* \return The number of characters that COULD have been written into the buffer, not counting the terminating
* null character. A value equal or larger than count indicates truncation. Only when the returned value
* is non-negative and less than count, the string has been completely written.
*/
int tiny_snprintf(char* buffer, size_t count, const char* format, ...);
int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va);
/**
* Tiny vprintf implementation
* \param format A string that specifies the format of the output
* \param va A value identifying a variable arguments list
* \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
*/
int tiny_vprintf(const char* format, va_list va);
#ifdef __cplusplus
}
#endif
#endif // _TINYPRINTF_H_

View File

@@ -4,10 +4,10 @@
#include <stdint.h> #include <stdint.h>
#define DUMP_CSR_4(d, s) \ #define DUMP_CSR_4(d, s) \
csr_mem[d + 0] = vx_csr_read(s + 0); \ csr_mem[d + 0] = csr_read(s + 0); \
csr_mem[d + 1] = vx_csr_read(s + 1); \ csr_mem[d + 1] = csr_read(s + 1); \
csr_mem[d + 2] = vx_csr_read(s + 2); \ csr_mem[d + 2] = csr_read(s + 2); \
csr_mem[d + 3] = vx_csr_read(s + 3); csr_mem[d + 3] = csr_read(s + 3);
#define DUMP_CSR_32(d, s) \ #define DUMP_CSR_32(d, s) \
DUMP_CSR_4(d + 0, s + 0) \ DUMP_CSR_4(d + 0, s + 0) \

View File

@@ -4,7 +4,9 @@
#include <stdlib.h> #include <stdlib.h>
#include <stdbool.h> #include <stdbool.h>
#include <stdio.h> #include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include "tinyprintf.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@@ -26,46 +28,19 @@ typedef struct {
int precision; int precision;
} putfloat_arg_t; } putfloat_arg_t;
static void __printf_cb(printf_arg_t* arg) { static void __putint_cb(const putint_arg_t* arg) {
arg->ret = vprintf(arg->format, *arg->va);
}
int vx_vprintf(const char* format, va_list va) {
printf_arg_t arg;
arg.format = format;
arg.va = &va;
vx_serial((vx_serial_cb)__printf_cb, &arg);
return arg.ret;
}
int vx_printf(const char * format, ...) {
int ret;
va_list va;
va_start(va, format);
ret = vx_vprintf(format, va);
va_end(va);
return ret;
}
static void __putint_cb(const putint_arg_t* arg) {
char tmp[33]; char tmp[33];
float value = arg->value; float value = arg->value;
int base = arg->base; int base = arg->base;
itoa(value, tmp, base); itoa(value, tmp, base);
for (int i = 0; i < 33; ++i) { for (int i = 0; i < 33; ++i) {
int c = tmp[i]; int c = tmp[i];
if (!c) break; if (!c)
break;
vx_putchar(c); vx_putchar(c);
} }
} }
void vx_putint(int value, int base) {
putint_arg_t arg;
arg.value = value;
arg.base = base;
vx_serial((vx_serial_cb)__putint_cb, &arg);
}
static void __putfloat_cb(const putfloat_arg_t* arg) { static void __putfloat_cb(const putfloat_arg_t* arg) {
float value = arg->value; float value = arg->value;
int precision = arg->precision; int precision = arg->precision;
@@ -79,6 +54,17 @@ static void __putfloat_cb(const putfloat_arg_t* arg) {
} }
} }
static void __vprintf_cb(printf_arg_t* arg) {
arg->ret = tiny_vprintf(arg->format, *arg->va);
}
void vx_putint(int value, int base) {
putint_arg_t arg;
arg.value = value;
arg.base = base;
vx_serial((vx_serial_cb)__putint_cb, &arg);
}
void vx_putfloat(float value, int precision) { void vx_putfloat(float value, int precision) {
putfloat_arg_t arg; putfloat_arg_t arg;
arg.value = value; arg.value = value;
@@ -86,6 +72,23 @@ void vx_putfloat(float value, int precision) {
vx_serial((vx_serial_cb)__putfloat_cb, &arg); vx_serial((vx_serial_cb)__putfloat_cb, &arg);
} }
int vx_vprintf(const char* format, va_list va) {
printf_arg_t arg;
arg.format = format;
arg.va = &va;
vx_serial((vx_serial_cb)__vprintf_cb, &arg);
return arg.ret;
}
int vx_printf(const char * format, ...) {
int ret;
va_list va;
va_start(va, format);
ret = vx_vprintf(format, va);
va_end(va);
return ret;
}
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@@ -16,7 +16,10 @@ int _open(const char *name, int flags, int mode) { return -1; }
int _read(int file, char *ptr, int len) { return -1; } int _read(int file, char *ptr, int len) { return -1; }
caddr_t _sbrk(int incr) { return 0; } caddr_t _sbrk(int incr) {
__asm__ __volatile__("ebreak");
return 0;
}
int _write(int file, char *ptr, int len) { int _write(int file, char *ptr, int len) {
int i; int i;

79
sim/common/bitmanip.h Normal file
View File

@@ -0,0 +1,79 @@
#pragma once
#include <cstdint>
#include <algorithm>
#include <assert.h>
// Number of zero bits above the most significant set bit of 'value'.
// Returns 32 for zero, where __builtin_clz is not called.
constexpr uint32_t count_leading_zeros(uint32_t value) {
  return (value == 0) ? 32u : static_cast<uint32_t>(__builtin_clz(value));
}
// Number of zero bits below the least significant set bit of 'value'.
// Returns 32 for zero, where __builtin_ctz is not called.
constexpr uint32_t count_trailing_zeros(uint32_t value) {
  return (value == 0) ? 32u : static_cast<uint32_t>(__builtin_ctz(value));
}
// True when exactly one bit of 'value' is set, i.e. value is a power of two.
// Zero is not a power of two.
constexpr bool ispow2(uint32_t value) {
  return (value != 0) && ((value & (value - 1)) == 0);
}
// Base-2 logarithm rounded up, computed as the bit width of (value - 1).
// log2ceil(1) is 0. For value == 0 the subtraction wraps and the result is 32.
constexpr uint32_t log2ceil(uint32_t value) {
  return 32u - count_leading_zeros(value - 1u);
}
// Same as log2ceil, but never returns less than 1.
inline unsigned log2up(uint32_t value) {
  const uint32_t bits = log2ceil(value);
  return (bits < 1u) ? 1u : bits;
}
// Base-2 logarithm rounded down: the index of the most significant set bit.
// For value == 0 the unsigned subtraction wraps around (31 - 32).
constexpr unsigned log2floor(uint32_t value) {
  return 31u - count_leading_zeros(value);
}
// Number of bits needed to represent 'value' (32 minus its leading zero count); 0 for zero.
constexpr unsigned ceil2(uint32_t value) {
  return 32u - count_leading_zeros(value);
}
// Returns 'bits' with the bit at position 'index' (0..63) cleared.
inline uint64_t bit_clr(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t selected = 1ull << index;
  return bits & ~selected;
}
// Returns 'bits' with the bit at position 'index' (0..63) set.
inline uint64_t bit_set(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t selected = 1ull << index;
  return bits | selected;
}
// Returns the state of the bit at position 'index' (0..63) of 'bits'.
inline bool bit_get(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t shifted = bits >> index;
  return (shifted & 0x1) != 0;
}
// Returns 'bits' with the inclusive bit window [start, end] cleared (start <= end <= 63).
inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) {
  assert(end >= start);
  assert(end <= 63);
  const uint32_t top_gap = 63 - end;  // number of bits above the window
  // shift ones left to drop everything outside the window, then back down into place
  const uint64_t window = (~0ull << (top_gap + start)) >> top_gap;
  return bits & ~window;
}
// Returns 'bits' with the inclusive bit window [start, end] replaced by the low bits
// of 'value' (start <= end <= 63). Bits of 'value' that do not fit the window are dropped.
inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) {
  assert(end >= start);
  assert(end <= 63);
  const uint32_t top_gap = 63 - end;  // number of bits above the window
  // the left shift discards the bits of 'value' beyond the window width,
  // the right shift places the remainder at 'start'
  const uint64_t field = (value << (top_gap + start)) >> top_gap;
  const uint64_t cleared = bit_clrw(bits, start, end);
  return cleared | field;
}
// Extracts the inclusive bit window [start, end] of 'bits' (start <= end <= 63),
// returned right-aligned.
inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
  assert(end >= start);
  assert(end <= 63);
  const uint32_t top_gap = 63 - end;           // number of bits above the window
  const uint64_t top_aligned = bits << top_gap; // pushes those bits out
  return top_aligned >> (top_gap + start);
}
// Apply integer sign extension
// Treats the low 'width' bits of 'word' as a signed number and replicates its sign
// bit (bit width-1) into all higher bits of the 32-bit result.
// 'width' must be in the range 2..32; a width of 32 returns 'word' unchanged.
inline uint32_t sext32(uint32_t word, uint32_t width) {
  assert(width > 1);
  assert(width <= 32);
  if (width == 32) {
    // already full width; shifting by 32 below would be undefined behaviour
    return word;
  }
  // unsigned 1 so that width == 31 does not shift into the sign bit of an int
  const uint32_t mask = (uint32_t{1} << width) - 1;
  const bool negative = ((word >> (width - 1)) & 0x1) != 0;
  return negative ? (word | ~mask) : word;
}

419
sim/common/fixed.h Normal file
View File

@@ -0,0 +1,419 @@
#pragma once
#include <cstdint>
#include <cstdlib>
#include <assert.h>
template <uint32_t F, typename T = int32_t>
class Fixed {
private:
template <uint32_t F2, typename T2>
struct Cast {
private:
template <bool isF2Bigger, bool isT2Bigger> struct Tag {};
inline static T Convert(T2 value, Tag<false, false>) {
return static_cast<T>(value) << (F - F2);
}
inline static T Convert(T2 value, Tag<false, true>) {
return static_cast<T>(value) >> (F2 - F);
}
inline static T Convert(T2 value, Tag<true, false>) {
return static_cast<T>(value << (F - F2));
}
inline static T Convert(T2 value, Tag<true, true>) {
return static_cast<T>(value >> (F2 - F));
}
public:
inline static T Convert(T2 value) {
return Convert(value, Tag<(sizeof(T2) > sizeof(T)), (F2 > F)>{});
}
};
public:
using data_type = T;
static constexpr uint32_t FRAC = F;
static constexpr uint32_t INT = sizeof(T) * 8 - FRAC;
static constexpr uint32_t HFRAC = FRAC >> 1;
static constexpr T ONE = static_cast<T>(1) << FRAC;
static constexpr T MASK = ONE - 1;
static constexpr T IMASK = ~MASK;
static constexpr T HALF = ONE >> 1;
static constexpr T TWO = ONE << 1;
Fixed() {}
explicit Fixed(int64_t rhs)
: data_(static_cast<T>(rhs << FRAC)) {
assert((static_cast<int64_t>(rhs) << FRAC) == data_);
}
explicit Fixed(uint64_t rhs)
: data_(static_cast<T>(rhs << FRAC)) {
assert((static_cast<int64_t>(rhs) << FRAC) == data_);
}
explicit Fixed(int32_t rhs)
: data_(static_cast<T>(rhs << FRAC)) {
assert((static_cast<int64_t>(rhs) << FRAC) == data_);
}
explicit Fixed(uint32_t rhs)
: data_(static_cast<T>(rhs << FRAC)) {
assert((static_cast<int64_t>(rhs) << FRAC) == data_);
}
explicit Fixed(int16_t rhs)
: data_(static_cast<T>(rhs << FRAC)) {
assert((static_cast<int64_t>(rhs) << FRAC) == data_);
}
explicit Fixed(uint16_t rhs)
: data_(static_cast<T>(rhs << FRAC)) {
assert((static_cast<int64_t>(rhs) << FRAC) == data_);
}
explicit Fixed(int8_t rhs)
: data_(static_cast<T>(rhs << FRAC)) {
assert((static_cast<int64_t>(rhs) << FRAC) == data_);
}
explicit Fixed(uint8_t rhs)
: data_(static_cast<T>(rhs << FRAC)) {
assert((static_cast<int64_t>(rhs) << FRAC) == data_);
}
template <uint32_t F2, typename T2>
explicit Fixed(Fixed<F2, T2> rhs)
: data_(Cast<F2, T2>::Convert(rhs.data()))
{}
explicit Fixed(float rhs)
: data_(static_cast<T>(rhs * ONE)) {
assert(data_ == static_cast<T>(rhs * ONE));
}
bool operator==(Fixed rhs) const {
return (data_ == rhs.data_);
}
bool operator!=(Fixed rhs) const {
return (data_ != rhs.data_);
}
bool operator<(Fixed rhs) const {
return (data_ < rhs.data_);
}
bool operator<=(Fixed rhs) const {
return (data_ <= rhs.data_);
}
bool operator>(Fixed rhs) const {
return (data_ > rhs.data_);
}
bool operator>=(Fixed rhs) const {
return (data_ >= rhs.data_);
}
Fixed operator-() const {
return make(-data_);
}
// Compound assignment operators.
// Each one delegates to the matching binary operator and stores the result
// in *this. They return a reference to *this (the conventional signature for
// compound assignment) so that chained expressions such as (a += b) += c
// update `a` instead of a temporary copy.
Fixed& operator+=(Fixed rhs) {
  *this = (*this) + rhs;
  return *this;
}
Fixed& operator-=(Fixed rhs) {
  *this = (*this) - rhs;
  return *this;
}
Fixed& operator*=(Fixed rhs) {
  *this = (*this) * rhs;
  return *this;
}
Fixed& operator/=(Fixed rhs) {
  *this = (*this) / rhs;
  return *this;
}
// Mixed-format variants: the right-hand side may use another fractional
// width or storage type; the result stays in this format.
template <uint32_t F2, typename T2>
Fixed& operator*=(Fixed<F2, T2> rhs) {
  *this = (*this) * rhs;
  return *this;
}
template <uint32_t F2, typename T2>
Fixed& operator/=(Fixed<F2, T2> rhs) {
  *this = (*this) / rhs;
  return *this;
}
// Scalar variants (integer and float factors / divisors).
Fixed& operator*=(int32_t rhs) {
  *this = (*this) * rhs;
  return *this;
}
Fixed& operator*=(uint32_t rhs) {
  *this = (*this) * rhs;
  return *this;
}
Fixed& operator*=(float rhs) {
  *this = (*this) * rhs;
  return *this;
}
Fixed& operator/=(int32_t rhs) {
  *this = (*this) / rhs;
  return *this;
}
Fixed& operator/=(uint32_t rhs) {
  *this = (*this) / rhs;
  return *this;
}
Fixed& operator/=(float rhs) {
  *this = (*this) / rhs;
  return *this;
}
// Adds two values of the same format by adding their raw representations.
// The assert compares against a 64-bit sum to flag overflow in debug builds.
friend Fixed operator+(Fixed lhs, Fixed rhs) {
  const auto sum = lhs.data_ + rhs.data_;
  assert((static_cast<int64_t>(lhs.data_) + rhs.data_) == sum);
  return Fixed::make(sum);
}
// Subtracts two values of the same format; overflow is checked the same way.
friend Fixed operator-(Fixed lhs, Fixed rhs) {
  const auto difference = lhs.data_ - rhs.data_;
  assert((static_cast<int64_t>(lhs.data_) - rhs.data_) == difference);
  return Fixed::make(difference);
}
// Fixed x Fixed (same format): the 64-bit product carries 2*FRAC fractional
// bits, so it is shifted right by FRAC to return to this format.
friend Fixed operator*(Fixed lhs, Fixed rhs) {
  const auto product = static_cast<int64_t>(lhs.data_) * rhs.data_;
  return Fixed::make(product >> FRAC);
}
// Fixed x Fixed<F2, T2>: the product carries FRAC + F2 fractional bits;
// dropping the F2 bits contributed by rhs leaves the result in lhs's format.
template <uint32_t F2, typename T2>
friend Fixed operator*(Fixed lhs, Fixed<F2, T2> rhs) {
  const auto product = static_cast<int64_t>(lhs.data_) * rhs.data();
  return Fixed::make(product >> F2);
}
// Fixed / Fixed (same format): the dividend is pre-scaled by 2^FRAC so the
// quotient keeps FRAC fractional bits. The divisor must be non-zero.
friend Fixed operator/(Fixed lhs, Fixed rhs) {
  assert(rhs.data_ != 0);
  const auto dividend = static_cast<int64_t>(lhs.data_) << FRAC;
  return Fixed::make(dividend / rhs.data_);
}
// Fixed / Fixed<F2, T2>: the dividend is pre-scaled by rhs's fractional
// width so the quotient stays in lhs's format. The divisor must be non-zero.
template <uint32_t F2, typename T2>
friend Fixed operator/(Fixed lhs, Fixed<F2, T2> rhs) {
  assert(rhs.data() != 0);
  const auto dividend = static_cast<int64_t>(lhs.data_) << F2;
  return Fixed::make(dividend / rhs.data());
}
// Float interop: the Fixed operand is converted to float, the arithmetic is
// done in float, and the result is converted back to Fixed.
// The conversion back is spelled out because the Fixed(float) constructor is
// explicit; returning the bare float expression from a function that returns
// Fixed does not compile once one of these operators is instantiated.
friend Fixed operator*(Fixed lhs, float rhs) {
  return Fixed(static_cast<float>(lhs) * rhs);
}
friend Fixed operator*(float lhs, Fixed rhs) {
  return Fixed(lhs * static_cast<float>(rhs));
}
friend Fixed operator/(Fixed lhs, float rhs) {
  return Fixed(static_cast<float>(lhs) / rhs);
}
friend Fixed operator/(float lhs, Fixed rhs) {
  return Fixed(lhs / static_cast<float>(rhs));
}
// Small-integer interop (char, uint8_t, short, uint16_t).
// The integer operand is widened to int32_t and forwarded to the int32_t
// overloads. Multiplication is commutative, so `integer * Fixed` simply
// swaps its operands. Division is not: `integer / Fixed` first converts the
// integer to Fixed and then divides, instead of computing rhs / lhs.
// The integer must be representable in this fixed-point format (the Fixed
// constructor asserts this).
friend Fixed operator*(Fixed lhs, char rhs) {
  return lhs * static_cast<int32_t>(rhs);
}
friend Fixed operator*(char lhs, Fixed rhs) {
  return rhs * lhs;
}
friend Fixed operator/(Fixed lhs, char rhs) {
  return lhs / static_cast<int32_t>(rhs);
}
friend Fixed operator/(char lhs, Fixed rhs) {
  return Fixed(static_cast<int32_t>(lhs)) / rhs;
}
friend Fixed operator*(Fixed lhs, uint8_t rhs) {
  return lhs * static_cast<int32_t>(rhs);
}
friend Fixed operator*(uint8_t lhs, Fixed rhs) {
  return rhs * lhs;
}
friend Fixed operator/(Fixed lhs, uint8_t rhs) {
  return lhs / static_cast<int32_t>(rhs);
}
friend Fixed operator/(uint8_t lhs, Fixed rhs) {
  return Fixed(static_cast<int32_t>(lhs)) / rhs;
}
friend Fixed operator*(Fixed lhs, short rhs) {
  return lhs * static_cast<int32_t>(rhs);
}
friend Fixed operator*(short lhs, Fixed rhs) {
  return rhs * lhs;
}
friend Fixed operator/(Fixed lhs, short rhs) {
  return lhs / static_cast<int32_t>(rhs);
}
friend Fixed operator/(short lhs, Fixed rhs) {
  return Fixed(static_cast<int32_t>(lhs)) / rhs;
}
friend Fixed operator*(Fixed lhs, uint16_t rhs) {
  return lhs * static_cast<int32_t>(rhs);
}
friend Fixed operator*(uint16_t lhs, Fixed rhs) {
  return rhs * lhs;
}
friend Fixed operator/(Fixed lhs, uint16_t rhs) {
  return lhs / static_cast<int32_t>(rhs);
}
friend Fixed operator/(uint16_t lhs, Fixed rhs) {
  return Fixed(static_cast<int32_t>(lhs)) / rhs;
}
// Fixed * int32_t: scaling by an integer multiplies the raw value directly.
// The product is formed in 64 bits so the overflow check compares against
// the true result rather than an already-wrapped one.
friend Fixed operator*(Fixed lhs, int32_t rhs) {
  const int64_t wide = static_cast<int64_t>(lhs.data_) * rhs;
  const auto value = static_cast<T>(wide);
  assert(wide == value);
  return Fixed::make(value);
}
// int32_t * Fixed: multiplication is commutative, so swap the operands.
friend Fixed operator*(int32_t lhs, Fixed rhs) {
  return rhs * lhs;
}
// Fixed / int32_t: divides the raw value; the divisor must be non-zero.
friend Fixed operator/(Fixed lhs, int32_t rhs) {
  assert(rhs);
  auto value = static_cast<T>(lhs.data_ / rhs);
  return Fixed::make(value);
}
// int32_t / Fixed: division is not commutative, so the integer is converted
// to Fixed first and then divided by rhs (rather than computing rhs / lhs).
// The integer must be representable in this format (asserted by the ctor).
friend Fixed operator/(int32_t lhs, Fixed rhs) {
  return Fixed(lhs) / rhs;
}
// Fixed * uint32_t: multiplies the raw value by the integer factor.
// (This must be a multiplication, not a left shift: `x * 3u` is three times
// x, not x << 3.) The product is formed in signed 64-bit arithmetic so a
// negative raw value is not reinterpreted as unsigned, and the assert flags
// results that do not fit in T.
friend Fixed operator*(Fixed lhs, uint32_t rhs) {
  const int64_t wide = static_cast<int64_t>(lhs.data_) * static_cast<int64_t>(rhs);
  const auto value = static_cast<T>(wide);
  assert(wide == value);
  return Fixed::make(value);
}
// uint32_t * Fixed: multiplication is commutative, so swap the operands.
friend Fixed operator*(uint32_t lhs, Fixed rhs) {
  return rhs * lhs;
}
// Fixed / uint32_t: divides the raw value; the divisor must be non-zero.
// Both operands are widened to signed 64 bits first; dividing a signed raw
// value by an unsigned 32-bit divisor directly would convert a negative
// dividend to a large unsigned number.
friend Fixed operator/(Fixed lhs, uint32_t rhs) {
  assert(rhs);
  const auto value = static_cast<T>(static_cast<int64_t>(lhs.data_) / static_cast<int64_t>(rhs));
  return Fixed::make(value);
}
// uint32_t / Fixed: division is not commutative, so the integer is converted
// to Fixed first and then divided by rhs (rather than computing rhs / lhs).
// The integer must be representable in this format (asserted by the ctor).
friend Fixed operator/(uint32_t lhs, Fixed rhs) {
  return Fixed(lhs) / rhs;
}
// Bit-shift operators act on the raw value, i.e. they scale the number by a
// power of two.
// NOTE(review): in the left-shift asserts the shift *count* is widened to
// 64 bits, not the shifted value, so the comparison is still evaluated in
// the promoted type of data_ — confirm whether an overflow check on a
// widened value was intended.
friend Fixed operator<<(Fixed lhs, int32_t rhs) {
  const T shifted = static_cast<T>(lhs.data_ << rhs);
  assert((lhs.data_ << static_cast<int64_t>(rhs)) == shifted);
  return Fixed::make(shifted);
}
friend Fixed operator>>(Fixed lhs, int32_t rhs) {
  return Fixed::make(static_cast<T>(lhs.data_ >> rhs));
}
friend Fixed operator<<(Fixed lhs, uint32_t rhs) {
  const T shifted = static_cast<T>(lhs.data_ << rhs);
  assert((lhs.data_ << static_cast<int64_t>(rhs)) == shifted);
  return Fixed::make(shifted);
}
friend Fixed operator>>(Fixed lhs, uint32_t rhs) {
  return Fixed::make(static_cast<T>(lhs.data_ >> rhs));
}
// Builds a Fixed directly from a raw (already scaled) value; no shifting or
// range checking is applied.
static Fixed make(T value) {
Fixed ret;
ret.data_ = value;
return ret;
}
// Integer conversions: shift the raw value right by F to discard the
// fractional bits, then cast to the requested integer type.
explicit operator int64_t() const {
return static_cast<int64_t>(data_ >> F);
}
explicit operator uint64_t() const {
return static_cast<uint64_t>(data_ >> F);
}
explicit operator int32_t() const {
return static_cast<int32_t>(data_ >> F);
}
explicit operator uint32_t() const {
return static_cast<uint32_t>(data_ >> F);
}
explicit operator int16_t() const {
return static_cast<int16_t>(data_ >> F);
}
explicit operator uint16_t() const {
return static_cast<uint16_t>(data_ >> F);
}
explicit operator int8_t() const {
return static_cast<int8_t>(data_ >> F);
}
explicit operator uint8_t() const {
return static_cast<uint8_t>(data_ >> F);
}
// Conversion to another fixed-point format; delegates to the converting
// constructor of the target type.
template <uint32_t F2, typename T2>
explicit operator Fixed<F2, T2>() const {
return Fixed<F2, T2>(*this);
}
// Conversion to float: divides the raw value by 2^F.
explicit operator float() const {
return static_cast<float>(data_) / (static_cast<T>(1) << F);
}
// Returns the raw underlying value (including the fractional bits).
T data() const {
return data_;
}
private:
T data_;
};

View File

@@ -5,10 +5,9 @@
#include <memory> #include <memory>
#include <vector> #include <vector>
#include <list> #include <list>
#include <queue>
#include <assert.h> #include <assert.h>
namespace vortex {
class SimObjectBase; class SimObjectBase;
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@@ -59,32 +58,44 @@ protected:
template <typename Pkt> template <typename Pkt>
class SimPort : public SimPortBase { class SimPort : public SimPortBase {
public: public:
void send(const Pkt& pkt, uint64_t delay) const; void send(const Pkt& pkt, uint64_t delay) const;
bool read(Pkt* out) { void bind(SimPort<Pkt>* peer) {
if (!valid_) this->connect(peer);
return false;
*out = data_;
valid_ = false;
return true;
} }
void unbind() {
this->disconnect();
}
bool empty() const {
return queue_.empty();
}
const Pkt& top() const {
return queue_.front();
}
Pkt& top() {
return queue_.front();
}
void pop() {
queue_.pop();
}
protected: protected:
SimPort(SimObjectBase* module) SimPort(SimObjectBase* module)
: SimPortBase(module) : SimPortBase(module)
, valid_(false)
{} {}
void write(const Pkt& data) { void push(const Pkt& data) {
assert(!valid_); queue_.push(data);
data_ = data;
valid_ = true;
} }
SimPort& operator=(const SimPort&) = delete; SimPort& operator=(const SimPort&) = delete;
Pkt data_; std::queue<Pkt> queue_;
bool valid_;
template <typename U> friend class SimPortEvent; template <typename U> friend class SimPortEvent;
}; };
@@ -94,15 +105,7 @@ protected:
template <typename Pkt> template <typename Pkt>
class SlavePort : public SimPort<Pkt> { class SlavePort : public SimPort<Pkt> {
public: public:
SlavePort(SimObjectBase* module) : SimPort<Pkt>(module) {} SlavePort(SimObjectBase* module) : SimPort<Pkt>(module) {}
void bind(SlavePort<Pkt>* peer) {
this->connect(peer);
}
void unbind() {
this->disconnect();
}
protected: protected:
SlavePort& operator=(const SlavePort&) = delete; SlavePort& operator=(const SlavePort&) = delete;
@@ -115,18 +118,6 @@ class MasterPort : public SimPort<Pkt> {
public: public:
MasterPort(SimObjectBase* module) : SimPort<Pkt>(module) {} MasterPort(SimObjectBase* module) : SimPort<Pkt>(module) {}
void bind(SlavePort<Pkt>* peer) {
this->connect(peer);
}
void bind(MasterPort<Pkt>* peer) {
this->connect(peer);
}
void unbind() {
this->disconnect();
}
protected: protected:
MasterPort& operator=(const MasterPort&) = delete; MasterPort& operator=(const MasterPort&) = delete;
}; };
@@ -194,7 +185,7 @@ public:
{} {}
void fire() const override { void fire() const override {
const_cast<SimPort<Pkt>*>(port_)->write(pkt_); const_cast<SimPort<Pkt>*>(port_)->push(pkt_);
} }
private: private:
@@ -382,6 +373,4 @@ template <typename T, typename Pkt>
void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) { void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) {
auto callback = std::bind(entry, obj, std::placeholders::_1); auto callback = std::bind(entry, obj, std::placeholders::_1);
SimPlatform::instance().schedule(callback, pkt, delay); SimPlatform::instance().schedule(callback, pkt, delay);
}
} }

221
sim/common/texturing.h Normal file
View File

@@ -0,0 +1,221 @@
#pragma once
#include <cstdint>
#include <cstdlib>
#include <fixed.h>
#include <bitmanip.h>
// Selects how Clamp() treats a texture coordinate before it is turned into
// a texel index.
enum class WrapMode {
Clamp,   // negative values become 0, values above MASK become MASK
Repeat,  // only the bits selected by MASK are kept
Mirror,  // the value is bit-inverted when bit FRAC of the coordinate is set
};
// Texel storage formats understood by the texture helpers in this file.
// Bytes per texel (as reported by Stride()) are noted next to each entry.
enum class TexFormat {
R8G8B8A8,  // 4 bytes
R5G6B5,    // 2 bytes
R4G4B4A4,  // 2 bytes
L8A8,      // 2 bytes
L8,        // 1 byte
A8,        // 1 byte
};
// Applies a wrap mode to a fixed-point texture coordinate and returns the
// wrapped raw value.
//   Clamp  : saturates the raw value to [0, MASK].
//   Repeat : keeps only the bits selected by MASK.
//   Mirror : inverts the value when bit FRAC of the coordinate is set, then
//            keeps only the bits selected by MASK. Without the mask the
//            integer bits of an out-of-range coordinate (or the all-ones
//            upper bits produced by the inversion) would leak into the
//            texel index computed by the callers.
// Aborts on an unknown wrap mode.
// NOTE(review): assumes MASK selects the fractional bits of the format, as
// its use for Repeat suggests — confirm against the Fixed definition.
template <uint32_t F, typename T = int32_t>
T Clamp(Fixed<F,T> fx, WrapMode mode) {
  switch (mode) {
  case WrapMode::Clamp:
    return (fx.data() < 0) ? 0 : ((fx.data() > Fixed<F,T>::MASK) ? Fixed<F,T>::MASK : fx.data());
  case WrapMode::Repeat:
    return (fx.data() & Fixed<F,T>::MASK);
  case WrapMode::Mirror:
    return (bit_get(fx.data(), Fixed<F,T>::FRAC) ? ~fx.data() : fx.data()) & Fixed<F,T>::MASK;
  default:
    std::abort();
    return 0;
  }
}
// Returns the number of bytes one texel occupies in the given format.
// Aborts on an unknown format.
inline uint32_t Stride(TexFormat format) {
  uint32_t bytes_per_texel = 0;
  switch (format) {
  case TexFormat::R8G8B8A8:
    bytes_per_texel = 4;
    break;
  case TexFormat::R5G6B5:
  case TexFormat::R4G4B4A4:
  case TexFormat::L8A8:
    bytes_per_texel = 2;
    break;
  case TexFormat::L8:
  case TexFormat::A8:
    bytes_per_texel = 1;
    break;
  default:
    std::abort();
  }
  return bytes_per_texel;
}
// Splits a texel into the two packed words (*lo, *hi) consumed by Lerp8888.
// For R8G8B8A8, *lo receives bytes 0 and 2 and *hi receives bytes 1 and 3,
// each byte in its own 16-bit lane. All other formats leave *hi at 0.
// Aborts on an unknown format (the outputs are not written in that case).
// NOTE(review): 0x07e0f81f and 0x0f0f0f0f (used here for L8 and A8) are the
// lane masks one would expect for spreading 5-6-5 and 4-4-4-4 texels, while
// R5G6B5 / R4G4B4A4 pass the texel through unchanged — confirm that the
// case labels are paired with the intended masks.
inline void Unpack8888(TexFormat format,
                       uint32_t texel,
                       uint32_t* lo,
                       uint32_t* hi) {
  uint32_t low = 0;
  uint32_t high = 0;
  switch (format) {
  case TexFormat::R8G8B8A8:
    low  = texel & 0x00ff00ff;
    high = (texel >> 8) & 0x00ff00ff;
    break;
  case TexFormat::R5G6B5:
  case TexFormat::R4G4B4A4:
    low = texel;
    break;
  case TexFormat::L8A8:
    low = (texel | (texel << 8)) & 0x00ff00ff;
    break;
  case TexFormat::L8:
    low = (texel | (texel << 16)) & 0x07e0f81f;
    break;
  case TexFormat::A8:
    low = (texel | (texel << 12)) & 0x0f0f0f0f;
    break;
  default:
    std::abort();
  }
  *lo = low;
  *hi = high;
}
// Recombines the two packed words produced by Unpack8888 / Lerp8888 into a
// single texel value for the given format. For R8G8B8A8 the hi word is
// shifted back into the odd byte positions; the other formats use lo only.
// Aborts on an unknown format.
// NOTE(review): the shift amounts for L8 (16) and A8 (12) mirror the ones in
// Unpack8888 — the same question about label/mask pairing applies here.
inline uint32_t Pack8888(TexFormat format, uint32_t lo, uint32_t hi) {
  uint32_t packed = 0;
  switch (format) {
  case TexFormat::R8G8B8A8:
    packed = (hi << 8) | lo;
    break;
  case TexFormat::R5G6B5:
  case TexFormat::R4G4B4A4:
    packed = lo;
    break;
  case TexFormat::L8A8:
    packed = (lo | (lo >> 8)) & 0xffff;
    break;
  case TexFormat::L8:
    packed = (lo | (lo >> 16)) & 0xffff;
    break;
  case TexFormat::A8:
    packed = (lo | (lo >> 12)) & 0xffff;
    break;
  default:
    std::abort();
  }
  return packed;
}
// Linearly interpolates two packed colors, two 8-bit channels per word.
// (al, ah) and (bl, bh) are the lo/hi words of the two endpoints; frac is
// the blend weight in 1/256 units (0 selects a). Each output word is
// a + ((b - a) * frac >> 8), masked to the two 8-bit lanes at bits 0-7 and
// 16-23. The arithmetic is unsigned and wraps modulo 2^32.
inline void Lerp8888(uint32_t al,
                     uint32_t ah,
                     uint32_t bl,
                     uint32_t bh,
                     uint32_t frac,
                     uint32_t* lo,
                     uint32_t* hi) {
  constexpr uint32_t kLaneMask = 0x00ff00ff;
  const auto blend = [frac](uint32_t from, uint32_t to) -> uint32_t {
    const uint32_t step = ((to - from) * frac) >> 8;
    return (from + step) & kLaneMask;
  };
  *lo = blend(al, bl);
  *hi = blend(ah, bh);
}
// Computes the four texel indices and the two blend weights needed for a
// bilinear fetch at coordinate (fu, fv).
//   log_width / log_height : log2 of the texture dimensions in texels.
//   addr00..addr11         : texel indices (x + (y << log_width)) of the
//                            four neighbours; the first digit is the row
//                            (y0 / y1), the second the column (x0 / x1).
//   alpha / beta           : 8-bit horizontal / vertical blend weights,
//                            taken from the bits just below the x0 / y0
//                            texel index.
// The sample footprint is fu/fv offset by -/+ (HALF >> log_size) on each
// axis, wrapped with Clamp().
template <uint32_t F, typename T = int32_t>
void TexAddressLinear(Fixed<F,T> fu,
                      Fixed<F,T> fv,
                      uint32_t log_width,
                      uint32_t log_height,
                      WrapMode wrapu,
                      WrapMode wrapv,
                      uint32_t* addr00,
                      uint32_t* addr01,
                      uint32_t* addr10,
                      uint32_t* addr11,
                      uint32_t* alpha,
                      uint32_t* beta
) {
  using Coord = Fixed<F,T>;

  // per-axis offset applied on either side of the sample position
  const auto delta_x = Coord::make(Coord::HALF >> log_width);
  const auto delta_y = Coord::make(Coord::HALF >> log_height);

  // wrapped coordinates of the footprint corners
  const uint32_t u0 = Clamp(fu - delta_x, wrapu);
  const uint32_t u1 = Clamp(fu + delta_x, wrapu);
  const uint32_t v0 = Clamp(fv - delta_y, wrapv);
  const uint32_t v1 = Clamp(fv + delta_y, wrapv);

  // shift that converts a wrapped coordinate into a texel index
  const uint32_t shift_u = (Coord::FRAC - log_width);
  const uint32_t shift_v = (Coord::FRAC - log_height);

  // texel index with 8 extra sub-texel bits (used for the blend weights)
  const uint32_t x0s = (u0 << 8) >> shift_u;
  const uint32_t y0s = (v0 << 8) >> shift_v;

  const uint32_t x0 = x0s >> 8;
  const uint32_t y0 = y0s >> 8;
  const uint32_t x1 = u1 >> shift_u;
  const uint32_t y1 = v1 >> shift_v;

  const uint32_t row0 = y0 << log_width;
  const uint32_t row1 = y1 << log_width;

  *addr00 = x0 + row0;
  *addr01 = x1 + row0;
  *addr10 = x0 + row1;
  *addr11 = x1 + row1;

  *alpha = x0s & 0xff;
  *beta  = y0s & 0xff;

  //printf("*** fu=0x%x, fv=0x%x, u0=0x%x, u1=0x%x, v0=0x%x, v1=0x%x, x0=0x%x, x1=0x%x, y0=0x%x, y1=0x%x, addr00=0x%x, addr01=0x%x, addr10=0x%x, addr11=0x%x\n", fu.data(), fv.data(), u0, u1, v0, v1, x0, x1, y0, y1, *addr00, *addr01, *addr10, *addr11);
}
// Computes the texel index for a point-sampled fetch at (fu, fv).
// Each coordinate is wrapped with Clamp() and then shifted right so that
// its top log_width / log_height fractional bits become the x / y texel
// index; the result is x + (y << log_width).
template <uint32_t F, typename T = int32_t>
void TexAddressPoint(Fixed<F,T> fu,
                     Fixed<F,T> fv,
                     uint32_t log_width,
                     uint32_t log_height,
                     WrapMode wrapu,
                     WrapMode wrapv,
                     uint32_t* addr
) {
  const uint32_t u = Clamp(fu, wrapu);
  const uint32_t v = Clamp(fv, wrapv);

  const uint32_t shift_u = Fixed<F,T>::FRAC - log_width;
  const uint32_t shift_v = Fixed<F,T>::FRAC - log_height;

  const uint32_t x = u >> shift_u;
  const uint32_t y = v >> shift_v;

  *addr = x + (y << log_width);

  //printf("*** fu=0x%x, fv=0x%x, u=0x%x, v=0x%x, x=0x%x, y=0x%x, addr=0x%x\n", fu.data(), fv.data(), u, v, x, y, *addr);
}
// Bilinear filter: blends four neighbouring texels into one color.
// texel00/texel01 are blended with weight alpha, texel10/texel11 likewise,
// and the two intermediate results are blended with weight beta. Texels are
// unpacked according to `format`; the result is always packed as R8G8B8A8.
inline uint32_t TexFilterLinear(
  TexFormat format,
  uint32_t texel00,
  uint32_t texel01,
  uint32_t texel10,
  uint32_t texel11,
  uint32_t alpha,
  uint32_t beta
) {
  // unpack the four texels into lo/hi channel pairs
  uint32_t lo00, hi00;
  uint32_t lo01, hi01;
  uint32_t lo10, hi10;
  uint32_t lo11, hi11;
  Unpack8888(format, texel00, &lo00, &hi00);
  Unpack8888(format, texel01, &lo01, &hi01);
  Unpack8888(format, texel10, &lo10, &hi10);
  Unpack8888(format, texel11, &lo11, &hi11);

  // blend each pair with alpha
  uint32_t row0_lo, row0_hi;
  uint32_t row1_lo, row1_hi;
  Lerp8888(lo00, hi00, lo01, hi01, alpha, &row0_lo, &row0_hi);
  Lerp8888(lo10, hi10, lo11, hi11, alpha, &row1_lo, &row1_hi);

  // blend the two intermediate results with beta
  uint32_t out_lo, out_hi;
  Lerp8888(row0_lo, row0_hi, row1_lo, row1_hi, beta, &out_lo, &out_hi);

  const uint32_t color = Pack8888(TexFormat::R8G8B8A8, out_lo, out_hi);

  //printf("*** texel00=0x%x, texel01=0x%x, texel10=0x%x, texel11=0x%x, color=0x%x\n", texel00, texel01, texel10, texel11, color);

  return color;
}
// Point filter: converts a single texel from `format` to a packed R8G8B8A8
// color by unpacking it and repacking the lo/hi words.
inline uint32_t TexFilterPoint(TexFormat format, uint32_t texel) {
  uint32_t lo_word;
  uint32_t hi_word;
  Unpack8888(format, texel, &lo_word, &hi_word);

  const uint32_t color = Pack8888(TexFormat::R8G8B8A8, lo_word, hi_word);

  //printf("*** texel=0x%x, color=0x%x\n", texel, color);

  return color;
}

View File

@@ -3,85 +3,12 @@
#include <cstdint> #include <cstdint>
#include <algorithm> #include <algorithm>
#include <assert.h> #include <assert.h>
#include <bitmanip.h>
template <typename... Args> template <typename... Args>
void unused(Args&&...) {} void unused(Args&&...) {}
#define __unused(...) unused(__VA_ARGS__) #define __unused(...) unused(__VA_ARGS__)
constexpr uint32_t count_leading_zeros(uint32_t value) {
return value ? __builtin_clz(value) : 32;
}
constexpr uint32_t count_trailing_zeros(uint32_t value) {
return value ? __builtin_ctz(value) : 32;
}
constexpr bool ispow2(uint32_t value) {
return value && !(value & (value - 1));
}
constexpr uint32_t log2ceil(uint32_t value) {
return 32 - count_leading_zeros(value - 1);
}
inline unsigned log2up(uint32_t value) {
return std::max<uint32_t>(1, log2ceil(value));
}
constexpr unsigned log2floor(uint32_t value) {
return 31 - count_leading_zeros(value);
}
constexpr unsigned ceil2(uint32_t value) {
return 32 - count_leading_zeros(value);
}
inline uint64_t bit_clr(uint64_t bits, uint32_t index) {
assert(index <= 63);
return bits & ~(1ull << index);
}
inline uint64_t bit_set(uint64_t bits, uint32_t index) {
assert(index <= 63);
return bits | (1ull << index);
}
inline bool bit_get(uint64_t bits, uint32_t index) {
assert(index <= 63);
return (bits >> index) & 0x1;
}
inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) {
assert(end >= start);
assert(end <= 63);
uint32_t shift = 63 - end;
uint64_t mask = (0xffffffffffffffff << (shift + start)) >> shift;
return bits & ~mask;
}
inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) {
assert(end >= start);
assert(end <= 63);
uint32_t shift = 63 - end;
uint64_t dirty = (value << (shift + start)) >> shift;
return bit_clrw(bits, start, end) | dirty;
}
inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
assert(end >= start);
assert(end <= 63);
uint32_t shift = 63 - end;
return (bits << shift) >> (shift + start);
}
// Apply integer sign extension
inline uint32_t sext32(uint32_t word, uint32_t width) {
assert(width > 1);
assert(width <= 32);
uint32_t mask = (1 << width) - 1;
return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : word;
}
// return file extension // return file extension
const char* fileExtension(const char* filepath); const char* fileExtension(const char* filepath);

View File

@@ -23,8 +23,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_FLAGS += $(DBG_TRACE_FLAGS) DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
DBG_FLAGS += -DVCD_OUTPUT
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
@@ -51,10 +49,17 @@ VL_FLAGS += $(RTL_INCLUDE)
VL_FLAGS += $(CONFIGS) VL_FLAGS += $(CONFIGS)
CXXFLAGS += $(CONFIGS) CXXFLAGS += $(CONFIGS)
# Enable Verilator multithreaded simulation
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
#VL_FLAGS += --threads $(THREADS)
# Enable VCD trace
VCD_TRACE = -DVCD_OUTPUT
# Debugging # Debugging
ifdef DEBUG ifdef DEBUG
VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS) VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS) CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS)
else else
VL_FLAGS += -DNDEBUG VL_FLAGS += -DNDEBUG
CXXFLAGS += -O2 -DNDEBUG CXXFLAGS += -O2 -DNDEBUG

View File

@@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
TOP = vx_cache_sim TOP = vx_cache_sim
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp processor.cpp main.cpp SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp main.cpp
OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS))) OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
VPATH := $(sort $(dir $(SRCS))) VPATH := $(sort $(dir $(SRCS)))

View File

@@ -13,6 +13,7 @@ struct params_t {
uint32_t sets_per_bank; uint32_t sets_per_bank;
uint32_t blocks_per_set; uint32_t blocks_per_set;
uint32_t words_per_block; uint32_t words_per_block;
uint32_t log2_num_inputs;
uint32_t word_select_addr_start; uint32_t word_select_addr_start;
uint32_t word_select_addr_end; uint32_t word_select_addr_end;
@@ -31,8 +32,10 @@ struct params_t {
uint32_t offset_bits = config.B - config.W; uint32_t offset_bits = config.B - config.W;
uint32_t log2_bank_size = config.C - bank_bits; uint32_t log2_bank_size = config.C - bank_bits;
uint32_t index_bits = log2_bank_size - (config.B << config.A); uint32_t index_bits = log2_bank_size - (config.B << config.A);
assert(log2_bank_size >= config.B); assert(log2_bank_size >= config.B);
this->log2_num_inputs = log2ceil(config.num_inputs);
this->words_per_block = 1 << offset_bits; this->words_per_block = 1 << offset_bits;
this->blocks_per_set = 1 << config.A; this->blocks_per_set = 1 << config.A;
this->sets_per_bank = 1 << index_bits; this->sets_per_bank = 1 << index_bits;
@@ -104,7 +107,7 @@ struct set_t {
struct bank_req_info_t { struct bank_req_info_t {
bool valid; bool valid;
uint32_t req_id; uint32_t req_id;
uint32_t req_tag; uint64_t req_tag;
}; };
struct bank_req_t { struct bank_req_t {
@@ -194,7 +197,7 @@ public:
return root_entry; return root_entry;
} }
bool try_pop(bank_req_t* out) { bool pop(bank_req_t* out) {
for (auto& entry : entries_) { for (auto& entry : entries_) {
if (entry.valid && entry.mshr_replay) { if (entry.valid && entry.mshr_replay) {
*out = entry; *out = entry;
@@ -208,16 +211,13 @@ public:
}; };
struct bank_t { struct bank_t {
std::vector<set_t> sets; std::vector<set_t> sets;
MSHR mshr; MSHR mshr;
std::queue<bank_req_t> stall_buffer;
bank_req_t active_req;
bank_t(const CacheConfig& config, bank_t(const CacheConfig& config,
const params_t& params) const params_t& params)
: sets(params.sets_per_bank, params.blocks_per_set) : sets(params.sets_per_bank, params.blocks_per_set)
, mshr(config.mshr_size) , mshr(config.mshr_size)
, active_req(config.ports_per_bank)
{} {}
}; };
@@ -229,8 +229,8 @@ private:
CacheConfig config_; CacheConfig config_;
params_t params_; params_t params_;
std::vector<bank_t> banks_; std::vector<bank_t> banks_;
std::vector<std::queue<uint32_t>> core_rsps_; Switch<MemReq, MemRsp>::Ptr mem_switch_;
Switch<MemReq, MemRsp>::Ptr mem_switch_; Switch<MemReq, MemRsp>::Ptr bypass_switch_;
std::vector<MasterPort<MemReq>> mem_req_ports_; std::vector<MasterPort<MemReq>> mem_req_ports_;
std::vector<SlavePort<MemRsp>> mem_rsp_ports_; std::vector<SlavePort<MemRsp>> mem_rsp_ports_;
@@ -240,241 +240,270 @@ public:
, config_(config) , config_(config)
, params_(config) , params_(config)
, banks_(config.num_banks, {config, params_}) , banks_(config.num_banks, {config, params_})
, core_rsps_(config.num_inputs)
, mem_req_ports_(config.num_banks, simobject) , mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject) , mem_rsp_ports_(config.num_banks, simobject)
{ {
bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&bypass_switch_->RspIn);
if (config.num_banks > 1) { if (config.num_banks > 1) {
mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks); mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) { for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i)); mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i)); mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
} }
mem_switch_->ReqOut.bind(&simobject->MemReqPort); mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0));
simobject->MemRspPort.bind(&mem_switch_->RspIn); bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn);
} else { } else {
mem_req_ports_.at(0).bind(&simobject->MemReqPort); mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
simobject->MemRspPort.bind(&mem_rsp_ports_.at(0)); bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
} }
} }
void step(uint64_t /*cycle*/) { void step(uint64_t /*cycle*/) {
// process core response // handle bypass responses
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) { auto& bypass_port = bypass_switch_->RspOut.at(1);
auto& core_rsp = core_rsps_.at(req_id); if (!bypass_port.empty()) {
if (!core_rsp.empty()) { auto& mem_rsp = bypass_port.top();
simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_rsp.front()}, config_.latency); uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
core_rsp.pop(); uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
} MemRsp core_rsp(tag);
simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
bypass_port.pop();
} }
for (auto& bank : banks_) { std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
auto& active_req = bank.active_req;
// try schedule mshr replay // handle MSHR replay
if (!active_req.valid) { for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
bank.mshr.try_pop(&active_req); auto& bank = banks_.at(bank_id);
} auto& pipeline_req = pipeline_reqs.at(bank_id);
bank.mshr.pop(&pipeline_req);
// try schedule stall queue if MSHR has space }
if (!active_req.valid
&& !bank.stall_buffer.empty()
&& !bank.mshr.full()) {
active_req = bank.stall_buffer.front();
bank.stall_buffer.pop();
}
}
// handle memory fills // handle memory fills
for (uint32_t i = 0, n = config_.num_banks; i < n; ++i) { std::vector<bool> pending_fill_req(config_.num_banks, false);
MemRsp mem_rsp; for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
if (mem_rsp_ports_.at(i).read(&mem_rsp)) { auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
this->processMemoryFill(i, mem_rsp.tag); if (!mem_rsp_port.empty()) {
auto& mem_rsp = mem_rsp_port.top();
this->processMemoryFill(bank_id, mem_rsp.tag);
pending_fill_req.at(bank_id) = true;
mem_rsp_port.pop();
} }
} }
// handle incoming core requests // handle incoming core requests
for (uint32_t i = 0, n = config_.num_inputs; i < n; ++i) { for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
MemReq core_req; auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
if (!simobject_->CoreReqPorts.at(i).read(&core_req)) if (core_req_port.empty())
continue; continue;
auto bank_id = params_.addr_bank_id(core_req.addr); auto& core_req = core_req_port.top();
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr); // check cache bypassing
auto port_id = i % config_.ports_per_bank; if (core_req.is_io) {
// send IO request
this->processIORequest(core_req, req_id);
// remove request
core_req_port.pop();
continue;
}
auto bank_id = params_.addr_bank_id(core_req.addr);
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank;
// create abnk request // create bank request
bank_req_t bank_req(config_.ports_per_bank); bank_req_t bank_req(config_.ports_per_bank);
bank_req.valid = true; bank_req.valid = true;
bank_req.write = core_req.write; bank_req.write = core_req.write;
bank_req.mshr_replay = false; bank_req.mshr_replay = false;
bank_req.tag = tag; bank_req.tag = tag;
bank_req.set_id = set_id; bank_req.set_id = set_id;
bank_req.infos.at(port_id) = {true, i, core_req.tag}; bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
auto& bank = banks_.at(bank_id); auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs.at(bank_id);
// check MSHR capacity
if (bank.mshr.full()) { // check pending MSHR replay
// add to stall buffer if (pipeline_req.valid
bank.stall_buffer.emplace(bank_req); && pipeline_req.mshr_replay) {
// stall
continue;
}
// check pending fill request
if (pending_fill_req.at(bank_id)) {
// stall
continue; continue;
} }
auto& active_req = bank.active_req; // check MSHR capacity if read or writeback
if ((!core_req.write || !config_.write_through)
// check pending MSHR request && bank.mshr.full()) {
if (active_req.valid // stall
&& active_req.mshr_replay) {
// add to stall buffer
bank.stall_buffer.emplace(bank_req);
continue; continue;
} }
// check bank conflicts // check bank conflicts
if (active_req.valid) { if (pipeline_req.valid) {
// check port conflict // check port conflict
if (active_req.write != core_req.write if (pipeline_req.write != core_req.write
|| active_req.set_id != set_id || pipeline_req.set_id != set_id
|| active_req.tag != tag || pipeline_req.tag != tag
|| active_req.infos[port_id].valid) { || pipeline_req.infos[port_id].valid) {
// add to stall buffer // stall
bank.stall_buffer.emplace(bank_req);
continue; continue;
} }
// update pending request infos // update pending request infos
active_req.infos[port_id] = bank_req.infos[port_id]; pipeline_req.infos[port_id] = bank_req.infos[port_id];
} else { } else {
// schedule new request // schedule new request
active_req = bank_req; pipeline_req = bank_req;
} }
// remove request
core_req_port.pop();
} }
// process active request // process active request
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { this->processBankRequest(pipeline_reqs);
this->processBankRequest(bank_id); }
void processIORequest(const MemReq& core_req, uint32_t req_id) {
{
MemReq mem_req(core_req);
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
bypass_switch_->ReqIn.at(1).send(mem_req, 1);
}
if (core_req.write && config_.write_reponse) {
simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_req.tag}, 1);
} }
} }
void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) { void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
// update block // update block
auto& bank = banks_.at(bank_id); auto& bank = banks_.at(bank_id);
auto& root_entry = bank.mshr.replay(mshr_id); auto& entry = bank.mshr.replay(mshr_id);
auto& set = bank.sets.at(root_entry.set_id); auto& set = bank.sets.at(entry.set_id);
auto& block = set.blocks.at(root_entry.block_id); auto& block = set.blocks.at(entry.block_id);
block.valid = true; block.valid = true;
block.tag = root_entry.tag; block.tag = entry.tag;
} }
void processBankRequest(uint32_t bank_id) { void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
auto& bank = banks_.at(bank_id); for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& active_req = bank.active_req; auto& pipeline_req = pipeline_reqs.at(bank_id);
if (!active_req.valid) if (!pipeline_req.valid)
return; continue;
active_req.valid = false; auto& bank = banks_.at(bank_id);
auto& set = bank.sets.at(pipeline_req.set_id);
auto& set = bank.sets.at(active_req.set_id); if (pipeline_req.mshr_replay) {
// send core response
if (active_req.mshr_replay) { for (auto& info : pipeline_req.infos) {
// send core response simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
for (auto& info : active_req.infos) { }
core_rsps_.at(info.req_id).emplace(info.req_tag); } else {
} bool hit = false;
} else { bool found_free_block = false;
bool hit = false; int hit_block_id = 0;
bool found_free_block = false; int repl_block_id = 0;
int hit_block_id = 0; uint32_t max_cnt = 0;
int repl_block_id = 0;
uint32_t max_cnt = 0; for (int i = 0, n = set.blocks.size(); i < n; ++i) {
auto& block = set.blocks.at(i);
for (int i = 0, n = set.blocks.size(); i < n; ++i) { if (block.valid) {
auto& block = set.blocks.at(i); if (block.tag == pipeline_req.tag) {
if (block.valid) { block.lru_ctr = 0;
if (block.tag == active_req.tag) { hit_block_id = i;
block.lru_ctr = 0; hit = true;
hit_block_id = i; } else {
hit = true; ++block.lru_ctr;
} else { }
++block.lru_ctr; if (max_cnt < block.lru_ctr) {
} max_cnt = block.lru_ctr;
if (max_cnt < block.lru_ctr) { repl_block_id = i;
max_cnt = block.lru_ctr; }
} else {
found_free_block = true;
repl_block_id = i; repl_block_id = i;
} }
} else {
found_free_block = true;
repl_block_id = i;
}
}
if (hit) {
//
// MISS handling
//
if (active_req.write) {
// handle write hit
auto& hit_block = set.blocks.at(hit_block_id);
if (config_.write_through) {
// forward write request to memory
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag);
mem_req.write = true;
mem_req.tag = 0;
mem_req_ports_.at(bank_id).send(mem_req, 1);
} else {
// mark block as dirty
hit_block.dirty = true;
}
}
// send core response
for (auto& info : active_req.infos) {
core_rsps_.at(info.req_id).emplace(info.req_tag);
}
} else {
//
// MISS handling
//
if (!found_free_block && !config_.write_through) {
// write back dirty block
auto& repl_block = set.blocks.at(repl_block_id);
if (repl_block.dirty) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag);
mem_req.write = true;
mem_req.tag = 0;
mem_req_ports_.at(bank_id).send(mem_req, 1);
}
} }
if (active_req.write && config_.write_through) { if (hit) {
// forward write request to memory //
{ // MISS handling
MemReq mem_req; //
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); if (pipeline_req.write) {
mem_req.write = true; // handle write hit
mem_req.tag = 0; auto& hit_block = set.blocks.at(hit_block_id);
mem_req_ports_.at(bank_id).send(mem_req, 1); if (config_.write_through) {
// forward write request to memory
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
mem_req.write = true;
mem_req_ports_.at(bank_id).send(mem_req, 1);
} else {
// mark block as dirty
hit_block.dirty = true;
}
} }
// send core response // send core response
for (auto& info : active_req.infos) { if (!pipeline_req.write || config_.write_reponse) {
core_rsps_.at(info.req_id).emplace(info.req_tag); for (auto& info : pipeline_req.infos) {
simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
}
}
} else {
//
// MISS handling
//
if (!found_free_block && !config_.write_through) {
// write back dirty block
auto& repl_block = set.blocks.at(repl_block_id);
if (repl_block.dirty) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
mem_req.write = true;
mem_req_ports_.at(bank_id).send(mem_req, 1);
}
} }
} else {
// lookup
int pending = bank.mshr.lookup(active_req);
// allocate MSHR if (pipeline_req.write && config_.write_through) {
int mshr_id = bank.mshr.allocate(active_req, repl_block_id); // forward write request to memory
{
// send fill request MemReq mem_req;
if (pending == -1) { mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
MemReq mem_req; mem_req.write = true;
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); mem_req_ports_.at(bank_id).send(mem_req, 1);
mem_req.write = active_req.write; }
mem_req.tag = mshr_id; // send core response
mem_req_ports_.at(bank_id).send(mem_req, 1); if (config_.write_reponse) {
for (auto& info : pipeline_req.infos) {
simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
}
}
} else {
// MSHR lookup
int pending = bank.mshr.lookup(pipeline_req);
// allocate MSHR
int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id);
// send fill request
if (pending == -1) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = pipeline_req.write;
mem_req.tag = mshr_id;
mem_req_ports_.at(bank_id).send(mem_req, 1);
}
} }
} }
} }

View File

@@ -14,7 +14,8 @@ struct CacheConfig {
uint8_t num_banks; // number of banks uint8_t num_banks; // number of banks
uint8_t ports_per_bank; // number of ports per bank uint8_t ports_per_bank; // number of ports per bank
uint8_t num_inputs; // number of inputs uint8_t num_inputs; // number of inputs
bool write_through; // is write-through cache bool write_through; // is write-through
bool write_reponse; // enable write response
uint16_t victim_size; // victim cache size uint16_t victim_size; // victim cache size
uint16_t mshr_size; // MSHR buffer size uint16_t mshr_size; // MSHR buffer size
uint8_t latency; // pipeline latency uint8_t latency; // pipeline latency

View File

@@ -10,11 +10,7 @@ namespace vortex {
struct Constants { struct Constants {
static constexpr uint32_t CORE_TO_DCACHE_DELAY = 1 + SM_ENABLE; static constexpr uint32_t SMEM_DELAY = 1 + SM_ENABLE;
static constexpr uint32_t CORE_TO_ICACHE_DELAY = 1;
static constexpr uint32_t ICACHE_TO_MEM_DELAY = 2;
static constexpr uint32_t DCACHE_TO_MEM_DELAY = 2;
}; };

View File

@@ -19,6 +19,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
, decoder_(arch) , decoder_(arch)
, mmu_(0, arch.wsize(), true) , mmu_(0, arch.wsize(), true)
, shared_mem_(4096) , shared_mem_(4096)
, tex_units_(NUM_TEX_UNITS, this)
, warps_(arch.num_warps()) , warps_(arch.num_warps())
, barriers_(arch.num_barriers(), 0) , barriers_(arch.num_barriers(), 0)
, csrs_(arch.num_csrs(), 0) , csrs_(arch.num_csrs(), 0)
@@ -35,7 +36,8 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
1, // number of banks 1, // number of banks
1, // number of ports 1, // number of ports
1, // request size 1, // request size
true, // write-throught true, // write-through
false, // write response
0, // victim size 0, // victim size
NUM_WARPS, // mshr NUM_WARPS, // mshr
2, // pipeline latency 2, // pipeline latency
@@ -49,12 +51,14 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
DCACHE_NUM_BANKS, // number of banks DCACHE_NUM_BANKS, // number of banks
DCACHE_NUM_PORTS, // number of ports DCACHE_NUM_PORTS, // number of ports
(uint8_t)arch.num_threads(), // request size (uint8_t)arch.num_threads(), // request size
true, // write-throught true, // write-through
false, // write response
0, // victim size 0, // victim size
DCACHE_MSHR_SIZE, // mshr DCACHE_MSHR_SIZE, // mshr
2, // pipeline latency 2, // pipeline latency
})) }))
, l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2)) , l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
, dcache_switch_(arch.num_threads())
, fetch_stage_("fetch") , fetch_stage_("fetch")
, decode_stage_("decode") , decode_stage_("decode")
, issue_stage_("issue") , issue_stage_("issue")
@@ -65,10 +69,9 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
, last_schedule_wid_(0) , last_schedule_wid_(0)
, issued_instrs_(0) , issued_instrs_(0)
, committed_instrs_(0) , committed_instrs_(0)
, ecall_(false)
, ebreak_(false) , ebreak_(false)
, stats_insts_(0) , stats_insts_(0)
, stats_loads_(0)
, stats_stores_(0)
, MemRspPort(this) , MemRspPort(this)
, MemReqPort(this) , MemReqPort(this)
{ {
@@ -92,6 +95,18 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
this->MemRspPort.bind(&l1_mem_switch_->RspIn); this->MemRspPort.bind(&l1_mem_switch_->RspIn);
l1_mem_switch_->ReqOut.bind(&this->MemReqPort); l1_mem_switch_->ReqOut.bind(&this->MemReqPort);
// lsu/tex switch
for (uint32_t i = 0, n = arch.num_threads(); i < n; ++i) {
auto& sw = dcache_switch_.at(i);
#ifdef EXT_TEX_ENABLE
sw = Switch<MemReq, MemRsp>::Create("lsu_arb", ArbiterType::Priority, 2);
#else
sw = Switch<MemReq, MemRsp>::Create("lsu_arb", ArbiterType::Priority, 1);
#endif
sw->ReqOut.bind(&dcache_->CoreReqPorts.at(i));
dcache_->CoreRspPorts.at(i).bind(&sw->RspIn);
}
// activate warp0 // activate warp0
warps_.at(0)->setTmask(0, true); warps_.at(0)->setTmask(0, true);
} }
@@ -147,44 +162,41 @@ void Core::warp_scheduler(uint64_t cycle) {
auto& warp = warps_.at(scheduled_warp); auto& warp = warps_.at(scheduled_warp);
stats_insts_ += warp->getActiveThreads(); stats_insts_ += warp->getActiveThreads();
pipeline_state_t state; auto trace = new pipeline_trace_t((issued_instrs_++ * arch_.num_cores()) + id_, arch_);
state.clear();
state.id = (issued_instrs_++ * arch_.num_cores()) + id_;
warp->eval(&state); warp->eval(trace);
DT(3, cycle, "pipeline-schedule: " << state); DT(3, cycle, "pipeline-schedule: " << *trace);
// advance to fetch stage // advance to fetch stage
fetch_stage_.push(state); fetch_stage_.push(trace);
} }
void Core::fetch(uint64_t cycle) { void Core::fetch(uint64_t cycle) {
// handle icache reponse // handle icache reponse
{ auto& icache_rsp_port = icache_->CoreRspPorts.at(0);
MemRsp mem_rsp; if (!icache_rsp_port.empty()){
if (icache_->CoreRspPorts.at(0).read(&mem_rsp)){ auto& mem_rsp = icache_rsp_port.top();
pipeline_state_t state; auto trace = pending_icache_.at(mem_rsp.tag);
pending_icache_.remove(mem_rsp.tag, &state); auto latency = (SimPlatform::instance().cycles() - trace->icache_latency);
auto latency = (SimPlatform::instance().cycles() - state.icache_latency); trace->icache_latency = latency;
state.icache_latency = latency; decode_stage_.push(trace);
decode_stage_.push(state); DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
DT(3, cycle, "icache-rsp: addr=" << std::hex << state.PC << ", tag=" << mem_rsp.tag << ", " << state); pending_icache_.release(mem_rsp.tag);
} icache_rsp_port.pop();
} }
// send icache request // send icache request
{ if (!fetch_stage_.empty()) {
pipeline_state_t state; auto trace = fetch_stage_.top();
if (fetch_stage_.try_pop(&state)) { trace->icache_latency = SimPlatform::instance().cycles();
state.icache_latency = SimPlatform::instance().cycles(); MemReq mem_req;
MemReq mem_req; mem_req.addr = trace->PC;
mem_req.addr = state.PC; mem_req.write = false;
mem_req.write = false; mem_req.tag = pending_icache_.allocate(trace);
mem_req.tag = pending_icache_.allocate(state); icache_->CoreReqPorts.at(0).send(mem_req, 1);
icache_->CoreReqPorts.at(0).send(mem_req, 1); DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << state); fetch_stage_.pop();
}
} }
// schedule next warp // schedule next warp
@@ -194,19 +206,21 @@ void Core::fetch(uint64_t cycle) {
void Core::decode(uint64_t cycle) { void Core::decode(uint64_t cycle) {
__unused (cycle); __unused (cycle);
pipeline_state_t state; if (decode_stage_.empty())
if (!decode_stage_.try_pop(&state)) return;
return;
auto trace = decode_stage_.top();
// release warp // release warp
if (!state.stall_warp) { if (!trace->fetch_stall) {
stalled_warps_.reset(state.wid); stalled_warps_.reset(trace->wid);
} }
DT(3, cycle, "pipeline-decode: " << state); DT(3, cycle, "pipeline-decode: " << *trace);
// advance to issue stage // advance to issue stage
issue_stage_.push(state); issue_stage_.push(trace);
decode_stage_.pop();
} }
void Core::issue(uint64_t cycle) { void Core::issue(uint64_t cycle) {
@@ -214,12 +228,13 @@ void Core::issue(uint64_t cycle) {
if (!issue_stage_.empty()) { if (!issue_stage_.empty()) {
// insert to ibuffer // insert to ibuffer
auto& state = issue_stage_.top(); auto trace = issue_stage_.top();
auto& ibuffer = ibuffers_.at(state.wid); auto& ibuffer = ibuffers_.at(trace->wid);
if (ibuffer.full()) { if (!trace->check_stalled(ibuffer.full())) {
DT(3, cycle, "*** ibuffer-stall: " << state); DT(3, cycle, "*** ibuffer-stall: " << *trace);
} else { }
ibuffer.push(state); if (!ibuffer.full()) {
ibuffer.push(trace);
issue_stage_.pop(); issue_stage_.pop();
} }
} }
@@ -229,27 +244,30 @@ void Core::issue(uint64_t cycle) {
if (ibuffer.empty()) if (ibuffer.empty())
continue; continue;
auto& state = ibuffer.top(); auto trace = ibuffer.top();
// check scoreboard // check scoreboard
if (scoreboard_.in_use(state)) { if (!trace->check_stalled(scoreboard_.in_use(trace))) {
DTH(3, cycle, "*** scoreboard-stall: dependents={"); DTH(3, cycle, "*** scoreboard-stall: dependents={");
auto owners = scoreboard_.owners(state); auto uses = scoreboard_.get_uses(trace);
for (uint32_t i = 0, n = owners.size(); i < n; ++i) { for (uint32_t i = 0, n = uses.size(); i < n; ++i) {
if (i) DTN(3, ", "); auto& use = uses.at(i);
DTN(3, "#" << owners.at(i)); __unused(use);
if (i) DTN(3, ", ");
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
} }
DTN(3, "}, " << state << std::endl); DTN(3, "}, " << *trace << std::endl);
continue;
} }
if (scoreboard_.in_use(trace))
continue;
DT(3, cycle, "pipeline-issue: " << state); DT(3, cycle, "pipeline-issue: " << *trace);
// update scoreboard // update scoreboard
scoreboard_.reserve(state); scoreboard_.reserve(trace);
// advance to execute stage // advance to execute stage
execute_stage_.push(state); execute_stage_.push(trace);
ibuffer.pop(); ibuffer.pop();
break; break;
@@ -259,11 +277,11 @@ void Core::issue(uint64_t cycle) {
void Core::execute(uint64_t cycle) { void Core::execute(uint64_t cycle) {
// process stage inputs // process stage inputs
if (!execute_stage_.empty()) { if (!execute_stage_.empty()) {
auto& state = execute_stage_.top(); auto trace = execute_stage_.top();
auto& exe_unit = exe_units_.at((int)state.exe_type); auto& exe_unit = exe_units_.at((int)trace->exe_type);
exe_unit->push_input(state); exe_unit->push(trace);
DT(3, cycle, "pipeline-execute: " << *trace);
execute_stage_.pop(); execute_stage_.pop();
DT(3, cycle, "pipeline-execute: " << state);
} }
// advance execute units // advance execute units
@@ -273,13 +291,14 @@ void Core::execute(uint64_t cycle) {
// commit completed instructions // commit completed instructions
for (auto& exe_unit : exe_units_) { for (auto& exe_unit : exe_units_) {
pipeline_state_t state; if (!exe_unit->empty()) {
if (exe_unit->pop_output(&state)) { auto trace = exe_unit->top();
if (state.stall_warp) { if (trace->fetch_stall) {
stalled_warps_.reset(state.wid); stalled_warps_.reset(trace->wid);
} }
// advance to commit stage // advance to commit stage
commit_stage_.push(state); commit_stage_.push(trace);
exe_unit->pop();
} }
} }
} }
@@ -287,21 +306,28 @@ void Core::execute(uint64_t cycle) {
void Core::commit(uint64_t cycle) { void Core::commit(uint64_t cycle) {
__unused (cycle); __unused (cycle);
pipeline_state_t state; if (commit_stage_.empty())
if (!commit_stage_.try_pop(&state))
return; return;
DT(3, cycle, "pipeline-commit: " << state); auto trace = commit_stage_.top();
DT(3, cycle, "pipeline-commit: " << *trace);
// update scoreboard // update scoreboard
scoreboard_.release(state); scoreboard_.release(trace);
assert(committed_instrs_ <= issued_instrs_); assert(committed_instrs_ <= issued_instrs_);
++committed_instrs_; ++committed_instrs_;
commit_stage_.pop();
// delete the trace
delete trace;
} }
bool Core::running() const { bool Core::running() const {
return (committed_instrs_ != issued_instrs_); bool is_running = (committed_instrs_ != issued_instrs_);
return is_running;
} }
Word Core::get_csr(Addr addr, int tid, int wid) { Word Core::get_csr(Addr addr, int tid, int wid) {
@@ -355,6 +381,12 @@ Word Core::get_csr(Addr addr, int tid, int wid) {
// NumCycles // NumCycles
return (Word)(SimPlatform::instance().cycles() >> 32); return (Word)(SimPlatform::instance().cycles() >> 32);
} else { } else {
if (addr >= CSR_TEX(0,0)
&& addr < CSR_TEX(NUM_TEX_UNITS,0)) {
uint32_t unit = CSR_TEX_UNIT(addr);
uint32_t state = CSR_TEX_STATE(addr);
return tex_units_.at(unit).get_state(state);
}
return csrs_.at(addr); return csrs_.at(addr);
} }
} }
@@ -367,6 +399,13 @@ void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) {
} else if (addr == CSR_FCSR) { } else if (addr == CSR_FCSR) {
fcsrs_.at(wid) = value & 0xff; fcsrs_.at(wid) = value & 0xff;
} else { } else {
if (addr >= CSR_TEX(0,0)
&& addr < CSR_TEX(NUM_TEX_UNITS,0)) {
uint32_t unit = CSR_TEX_UNIT(addr);
uint32_t state = CSR_TEX_STATE(addr);
tex_units_.at(unit).set_state(state, value);
return;
}
csrs_.at(addr) = value; csrs_.at(addr) = value;
} }
} }
@@ -390,29 +429,27 @@ Word Core::icache_read(Addr addr, Size size) {
return data; return data;
} }
Word Core::dcache_read(Addr addr, Size size) { Word Core::dcache_read(Addr addr, Size size) {
++stats_loads_;
Word data = 0; Word data = 0;
#ifdef SM_ENABLE if (SM_ENABLE) {
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
&& ((addr + 3) < SMEM_BASE_ADDR)) { && ((addr + 3) < SMEM_BASE_ADDR)) {
shared_mem_.read(&data, addr & (SMEM_SIZE-1), size); shared_mem_.read(&data, addr & (SMEM_SIZE-1), size);
return data; return data;
}
} }
#endif
mmu_.read(&data, addr, size, 0); mmu_.read(&data, addr, size, 0);
return data; return data;
} }
void Core::dcache_write(Addr addr, Word data, Size size) { void Core::dcache_write(Addr addr, Word data, Size size) {
++stats_stores_; if (SM_ENABLE) {
#ifdef SM_ENABLE if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) && ((addr + 3) < SMEM_BASE_ADDR)) {
&& ((addr + 3) < SMEM_BASE_ADDR)) { shared_mem_.write(&data, addr & (SMEM_SIZE-1), size);
shared_mem_.write(&data, addr & (SMEM_SIZE-1), size); return;
return; }
} }
#endif
if (addr >= IO_COUT_ADDR if (addr >= IO_COUT_ADDR
&& addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { && addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {
this->writeToStdOut(addr, data); this->writeToStdOut(addr, data);
@@ -421,11 +458,8 @@ void Core::dcache_write(Addr addr, Word data, Size size) {
mmu_.write(&data, addr, size, 0); mmu_.write(&data, addr, size, 0);
} }
void Core::printStats() const { Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector<uint64_t>* mem_addrs) {
std::cout << "Cycles: " << SimPlatform::instance().cycles() << std::endl return tex_units_.at(unit).read(u, v, lod, mem_addrs);
<< "Insts : " << stats_insts_ << std::endl
<< "Loads : " << stats_loads_ << std::endl
<< "Stores: " << stats_stores_ << std::endl;
} }
void Core::writeToStdOut(Addr addr, Word data) { void Core::writeToStdOut(Addr addr, Word data) {
@@ -439,10 +473,14 @@ void Core::writeToStdOut(Addr addr, Word data) {
} }
} }
void Core::trigger_ecall() {
ecall_ = true;
}
void Core::trigger_ebreak() { void Core::trigger_ebreak() {
ebreak_ = true; ebreak_ = true;
} }
bool Core::check_ebreak() const { bool Core::check_exit() const {
return ebreak_; return ebreak_ || ecall_;
} }

View File

@@ -20,6 +20,7 @@
#include "ibuffer.h" #include "ibuffer.h"
#include "scoreboard.h" #include "scoreboard.h"
#include "exeunit.h" #include "exeunit.h"
#include "tex_unit.h"
namespace vortex { namespace vortex {
@@ -34,8 +35,6 @@ public:
void step(uint64_t cycle); void step(uint64_t cycle);
void printStats() const;
Word id() const { Word id() const {
return id_; return id_;
} }
@@ -72,9 +71,13 @@ public:
void dcache_write(Addr, Word, Size); void dcache_write(Addr, Word, Size);
Word tex_read(uint32_t unit, Word lod, Word u, Word v, std::vector<uint64_t>* mem_addrs);
void trigger_ecall();
void trigger_ebreak(); void trigger_ebreak();
bool check_ebreak() const; bool check_exit() const;
private: private:
@@ -92,10 +95,8 @@ private:
const ArchDef arch_; const ArchDef arch_;
const Decoder decoder_; const Decoder decoder_;
MemoryUnit mmu_; MemoryUnit mmu_;
#ifdef SM_ENABLE
RAM shared_mem_; RAM shared_mem_;
#endif std::vector<TexUnit> tex_units_;
std::vector<std::shared_ptr<Warp>> warps_; std::vector<std::shared_ptr<Warp>> warps_;
std::vector<WarpMask> barriers_; std::vector<WarpMask> barriers_;
@@ -107,6 +108,7 @@ private:
Cache::Ptr icache_; Cache::Ptr icache_;
Cache::Ptr dcache_; Cache::Ptr dcache_;
Switch<MemReq, MemRsp>::Ptr l1_mem_switch_; Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;
PipelineStage fetch_stage_; PipelineStage fetch_stage_;
PipelineStage decode_stage_; PipelineStage decode_stage_;
@@ -114,20 +116,20 @@ private:
PipelineStage execute_stage_; PipelineStage execute_stage_;
PipelineStage commit_stage_; PipelineStage commit_stage_;
HashTable<pipeline_state_t> pending_icache_; HashTable<pipeline_trace_t*> pending_icache_;
WarpMask stalled_warps_; WarpMask stalled_warps_;
uint32_t last_schedule_wid_; uint32_t last_schedule_wid_;
uint32_t issued_instrs_; uint32_t issued_instrs_;
uint32_t committed_instrs_; uint32_t committed_instrs_;
bool ecall_;
bool ebreak_; bool ebreak_;
std::unordered_map<int, std::stringstream> print_bufs_; std::unordered_map<int, std::stringstream> print_bufs_;
uint64_t stats_insts_; uint64_t stats_insts_;
uint64_t stats_loads_;
uint64_t stats_stores_;
friend class LsuUnit; friend class LsuUnit;
friend class GpuUnit;
public: public:
SlavePort<MemRsp> MemRspPort; SlavePort<MemRsp> MemRspPort;

View File

@@ -41,14 +41,18 @@ static const std::unordered_map<int, struct InstTableEntry_t> sc_instTable = {
{Opcode::FMNMSUB, {false, InstType::R4_TYPE}}, {Opcode::FMNMSUB, {false, InstType::R4_TYPE}},
{Opcode::VSET, {false, InstType::V_TYPE}}, {Opcode::VSET, {false, InstType::V_TYPE}},
{Opcode::GPGPU, {false, InstType::R_TYPE}}, {Opcode::GPGPU, {false, InstType::R_TYPE}},
{Opcode::GPU, {false, InstType::R4_TYPE}},
}; };
static const char* op_string(const Instr &instr) { static const char* op_string(const Instr &instr) {
Word func3 = instr.getFunc3(); auto opcode = instr.getOpcode();
Word func7 = instr.getFunc7(); Word func2 = instr.getFunc2();
Word rs2 = instr.getRSrc(1); Word func3 = instr.getFunc3();
Word imm = instr.getImm(); Word func7 = instr.getFunc7();
switch (instr.getOpcode()) { Word rs2 = instr.getRSrc(1);
Word imm = instr.getImm();
switch (opcode) {
case Opcode::NOP: return "NOP"; case Opcode::NOP: return "NOP";
case Opcode::LUI_INST: return "LUI"; case Opcode::LUI_INST: return "LUI";
case Opcode::AUIPC_INST: return "AUIPC"; case Opcode::AUIPC_INST: return "AUIPC";
@@ -120,7 +124,16 @@ static const char* op_string(const Instr &instr) {
} }
case Opcode::SYS_INST: case Opcode::SYS_INST:
switch (func3) { switch (func3) {
case 0: return imm ? "EBREAK" : "ECALL"; case 0:
switch (imm) {
case 0x000: return "ECALL";
case 0x001: return "EBREAK";
case 0x002: return "URET";
case 0x102: return "SRET";
case 0x302: return "MRET";
default:
std::abort();
}
case 1: return "CSRRW"; case 1: return "CSRRW";
case 2: return "CSRRS"; case 2: return "CSRRS";
case 3: return "CSRRC"; case 3: return "CSRRC";
@@ -181,29 +194,43 @@ static const char* op_string(const Instr &instr) {
case 1: return "WSPAWN"; case 1: return "WSPAWN";
case 2: return "SPLIT"; case 2: return "SPLIT";
case 3: return "JOIN"; case 3: return "JOIN";
case 4: return "BAR"; case 4: return "BAR";
case 6: return "PREFETCH"; default:
std::abort();
}
case Opcode::GPU:
switch (func3) {
case 0: return "TEX";
case 1: {
switch (func2) {
case 0: return "CMOV";
default:
std::abort();
}
}
default: default:
std::abort(); std::abort();
} }
default: default:
std::abort(); std::abort();
} }
} }
namespace vortex { namespace vortex {
std::ostream &operator<<(std::ostream &os, const Instr &instr) { std::ostream &operator<<(std::ostream &os, const Instr &instr) {
os << op_string(instr) << ": ";
auto opcode = instr.getOpcode(); auto opcode = instr.getOpcode();
Word func2 = instr.getFunc2();
Word func3 = instr.getFunc3();
os << op_string(instr) << ": ";
if (opcode == S_INST if (opcode == S_INST
|| opcode == FS || opcode == FS) {
|| opcode == VS) {
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- "; os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
os << instr.getRSType(1) << std::dec << instr.getRSrc(1); os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
} else } else
if (opcode == L_INST if (opcode == L_INST
|| opcode == FL || opcode == FL) {
|| opcode == VL) {
os << instr.getRDType() << std::dec << instr.getRDest() << " <- "; os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]"; os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
} else { } else {
@@ -219,8 +246,10 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
if (i) os << ", "; if (i) os << ", ";
os << "imm=0x" << std::hex << instr.getImm(); os << "imm=0x" << std::hex << instr.getImm();
} }
} if (opcode == GPU && func3 == 0) {
os << ", unit=" << std::dec << func2;
}
}
return os; return os;
} }
} }
@@ -239,6 +268,7 @@ Decoder::Decoder(const ArchDef &arch) {
shift_func3_ = shift_rd_ + reg_s_; shift_func3_ = shift_rd_ + reg_s_;
shift_rs1_ = shift_func3_ + func3_s_; shift_rs1_ = shift_func3_ + func3_s_;
shift_rs2_ = shift_rs1_ + reg_s_; shift_rs2_ = shift_rs1_ + reg_s_;
shift_func2_ = shift_rs2_ + reg_s_;
shift_func7_ = shift_rs2_ + reg_s_; shift_func7_ = shift_rs2_ + reg_s_;
shift_rs3_ = shift_func7_ + func2_s_; shift_rs3_ = shift_func7_ + func2_s_;
shift_vmop_ = shift_func7_ + vmask_s_; shift_vmop_ = shift_func7_ + vmask_s_;
@@ -247,7 +277,7 @@ Decoder::Decoder(const ArchDef &arch) {
shift_vset_ = shift_func7_ + 6; shift_vset_ = shift_func7_ + 6;
reg_mask_ = 0x1f; reg_mask_ = 0x1f;
func2_mask_ = 0x2; func2_mask_ = 0x3;
func3_mask_ = 0x7; func3_mask_ = 0x7;
func6_mask_ = 0x3f; func6_mask_ = 0x3f;
func7_mask_ = 0x7f; func7_mask_ = 0x7f;
@@ -265,6 +295,7 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_); Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_);
instr->setOpcode(op); instr->setOpcode(op);
Word func2 = (code >> shift_func2_) & func2_mask_;
Word func3 = (code >> shift_func3_) & func3_mask_; Word func3 = (code >> shift_func3_) & func3_mask_;
Word func6 = (code >> shift_func6_) & func6_mask_; Word func6 = (code >> shift_func6_) & func6_mask_;
Word func7 = (code >> shift_func7_) & func7_mask_; Word func7 = (code >> shift_func7_) & func7_mask_;
@@ -403,7 +434,7 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
} }
} break; } break;
case Opcode::VL: case Opcode::FL:
instr->setDestVReg(rd); instr->setDestVReg(rd);
instr->setSrcVReg(rs1); instr->setSrcVReg(rs1);
instr->setVlsWidth(func3); instr->setVlsWidth(func3);
@@ -413,7 +444,7 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
instr->setVnf((code >> shift_vnf_) & func3_mask_); instr->setVnf((code >> shift_vnf_) & func3_mask_);
break; break;
case Opcode::VS: case Opcode::FS:
instr->setVs3(rd); instr->setVs3(rd);
instr->setSrcVReg(rs1); instr->setSrcVReg(rs1);
instr->setVlsWidth(func3); instr->setVlsWidth(func3);
@@ -428,10 +459,18 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
} }
break; break;
case R4_TYPE: case R4_TYPE:
instr->setDestFReg(rd); if (op == Opcode::GPU) {
instr->setSrcFReg(rs1); instr->setDestReg(rd);
instr->setSrcFReg(rs2); instr->setSrcReg(rs1);
instr->setSrcFReg(rs3); instr->setSrcReg(rs2);
instr->setSrcReg(rs3);
} else {
instr->setDestFReg(rd);
instr->setSrcFReg(rs1);
instr->setSrcFReg(rs2);
instr->setSrcFReg(rs3);
}
instr->setFunc2(func2);
instr->setFunc3(func3); instr->setFunc3(func3);
break; break;
default: default:

View File

@@ -49,11 +49,12 @@ inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid)
} }
} }
void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
assert(tmask_.any()); assert(tmask_.any());
Word nextPC = PC_ + core_->arch().wsize(); Word nextPC = PC_ + core_->arch().wsize();
Word func2 = instr.getFunc2();
Word func3 = instr.getFunc3(); Word func3 = instr.getFunc3();
Word func6 = instr.getFunc6(); Word func6 = instr.getFunc6();
Word func7 = instr.getFunc7(); Word func7 = instr.getFunc7();
@@ -117,8 +118,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
case NOP: case NOP:
break; break;
case LUI_INST: case LUI_INST:
pipeline_state->exe_type = ExeType::ALU; trace->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::ARITH; trace->alu.type = AluType::ARITH;
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t)) if (!tmask_.test(t))
continue; continue;
@@ -127,8 +128,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rd_write = true; rd_write = true;
break; break;
case AUIPC_INST: case AUIPC_INST:
pipeline_state->exe_type = ExeType::ALU; trace->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::ARITH; trace->alu.type = AluType::ARITH;
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t)) if (!tmask_.test(t))
continue; continue;
@@ -137,10 +138,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rd_write = true; rd_write = true;
break; break;
case R_INST: case R_INST:
pipeline_state->exe_type = ExeType::ALU; trace->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::ARITH; trace->alu.type = AluType::ARITH;
pipeline_state->used_iregs[rsrc0] = 1; trace->used_iregs.set(rsrc0);
pipeline_state->used_iregs[rsrc1] = 1; trace->used_iregs.set(rsrc1);
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t)) if (!tmask_.test(t))
continue; continue;
@@ -149,7 +150,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
case 0: case 0:
// MUL // MUL
rddata[t] = ((WordI)rsdata[t][0]) * ((WordI)rsdata[t][1]); rddata[t] = ((WordI)rsdata[t][0]) * ((WordI)rsdata[t][1]);
pipeline_state->alu.type = AluType::IMUL; trace->alu.type = AluType::IMUL;
break; break;
case 1: { case 1: {
// MULH // MULH
@@ -163,7 +164,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
uint64_t result = first * second; uint64_t result = first * second;
rddata[t] = (result >> 32) & 0xFFFFFFFF; rddata[t] = (result >> 32) & 0xFFFFFFFF;
pipeline_state->alu.type = AluType::IMUL; trace->alu.type = AluType::IMUL;
} break; } break;
case 2: { case 2: {
// MULHSU // MULHSU
@@ -173,14 +174,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
int64_t second = (int64_t)rsdata[t][1]; int64_t second = (int64_t)rsdata[t][1];
rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF;
pipeline_state->alu.type = AluType::IMUL; trace->alu.type = AluType::IMUL;
} break; } break;
case 3: { case 3: {
// MULHU // MULHU
uint64_t first = (uint64_t)rsdata[t][0]; uint64_t first = (uint64_t)rsdata[t][0];
uint64_t second = (uint64_t)rsdata[t][1]; uint64_t second = (uint64_t)rsdata[t][1];
rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF;
pipeline_state->alu.type = AluType::IMUL; trace->alu.type = AluType::IMUL;
} break; } break;
case 4: { case 4: {
// DIV // DIV
@@ -193,7 +194,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} else { } else {
rddata[t] = dividen / divisor; rddata[t] = dividen / divisor;
} }
pipeline_state->alu.type = AluType::IDIV; trace->alu.type = AluType::IDIV;
} break; } break;
case 5: { case 5: {
// DIVU // DIVU
@@ -204,7 +205,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} else { } else {
rddata[t] = dividen / divisor; rddata[t] = dividen / divisor;
} }
pipeline_state->alu.type = AluType::IDIV; trace->alu.type = AluType::IDIV;
} break; } break;
case 6: { case 6: {
// REM // REM
@@ -217,7 +218,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} else { } else {
rddata[t] = dividen % divisor; rddata[t] = dividen % divisor;
} }
pipeline_state->alu.type = AluType::IDIV; trace->alu.type = AluType::IDIV;
} break; } break;
case 7: { case 7: {
// REMU // REMU
@@ -228,7 +229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} else { } else {
rddata[t] = dividen % divisor; rddata[t] = dividen % divisor;
} }
pipeline_state->alu.type = AluType::IDIV; trace->alu.type = AluType::IDIV;
} break; } break;
default: default:
std::abort(); std::abort();
@@ -285,9 +286,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rd_write = true; rd_write = true;
break; break;
case I_INST: case I_INST:
pipeline_state->exe_type = ExeType::ALU; trace->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::ARITH; trace->alu.type = AluType::ARITH;
pipeline_state->used_iregs[rsrc0] = 1; trace->used_iregs.set(rsrc0);
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t)) if (!tmask_.test(t))
continue; continue;
@@ -336,10 +337,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rd_write = true; rd_write = true;
break; break;
case B_INST: case B_INST:
pipeline_state->exe_type = ExeType::ALU; trace->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::BRANCH; trace->alu.type = AluType::BRANCH;
pipeline_state->used_iregs[rsrc0] = 1; trace->used_iregs.set(rsrc0);
pipeline_state->used_iregs[rsrc1] = 1; trace->used_iregs.set(rsrc1);
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t)) if (!tmask_.test(t))
continue; continue;
@@ -385,107 +386,149 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
break; // runonce break; // runonce
} }
pipeline_state->stall_warp = true; trace->fetch_stall = true;
break; break;
case JAL_INST: case JAL_INST:
pipeline_state->exe_type = ExeType::ALU; trace->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::BRANCH; trace->alu.type = AluType::BRANCH;
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t)) if (!tmask_.test(t))
continue; continue;
rddata[t] = nextPC; rddata[t] = nextPC;
nextPC = PC_ + immsrc; nextPC = PC_ + immsrc;
pipeline_state->stall_warp = true; trace->fetch_stall = true;
break; // runonce break; // runonce
} }
rd_write = true; rd_write = true;
break; break;
case JALR_INST: case JALR_INST:
pipeline_state->exe_type = ExeType::ALU; trace->exe_type = ExeType::ALU;
pipeline_state->alu.type = AluType::BRANCH; trace->alu.type = AluType::BRANCH;
pipeline_state->used_iregs[rsrc0] = 1; trace->used_iregs.set(rsrc0);
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t)) if (!tmask_.test(t))
continue; continue;
rddata[t] = nextPC; rddata[t] = nextPC;
nextPC = rsdata[t][0] + immsrc; nextPC = rsdata[t][0] + immsrc;
pipeline_state->stall_warp = true; trace->fetch_stall = true;
break; // runOnce break; // runOnce
} }
rd_write = true; rd_write = true;
break; break;
case L_INST: case L_INST:
pipeline_state->exe_type = ExeType::LSU; case FL:
pipeline_state->lsu.type = LsuType::LOAD; trace->exe_type = ExeType::LSU;
pipeline_state->used_iregs[rsrc0] = 1; trace->lsu.type = LsuType::LOAD;
pipeline_state->mem_addrs.resize(num_threads); trace->used_iregs.set(rsrc0);
for (int t = 0; t < num_threads; ++t) { if (opcode == L_INST
if (!tmask_.test(t)) || (opcode == FL && func3 == 2)) {
continue; for (int t = 0; t < num_threads; ++t) {
Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned if (!tmask_.test(t))
Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; continue;
Word data_read = core_->dcache_read(memAddr, 4); Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned
pipeline_state->mem_addrs.at(t) = memAddr; Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); Word data_read = core_->dcache_read(memAddr, 4);
switch (func3) { trace->mem_addrs.at(t).push_back(memAddr);
case 0: DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
// LBI switch (func3) {
rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8); case 0:
break; // LBI
case 1: rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8);
// LHI break;
rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16); case 1:
break; // LHI
case 2: rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16);
// LW break;
rddata[t] = data_read; case 2:
break; // LW
case 4: rddata[t] = data_read;
// LBU break;
rddata[t] = Word((data_read >> shift_by) & 0xFF); case 4:
break; // LBU
case 5: rddata[t] = Word((data_read >> shift_by) & 0xFF);
// LHU break;
rddata[t] = Word((data_read >> shift_by) & 0xFFFF); case 5:
break; // LHU
default: rddata[t] = Word((data_read >> shift_by) & 0xFFFF);
std::abort(); break;
default:
std::abort();
}
} }
} } else {
rd_write = true; DP(4, "Executing vector load");
break; DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
case S_INST: DP(4, "dest: v" << rdest);
pipeline_state->exe_type = ExeType::LSU; DP(4, "width" << instr.getVlsWidth());
pipeline_state->lsu.type = LsuType::STORE; auto &vd = vRegFile_.at(rdest);
pipeline_state->used_iregs[rsrc0] = 1; switch (instr.getVlsWidth()) {
pipeline_state->used_iregs[rsrc1] = 1; case 6: {
pipeline_state->mem_addrs.resize(num_threads); // load word and unit strided (not checking for unit stride)
for (int t = 0; t < num_threads; ++t) { for (int i = 0; i < vl_; i++) {
if (!tmask_.test(t)) Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
continue; DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr);
Word memAddr = rsdata[t][0] + immsrc; Word data_read = core_->dcache_read(memAddr, 4);
pipeline_state->mem_addrs.at(t) = memAddr; DP(4, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); int *result_ptr = (int *)(vd.data() + i);
switch (func3) { *result_ptr = data_read;
case 0: }
// SB } break;
core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1);
break;
case 1:
// SH
core_->dcache_write(memAddr, rsdata[t][1], 2);
break;
case 2:
// SW
core_->dcache_write(memAddr, rsdata[t][1], 4);
break;
default: default:
std::abort(); std::abort();
} }
} }
rd_write = true;
break;
case S_INST:
case FS:
trace->exe_type = ExeType::LSU;
trace->lsu.type = LsuType::STORE;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
if (opcode == S_INST
|| (opcode == FS && func3 == 2)) {
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
Word memAddr = rsdata[t][0] + immsrc;
trace->mem_addrs.at(t).push_back(memAddr);
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
switch (func3) {
case 0:
// SB
core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1);
break;
case 1:
// SH
core_->dcache_write(memAddr, rsdata[t][1], 2);
break;
case 2:
// SW
core_->dcache_write(memAddr, rsdata[t][1], 4);
break;
default:
std::abort();
}
}
} else {
for (int i = 0; i < vl_; i++) {
Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
switch (instr.getVlsWidth()) {
case 6: {
// store word and unit strided (not checking for unit stride)
uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
core_->dcache_write(memAddr, value, 4);
DP(4, "store: " << memAddr << " value:" << value);
} break;
default:
std::abort();
}
}
}
break; break;
case SYS_INST: case SYS_INST:
pipeline_state->exe_type = ExeType::CSR; trace->exe_type = ExeType::CSR;
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t)) if (!tmask_.test(t))
continue; continue;
@@ -493,30 +536,40 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
Word csr_value = core_->get_csr(csr_addr, t, id_); Word csr_value = core_->get_csr(csr_addr, t, id_);
switch (func3) { switch (func3) {
case 0: case 0:
if (csr_addr < 2) { switch (csr_addr) {
// ECALL/EBREAK case 0: // ECALL
core_->trigger_ecall();
break;
case 1: // EBREAK
core_->trigger_ebreak(); core_->trigger_ebreak();
} break;
case 0x002: // URET
case 0x102: // SRET
case 0x302: // MRET
break;
default:
std::abort();
}
break; break;
case 1: case 1:
// CSRRW // CSRRW
rddata[t] = csr_value; rddata[t] = csr_value;
core_->set_csr(csr_addr, rsdata[t][0], t, id_); core_->set_csr(csr_addr, rsdata[t][0], t, id_);
pipeline_state->used_iregs[rsrc0] = 1; trace->used_iregs.set(rsrc0);
rd_write = true; rd_write = true;
break; break;
case 2: case 2:
// CSRRS // CSRRS
rddata[t] = csr_value; rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_); core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_);
pipeline_state->used_iregs[rsrc0] = 1; trace->used_iregs.set(rsrc0);
rd_write = true; rd_write = true;
break; break;
case 3: case 3:
// CSRRC // CSRRC
rddata[t] = csr_value; rddata[t] = csr_value;
core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_); core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_);
pipeline_state->used_iregs[rsrc0] = 1; trace->used_iregs.set(rsrc0);
rd_write = true; rd_write = true;
break; break;
case 5: case 5:
@@ -543,88 +596,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
break; break;
case FENCE: case FENCE:
pipeline_state->exe_type = ExeType::LSU; trace->exe_type = ExeType::LSU;
pipeline_state->lsu.type = LsuType::FENCE; trace->lsu.type = LsuType::FENCE;
pipeline_state->stall_warp = true; trace->fetch_stall = true;
break; break;
case (FL | VL):
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.type = LsuType::LOAD;
pipeline_state->used_iregs[rsrc0] = 1;
if (func3 == 0x2) {
pipeline_state->mem_addrs.resize(num_threads);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
Word memAddr = rsdata[t][0] + immsrc;
pipeline_state->mem_addrs.at(t) = memAddr;
Word data_read = core_->dcache_read(memAddr, 4);
DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
rddata[t] = data_read;
}
} else {
DP(3, "Executing vector load");
DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
DP(3, "dest: v" << rdest);
DP(3, "width" << instr.getVlsWidth());
pipeline_state->mem_addrs.resize(vl_);
auto &vd = vRegFile_.at(rdest);
switch (instr.getVlsWidth()) {
case 6: {
// load word and unit strided (not checking for unit stride)
for (int i = 0; i < vl_; i++) {
Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
pipeline_state->mem_addrs.at(i) = memAddr;
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
Word data_read = core_->dcache_read(memAddr, 4);
DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
int *result_ptr = (int *)(vd.data() + i);
*result_ptr = data_read;
}
} break;
default:
std::abort();
}
break;
}
rd_write = true;
break;
case (FS | VS):
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.type = LsuType::STORE;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
if (func3 == 0x2) {
pipeline_state->mem_addrs.resize(num_threads);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
Word memAddr = rsdata[t][0] + immsrc;
pipeline_state->mem_addrs.at(t) = memAddr;
core_->dcache_write(memAddr, rsdata[t][1], 4);
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
}
} else {
pipeline_state->mem_addrs.resize(vl_);
for (int i = 0; i < vl_; i++) {
Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
pipeline_state->mem_addrs.at(i) = memAddr;
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
switch (instr.getVlsWidth()) {
case 6: {
//store word and unit strided (not checking for unit stride)
uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
core_->dcache_write(memAddr, value, 4);
DP(3, "store: " << memAddr << " value:" << value);
} break;
default:
std::abort();
}
}
}
break;
case FCI: case FCI:
pipeline_state->exe_type = ExeType::FPU; trace->exe_type = ExeType::FPU;
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t)) if (!tmask_.test(t))
continue; continue;
@@ -633,32 +610,32 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
switch (func7) { switch (func7) {
case 0x00: //FADD case 0x00: //FADD
rddata[t] = rv_fadd(rsdata[t][0], rsdata[t][1], frm, &fflags); rddata[t] = rv_fadd(rsdata[t][0], rsdata[t][1], frm, &fflags);
pipeline_state->fpu.type = FpuType::FMA; trace->fpu.type = FpuType::FMA;
pipeline_state->used_fregs[rsrc0] = 1; trace->used_fregs.set(rsrc0);
pipeline_state->used_fregs[rsrc1] = 1; trace->used_fregs.set(rsrc1);
break; break;
case 0x04: //FSUB case 0x04: //FSUB
rddata[t] = rv_fsub(rsdata[t][0], rsdata[t][1], frm, &fflags); rddata[t] = rv_fsub(rsdata[t][0], rsdata[t][1], frm, &fflags);
pipeline_state->fpu.type = FpuType::FMA; trace->fpu.type = FpuType::FMA;
pipeline_state->used_fregs[rsrc0] = 1; trace->used_fregs.set(rsrc0);
pipeline_state->used_fregs[rsrc1] = 1; trace->used_fregs.set(rsrc1);
break; break;
case 0x08: //FMUL case 0x08: //FMUL
rddata[t] = rv_fmul(rsdata[t][0], rsdata[t][1], frm, &fflags); rddata[t] = rv_fmul(rsdata[t][0], rsdata[t][1], frm, &fflags);
pipeline_state->fpu.type = FpuType::FMA; trace->fpu.type = FpuType::FMA;
pipeline_state->used_fregs[rsrc0] = 1; trace->used_fregs.set(rsrc0);
pipeline_state->used_fregs[rsrc1] = 1; trace->used_fregs.set(rsrc1);
break; break;
case 0x0c: //FDIV case 0x0c: //FDIV
rddata[t] = rv_fdiv(rsdata[t][0], rsdata[t][1], frm, &fflags); rddata[t] = rv_fdiv(rsdata[t][0], rsdata[t][1], frm, &fflags);
pipeline_state->fpu.type = FpuType::FDIV; trace->fpu.type = FpuType::FDIV;
pipeline_state->used_fregs[rsrc0] = 1; trace->used_fregs.set(rsrc0);
pipeline_state->used_fregs[rsrc1] = 1; trace->used_fregs.set(rsrc1);
break; break;
case 0x2c: //FSQRT case 0x2c: //FSQRT
rddata[t] = rv_fsqrt(rsdata[t][0], frm, &fflags); rddata[t] = rv_fsqrt(rsdata[t][0], frm, &fflags);
pipeline_state->fpu.type = FpuType::FSQRT; trace->fpu.type = FpuType::FSQRT;
pipeline_state->used_fregs[rsrc0] = 1; trace->used_fregs.set(rsrc0);
break; break;
case 0x10: case 0x10:
switch (func3) { switch (func3) {
@@ -672,9 +649,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rddata[t] = rv_fsgnjx(rsdata[t][0], rsdata[t][1]); rddata[t] = rv_fsgnjx(rsdata[t][0], rsdata[t][1]);
break; break;
} }
pipeline_state->fpu.type = FpuType::FNCP; trace->fpu.type = FpuType::FNCP;
pipeline_state->used_fregs[rsrc0] = 1; trace->used_fregs.set(rsrc0);
pipeline_state->used_fregs[rsrc1] = 1; trace->used_fregs.set(rsrc1);
break; break;
case 0x14: case 0x14:
if (func3) { if (func3) {
@@ -684,9 +661,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
// FMIN.S // FMIN.S
rddata[t] = rv_fmin(rsdata[t][0], rsdata[t][1], &fflags); rddata[t] = rv_fmin(rsdata[t][0], rsdata[t][1], &fflags);
} }
pipeline_state->fpu.type = FpuType::FNCP; trace->fpu.type = FpuType::FNCP;
pipeline_state->used_fregs[rsrc0] = 1; trace->used_fregs.set(rsrc0);
pipeline_state->used_fregs[rsrc1] = 1; trace->used_fregs.set(rsrc1);
break; break;
case 0x60: case 0x60:
if (rsrc1 == 0) { if (rsrc1 == 0) {
@@ -696,8 +673,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
// FCVT.WU.S // FCVT.WU.S
rddata[t] = rv_ftou(rsdata[t][0], frm, &fflags); rddata[t] = rv_ftou(rsdata[t][0], frm, &fflags);
} }
pipeline_state->fpu.type = FpuType::FCVT; trace->fpu.type = FpuType::FCVT;
pipeline_state->used_fregs[rsrc0] = 1; trace->used_fregs.set(rsrc0);
break; break;
case 0x70: case 0x70:
if (func3) { if (func3) {
@@ -707,8 +684,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
// FMV.X.W // FMV.X.W
rddata[t] = rsdata[t][0]; rddata[t] = rsdata[t][0];
} }
pipeline_state->fpu.type = FpuType::FNCP; trace->fpu.type = FpuType::FNCP;
pipeline_state->used_fregs[rsrc0] = 1; trace->used_fregs.set(rsrc0);
break; break;
case 0x50: case 0x50:
switch(func3) { switch(func3) {
@@ -725,9 +702,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
rddata[t] = rv_feq(rsdata[t][0], rsdata[t][1], &fflags); rddata[t] = rv_feq(rsdata[t][0], rsdata[t][1], &fflags);
break; break;
} }
pipeline_state->fpu.type = FpuType::FNCP; trace->fpu.type = FpuType::FNCP;
pipeline_state->used_fregs[rsrc0] = 1; trace->used_fregs.set(rsrc0);
pipeline_state->used_fregs[rsrc1] = 1; trace->used_fregs.set(rsrc1);
break; break;
case 0x68: case 0x68:
if (rsrc1) { if (rsrc1) {
@@ -737,14 +714,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
// FCVT.S.W: // FCVT.S.W:
rddata[t] = rv_itof(rsdata[t][0], frm, &fflags); rddata[t] = rv_itof(rsdata[t][0], frm, &fflags);
} }
pipeline_state->fpu.type = FpuType::FCVT; trace->fpu.type = FpuType::FCVT;
pipeline_state->used_iregs[rsrc0] = 1; trace->used_iregs.set(rsrc0);
break; break;
case 0x78: case 0x78:
// FMV.W.X // FMV.W.X
rddata[t] = rsdata[t][0]; rddata[t] = rsdata[t][0];
pipeline_state->fpu.type = FpuType::FNCP; trace->fpu.type = FpuType::FNCP;
pipeline_state->used_iregs[rsrc0] = 1; trace->used_iregs.set(rsrc0);
break; break;
} }
update_fcrs(fflags, core_, t, id_); update_fcrs(fflags, core_, t, id_);
@@ -755,10 +732,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
case FMSUB: case FMSUB:
case FMNMADD: case FMNMADD:
case FMNMSUB: case FMNMSUB:
pipeline_state->fpu.type = FpuType::FMA; trace->fpu.type = FpuType::FMA;
pipeline_state->used_fregs[rsrc0] = 1; trace->used_fregs.set(rsrc0);
pipeline_state->used_fregs[rsrc1] = 1; trace->used_fregs.set(rsrc1);
pipeline_state->used_fregs[rsrc2] = 1; trace->used_fregs.set(rsrc2);
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t)) if (!tmask_.test(t))
continue; continue;
@@ -784,8 +761,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
rd_write = true; rd_write = true;
break; break;
case GPGPU: { case GPGPU: {
pipeline_state->exe_type = ExeType::GPU;
int ts = 0; int ts = 0;
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (tmask_.test(t)) { if (tmask_.test(t)) {
@@ -795,10 +771,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
switch (func3) { switch (func3) {
case 0: { case 0: {
// TMC // TMC
pipeline_state->gpu.type = GpuType::TMC; trace->exe_type = ExeType::GPU;
pipeline_state->used_iregs[rsrc0] = 1; trace->gpu.type = GpuType::TMC;
pipeline_state->stall_warp = true; trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (rsrc1) { if (rsrc1) {
// predicate mode // predicate mode
ThreadMask pred; ThreadMask pred;
@@ -823,10 +800,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} break; } break;
case 1: { case 1: {
// WSPAWN // WSPAWN
pipeline_state->gpu.type = GpuType::WSPAWN; trace->exe_type = ExeType::GPU;
pipeline_state->used_iregs[rsrc0] = 1; trace->gpu.type = GpuType::WSPAWN;
pipeline_state->used_iregs[rsrc1] = 1; trace->used_iregs.set(rsrc0);
pipeline_state->stall_warp = true; trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
int active_warps = std::min<int>(rsdata.at(ts)[0], core_->arch().num_warps()); int active_warps = std::min<int>(rsdata.at(ts)[0], core_->arch().num_warps());
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]); DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]);
for (int i = 1; i < active_warps; ++i) { for (int i = 1; i < active_warps; ++i) {
@@ -837,9 +815,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} break; } break;
case 2: { case 2: {
// SPLIT // SPLIT
pipeline_state->gpu.type = GpuType::SPLIT; trace->exe_type = ExeType::GPU;
pipeline_state->used_iregs[rsrc0] = 1; trace->gpu.type = GpuType::SPLIT;
pipeline_state->stall_warp = true; trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) { if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {
ThreadMask tmask; ThreadMask tmask;
for (int i = 0; i < num_threads; ++i) { for (int i = 0; i < num_threads; ++i) {
@@ -868,8 +847,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} break; } break;
case 3: { case 3: {
// JOIN // JOIN
pipeline_state->gpu.type = GpuType::JOIN; trace->exe_type = ExeType::GPU;
pipeline_state->stall_warp = true; trace->gpu.type = GpuType::JOIN;
trace->fetch_stall = true;
if (!domStack_.empty() && domStack_.top().unanimous) { if (!domStack_.empty() && domStack_.top().unanimous) {
DP(3, "*** Uninimous branch at join"); DP(3, "*** Uninimous branch at join");
tmask_ = domStack_.top().tmask; tmask_ = domStack_.top().tmask;
@@ -893,18 +873,19 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} break; } break;
case 4: { case 4: {
// BAR // BAR
pipeline_state->gpu.type = GpuType::BAR; trace->exe_type = ExeType::GPU;
pipeline_state->used_iregs[rsrc0] = 1; trace->gpu.type = GpuType::BAR;
pipeline_state->used_iregs[rsrc1] = 1; trace->used_iregs.set(rsrc0);
pipeline_state->stall_warp = true; trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
active_ = false; active_ = false;
core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); core_->barrier(rsdata[ts][0], rsdata[ts][1], id_);
} break; } break;
case 6: { case 5: {
// PREFETCH // PREFETCH
pipeline_state->exe_type = ExeType::LSU; trace->exe_type = ExeType::LSU;
pipeline_state->lsu.type = LsuType::PREFETCH; trace->lsu.type = LsuType::PREFETCH;
pipeline_state->used_iregs[rsrc0] = 1; trace->used_iregs.set(rsrc0);
for (int t = 0; t < num_threads; ++t) { for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t)) if (!tmask_.test(t))
continue; continue;
@@ -915,7 +896,50 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
default: default:
std::abort(); std::abort();
} }
} break; } break;
case GPU: {
switch (func3) {
case 0: { // TEX
trace->exe_type = ExeType::GPU;
trace->gpu.type = GpuType::TEX;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->used_iregs.set(rsrc2);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
auto unit = func2;
auto u = rsdata[t][0];
auto v = rsdata[t][1];
auto lod = rsdata[t][2];
auto color = core_->tex_read(unit, u, v, lod, &trace->mem_addrs.at(t));
rddata[t] = color;
}
rd_write = true;
} break;
case 1:
switch (func2) {
case 0: { // CMOV
trace->exe_type = ExeType::ALU;
trace->alu.type = AluType::CMOV;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->used_iregs.set(rsrc2);
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t] = rsdata[t][0] ? rsdata[t][1] : rsdata[t][2];
}
rd_write = true;
} break;
default:
std::abort();
}
break;
default:
std::abort();
}
} break;
case VSET: { case VSET: {
int VLEN = core_->arch().vsize() * 8; int VLEN = core_->arch().vsize() * 8;
int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew();
@@ -966,7 +990,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 24: { case 24: {
//vmseq // vmseq
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -997,7 +1021,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 25: { case 25: {
//vmsne // vmsne
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1028,7 +1052,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 26: { case 26: {
//vmsltu // vmsltu
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1059,7 +1083,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 27: { case 27: {
//vmslt // vmslt
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1090,7 +1114,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 28: { case 28: {
//vmsleu // vmsleu
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1121,7 +1145,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 29: { case 29: {
//vmsle // vmsle
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1152,7 +1176,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 30: { case 30: {
//vmsgtu // vmsgtu
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1183,7 +1207,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 31: { case 31: {
//vmsgt // vmsgt
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1356,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 27: { case 27: {
//vmxor // vmxor
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1402,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 28: { case 28: {
//vmornot // vmornot
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1448,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 29: { case 29: {
//vmnand // vmnand
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1494,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 30: { case 30: {
//vmnor // vmnor
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1540,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 31: { case 31: {
//vmxnor // vmxnor
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1586,7 +1610,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
} break; } break;
case 37: { case 37: {
//vmul // vmul
auto &vr1 = vRegFile_.at(rsrc0); auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1); auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest); auto &vd = vRegFile_.at(rdest);
@@ -1769,7 +1793,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} }
if (rd_write) { if (rd_write) {
pipeline_state->wb = true; trace->wb = true;
DPH(2, "Dest Reg: "); DPH(2, "Dest Reg: ");
auto rdt = instr.getRDType(); auto rdt = instr.getRDType();
switch (rdt) { switch (rdt) {
@@ -1786,7 +1810,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
DPN(2, "0x" << std::hex << rddata[t]); DPN(2, "0x" << std::hex << rddata[t]);
} }
DPN(2, "}" << std::endl); DPN(2, "}" << std::endl);
pipeline_state->used_iregs[rdest] = 1; trace->used_iregs[rdest] = 1;
} }
break; break;
case RegType::Float: case RegType::Float:
@@ -1801,7 +1825,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
DPN(2, "0x" << std::hex << rddata[t]); DPN(2, "0x" << std::hex << rddata[t]);
} }
DPN(2, "}" << std::endl); DPN(2, "}" << std::endl);
pipeline_state->used_fregs[rdest] = 1; trace->used_fregs[rdest] = 1;
break; break;
default: default:
std::abort(); std::abort();

View File

@@ -6,16 +6,18 @@
#include <util.h> #include <util.h>
#include "debug.h" #include "debug.h"
#include "core.h" #include "core.h"
#include "constants.h"
using namespace vortex; using namespace vortex;
// Builds the NOP execution unit. The Core pointer is accepted but left
// unnamed (unused here); the base ExeUnit is given the name "NOP".
NopUnit::NopUnit(Core*)
    : ExeUnit("NOP")
{}
void NopUnit::step(uint64_t /*cycle*/) { void NopUnit::step(uint64_t /*cycle*/) {
pipeline_state_t state; if (inputs_.empty())
if (!inputs_.try_pop(&state))
return; return;
this->schedule_output(state, 1); auto trace = inputs_.top();
this->schedule_output(trace, 1);
inputs_.pop();
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@@ -33,19 +35,23 @@ void LsuUnit::step(uint64_t cycle) {
// handle dcache response // handle dcache response
for (uint32_t t = 0; t < num_threads_; ++t) { for (uint32_t t = 0; t < num_threads_; ++t) {
MemRsp mem_rsp; auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp)) if (dcache_rsp_port.empty())
continue; continue;
auto& entry = pending_dcache_.at(mem_rsp.tag); auto& mem_rsp = dcache_rsp_port.top();
DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first); auto& entry = pending_dcache_.at(mem_rsp.tag);
assert(entry.second.test(t)); auto trace = entry.first;
entry.second.reset(t); // track remaining blocks DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
if (!entry.second.any()) { << ", tid=" << t << ", " << *trace);
auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency); assert(entry.second);
entry.first.dcache_latency = latency; --entry.second; // track remaining blocks
this->schedule_output(entry.first, 1); if (0 == entry.second) {
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
trace->dcache_latency = latency;
this->schedule_output(trace, 1);
pending_dcache_.release(mem_rsp.tag); pending_dcache_.release(mem_rsp.tag);
} }
dcache_rsp_port.pop();
} }
if (fence_lock_) { if (fence_lock_) {
@@ -61,36 +67,83 @@ void LsuUnit::step(uint64_t cycle) {
if (inputs_.empty()) if (inputs_.empty())
return; return;
auto state = inputs_.top(); auto trace = inputs_.top();
if (state.lsu.type == LsuType::FENCE) { if (trace->lsu.type == LsuType::FENCE) {
// schedule fence lock // schedule fence lock
fence_state_ = state; fence_state_ = trace;
fence_lock_ = true; fence_lock_ = true;
inputs_.pop(); DT(3, cycle, "fence-lock: " << *trace);
DT(3, cycle, "fence-lock: " << state); // remove input
inputs_.pop();
return; return;
} }
// check pending queue capacity // check pending queue capacity
if (pending_dcache_.full()) { if (!trace->check_stalled(pending_dcache_.full())) {
DT(3, cycle, "*** lsu-queue-stall: " << state); DT(3, cycle, "*** lsu-queue-stall: " << *trace);
}
if (pending_dcache_.full())
return; return;
// send memory request
bool has_shared_memory = false;
bool mem_rsp_pending = false;
bool is_write = (trace->lsu.type == LsuType::STORE);
uint32_t valid_addrs = 0;
for (auto& mem_addr : trace->mem_addrs) {
valid_addrs += mem_addr.size();
}
trace->dcache_latency = SimPlatform::instance().cycles();
auto tag = pending_dcache_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
for (auto mem_addr : trace->mem_addrs.at(t)) {
// check shared memory address
if (SM_ENABLE) {
if ((mem_addr >= (SMEM_BASE_ADDR-SMEM_SIZE))
&& (mem_addr < SMEM_BASE_ADDR)) {
DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
has_shared_memory = true;
continue;
}
}
bool is_io = (mem_addr >= IO_BASE_ADDR);
MemReq mem_req;
mem_req.addr = mem_addr;
mem_req.write = is_write;
mem_req.tag = tag;
mem_req.is_io = is_io;
dcache_req_port.send(mem_req, 1);
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< trace);
// do not wait on writes
mem_rsp_pending = !is_write;
}
} }
// send dcache request // do not wait
state.dcache_latency = SimPlatform::instance().cycles(); if (!mem_rsp_pending) {
auto tag = pending_dcache_.allocate({state, state.tmask}); pending_dcache_.release(tag);
for (uint32_t t = 0; t < num_threads_; ++t) { uint32_t delay = 1;
if (!state.tmask.test(t)) if (has_shared_memory) {
continue; // all threads accessed shared memory
MemReq mem_req; delay += Constants::SMEM_DELAY;
mem_req.addr = state.mem_addrs.at(t); }
mem_req.write = (state.lsu.type == LsuType::STORE); this->schedule_output(trace, delay);
mem_req.tag = tag; }
core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state); // remove input
}
inputs_.pop(); inputs_.pop();
} }
@@ -98,23 +151,27 @@ void LsuUnit::step(uint64_t cycle) {
AluUnit::AluUnit(Core*) : ExeUnit("ALU") {} AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
void AluUnit::step(uint64_t /*cycle*/) { void AluUnit::step(uint64_t /*cycle*/) {
pipeline_state_t state; if (inputs_.empty())
if (!inputs_.try_pop(&state))
return; return;
switch (state.alu.type) { auto trace = inputs_.top();
case AluType::ARITH: switch (trace->alu.type) {
this->schedule_output(state, 1); case AluType::ARITH:
break;
case AluType::BRANCH: case AluType::BRANCH:
this->schedule_output(state, 1); case AluType::CMOV:
this->schedule_output(trace, 1);
inputs_.pop();
break; break;
case AluType::IMUL: case AluType::IMUL:
this->schedule_output(state, LATENCY_IMUL); this->schedule_output(trace, LATENCY_IMUL);
inputs_.pop();
break; break;
case AluType::IDIV: case AluType::IDIV:
this->schedule_output(state, XLEN); this->schedule_output(trace, XLEN);
inputs_.pop();
break; break;
default:
std::abort();
} }
} }
@@ -123,10 +180,11 @@ void AluUnit::step(uint64_t /*cycle*/) {
CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {} CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
void CsrUnit::step(uint64_t /*cycle*/) { void CsrUnit::step(uint64_t /*cycle*/) {
pipeline_state_t state; if (inputs_.empty())
if (!inputs_.try_pop(&state))
return; return;
this->schedule_output(state, 1); auto trace = inputs_.top();
this->schedule_output(trace, 1);
inputs_.pop();
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@@ -134,46 +192,127 @@ void CsrUnit::step(uint64_t /*cycle*/) {
FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {} FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
void FpuUnit::step(uint64_t /*cycle*/) { void FpuUnit::step(uint64_t /*cycle*/) {
pipeline_state_t state; if (inputs_.empty())
if (!inputs_.try_pop(&state))
return; return;
switch (state.fpu.type) { auto trace = inputs_.top();
switch (trace->fpu.type) {
case FpuType::FNCP: case FpuType::FNCP:
this->schedule_output(state, 1); this->schedule_output(trace, 1);
inputs_.pop();
break; break;
case FpuType::FMA: case FpuType::FMA:
this->schedule_output(state, LATENCY_FMA); this->schedule_output(trace, LATENCY_FMA);
inputs_.pop();
break; break;
case FpuType::FDIV: case FpuType::FDIV:
this->schedule_output(state, LATENCY_FDIV); this->schedule_output(trace, LATENCY_FDIV);
inputs_.pop();
break; break;
case FpuType::FSQRT: case FpuType::FSQRT:
this->schedule_output(state, LATENCY_FSQRT); this->schedule_output(trace, LATENCY_FSQRT);
inputs_.pop();
break; break;
case FpuType::FCVT: case FpuType::FCVT:
this->schedule_output(state, LATENCY_FCVT); this->schedule_output(trace, LATENCY_FCVT);
inputs_.pop();
break; break;
default:
std::abort();
} }
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {} GpuUnit::GpuUnit(Core* core)
: ExeUnit("GPU")
, core_(core)
, num_threads_(core->arch().num_threads())
, pending_tex_reqs_(TEXQ_SIZE)
{}
void GpuUnit::step(uint64_t /*cycle*/) { void GpuUnit::step(uint64_t cycle) {
pipeline_state_t state; __unused (cycle);
if (!inputs_.try_pop(&state)) #ifdef EXT_TEX_ENABLE
// handle memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.top();
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
trace->dcache_latency = latency;
this->schedule_output(trace, 1);
pending_tex_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
}
#endif
// check input queue
if (inputs_.empty())
return; return;
switch (state.gpu.type) {
auto trace = inputs_.top();
switch (trace->gpu.type) {
case GpuType::TMC: case GpuType::TMC:
case GpuType::WSPAWN: case GpuType::WSPAWN:
case GpuType::SPLIT: case GpuType::SPLIT:
case GpuType::JOIN: case GpuType::JOIN:
case GpuType::BAR: case GpuType::BAR:
this->schedule_output(state, 1); this->schedule_output(trace, 1);
break; inputs_.pop();
case GpuType::TEX:
/* TODO */
break; break;
case GpuType::TEX: {
if (this->processTexRequest(cycle, trace))
inputs_.pop();
} break;
default:
std::abort();
} }
}
bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
__unused (cycle);
// check pending queue capacity
if (!trace->check_stalled(pending_tex_reqs_.full())) {
DT(3, cycle, "*** tex-queue-stall: " << *trace);
}
if (pending_tex_reqs_.full())
return false;
// send memory request
uint32_t valid_addrs = 0;
for (auto& mem_addr : trace->mem_addrs) {
valid_addrs += mem_addr.size();
}
trace->tex_latency = SimPlatform::instance().cycles();
auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
for (auto mem_addr : trace->mem_addrs.at(t)) {
MemReq mem_req;
mem_req.addr = mem_addr;
mem_req.write = (trace->lsu.type == LsuType::STORE);
mem_req.tag = tag;
dcache_req_port.send(mem_req, 1);
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag
<< ", tid=" << t << ", "<< trace);
}
}
return true;
} }

View File

@@ -11,36 +11,43 @@ class Core;
class ExeUnit { class ExeUnit {
protected: protected:
const char* name_; const char* name_;
Queue<pipeline_state_t> inputs_; Queue<pipeline_trace_t*> inputs_;
Queue<pipeline_state_t> outputs_; Queue<pipeline_trace_t*> outputs_;
void schedule_output(const pipeline_state_t& state, uint32_t delay) { void schedule_output(pipeline_trace_t* trace, uint32_t delay) {
if (delay > 1) { if (delay > 1) {
SimPlatform::instance().schedule( SimPlatform::instance().schedule(
[&](const pipeline_state_t& req) { [&](pipeline_trace_t* req) {
outputs_.push(req); outputs_.push(req);
}, },
state, trace,
(delay - 1) (delay - 1)
); );
} else { } else {
outputs_.push(state); outputs_.push(trace);
} }
} }
public: public:
typedef std::shared_ptr<ExeUnit> Ptr; typedef std::shared_ptr<ExeUnit> Ptr;
ExeUnit(const char* name) : name_(name) {} ExeUnit(const char* name) : name_(name) {}
virtual ~ExeUnit() {} virtual ~ExeUnit() {}
void push_input(const pipeline_state_t& state) { void push(pipeline_trace_t* trace) {
inputs_.push(state); inputs_.push(trace);
} }
bool pop_output(pipeline_state_t* state) { bool empty() const {
return outputs_.try_pop(state); return outputs_.empty();
}
pipeline_trace_t* top() const {
return outputs_.top();
}
void pop() {
outputs_.pop();
} }
virtual void step(uint64_t cycle) = 0; virtual void step(uint64_t cycle) = 0;
@@ -61,8 +68,8 @@ class LsuUnit : public ExeUnit {
private: private:
Core* core_; Core* core_;
uint32_t num_threads_; uint32_t num_threads_;
HashTable<std::pair<pipeline_state_t, ThreadMask>> pending_dcache_; HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_dcache_;
pipeline_state_t fence_state_; pipeline_trace_t* fence_state_;
bool fence_lock_; bool fence_lock_;
public: public:
@@ -101,6 +108,13 @@ public:
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
class GpuUnit : public ExeUnit { class GpuUnit : public ExeUnit {
private:
Core* core_;
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace);
public: public:
GpuUnit(Core*); GpuUnit(Core*);

View File

@@ -7,7 +7,7 @@ namespace vortex {
class IBuffer { class IBuffer {
private: private:
std::queue<pipeline_state_t> entries_; std::queue<pipeline_trace_t*> entries_;
uint32_t capacity_; uint32_t capacity_;
public: public:
@@ -23,12 +23,12 @@ public:
return (entries_.size() == capacity_); return (entries_.size() == capacity_);
} }
const pipeline_state_t& top() const { pipeline_trace_t* top() const {
return entries_.front(); return entries_.front();
} }
void push(const pipeline_state_t& state) { void push(pipeline_trace_t* trace) {
entries_.emplace(state); entries_.emplace(trace);
} }
void pop() { void pop() {

View File

@@ -29,10 +29,9 @@ enum Opcode {
FMNMADD = 0x4f, FMNMADD = 0x4f,
// Vector Extension // Vector Extension
VSET = 0x57, VSET = 0x57,
VL = 0x7,
VS = 0x27,
// GPGPU Extension // GPGPU Extension
GPGPU = 0x6b, GPGPU = 0x6b,
GPU = 0x5b,
}; };
enum InstType { enum InstType {
@@ -70,6 +69,7 @@ public:
void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg; } void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg; }
void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; } void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; } void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; }
void setFunc2(Word func2) { func2_ = func2; }
void setFunc3(Word func3) { func3_ = func3; } void setFunc3(Word func3) { func3_ = func3; }
void setFunc7(Word func7) { func7_ = func7; } void setFunc7(Word func7) { func7_ = func7; }
void setImm(Word imm) { has_imm_ = true; imm_ = imm; } void setImm(Word imm) { has_imm_ = true; imm_ = imm; }
@@ -85,6 +85,7 @@ public:
/* Getters used by encoders. */ /* Getters used by encoders. */
Opcode getOpcode() const { return opcode_; } Opcode getOpcode() const { return opcode_; }
Word getFunc2() const { return func2_; }
Word getFunc3() const { return func3_; } Word getFunc3() const { return func3_; }
Word getFunc6() const { return func6_; } Word getFunc6() const { return func6_; }
Word getFunc7() const { return func7_; } Word getFunc7() const { return func7_; }
@@ -118,6 +119,7 @@ private:
RegType rsrc_type_[MAX_REG_SOURCES]; RegType rsrc_type_[MAX_REG_SOURCES];
int rsrc_[MAX_REG_SOURCES]; int rsrc_[MAX_REG_SOURCES];
int rdest_; int rdest_;
Word func2_;
Word func3_; Word func3_;
Word func6_; Word func6_;

View File

@@ -20,14 +20,16 @@ public:
void step(uint64_t /*cycle*/) { void step(uint64_t /*cycle*/) {
for (uint32_t i = 0, n = num_banks_; i < n; ++i) { for (uint32_t i = 0, n = num_banks_; i < n; ++i) {
MemReq mem_req; auto& mem_req_port = simobject_->MemReqPorts.at(i);
if (!simobject_->MemReqPorts.at(i).read(&mem_req)) if (mem_req_port.empty())
continue; continue;
auto& mem_req = mem_req_port.top();
if (!mem_req.write) { if (!mem_req.write) {
MemRsp mem_rsp; MemRsp mem_rsp;
mem_rsp.tag = mem_req.tag; mem_rsp.tag = mem_req.tag;
simobject_->MemRspPorts.at(i).send(mem_rsp, latency_); simobject_->MemRspPorts.at(i).send(mem_rsp, latency_);
} }
mem_req_port.pop();
} }
} }
}; };

View File

@@ -10,10 +10,22 @@ struct MemReq {
uint64_t addr; uint64_t addr;
uint32_t tag; uint32_t tag;
bool write; bool write;
bool is_io;
// Builds a memory request. Every argument is optional; the defaults describe
// a non-IO read of address 0 with tag 0.
MemReq(uint64_t _addr = 0,
       uint64_t _tag = 0,
       bool _write = false,
       bool _is_io = false)
  : addr(_addr)
  // `tag` is a 32-bit field: make the truncation of the 64-bit argument
  // explicit instead of relying on a silent narrowing conversion.
  // NOTE(review): MemRsp::tag is 64-bit -- confirm whether this field should
  // be widened to match.
  , tag(static_cast<uint32_t>(_tag))
  , write(_write)
  , is_io(_is_io)
{}
}; };
struct MemRsp { struct MemRsp {
uint32_t tag; uint64_t tag;
MemRsp(uint64_t _tag = 0) : tag (_tag) {}
}; };
class MemSim : public SimObject<MemSim>{ class MemSim : public SimObject<MemSim>{

View File

@@ -5,11 +5,12 @@
#include <iostream> #include <iostream>
#include <util.h> #include <util.h>
#include "types.h" #include "types.h"
#include "archdef.h"
#include "debug.h" #include "debug.h"
namespace vortex { namespace vortex {
struct pipeline_state_t { struct pipeline_trace_t {
//-- //--
uint64_t id; uint64_t id;
@@ -20,17 +21,24 @@ struct pipeline_state_t {
Word PC; Word PC;
//-- //--
bool stall_warp; bool fetch_stall;
bool pipeline_stall;
//--
bool wb; bool wb;
RegType rdest_type; RegType rdest_type;
int rdest; int rdest;
//--
RegMask used_iregs; RegMask used_iregs;
RegMask used_fregs; RegMask used_fregs;
RegMask used_vregs; RegMask used_vregs;
//- //-
ExeType exe_type; ExeType exe_type;
std::vector<uint64_t> mem_addrs;
//--
std::vector<std::vector<uint64_t>> mem_addrs;
//-- //--
union { union {
@@ -51,27 +59,37 @@ struct pipeline_state_t {
// stats // stats
uint64_t icache_latency; uint64_t icache_latency;
uint64_t dcache_latency; uint64_t dcache_latency;
uint64_t tex_latency;
void clear() { pipeline_trace_t(uint64_t id_, const ArchDef& arch) {
id = id_;
cid = 0; cid = 0;
wid = 0; wid = 0;
tmask.reset(); tmask.reset();
PC = 0; PC = 0;
stall_warp = false; fetch_stall = false;
wb = false; pipeline_stall = false;
wb = false;
rdest = 0; rdest = 0;
rdest_type = RegType::None; rdest_type = RegType::None;
used_iregs.reset(); used_iregs.reset();
used_fregs.reset(); used_fregs.reset();
used_vregs.reset(); used_vregs.reset();
exe_type = ExeType::NOP; exe_type = ExeType::NOP;
mem_addrs.clear(); mem_addrs.resize(arch.num_threads());
icache_latency = 0; icache_latency = 0;
dcache_latency = 0; dcache_latency = 0;
tex_latency = 0;
}
// Records the current stall condition in pipeline_stall and reports whether
// it is news. Returns false only when `stall` is true and the previously
// recorded condition was "not stalled" (the first cycle of a stall); returns
// true in every other case.
bool check_stalled(bool stall) {
  const bool was_stalled = pipeline_stall;
  pipeline_stall = stall;
  return !stall || was_stalled;
}
}; };
inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) { inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) {
os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC; os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
os << ", wb=" << state.wb; os << ", wb=" << state.wb;
if (state.wb) { if (state.wb) {
@@ -82,10 +100,9 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state)
return os; return os;
} }
class PipelineStage : public Queue<pipeline_state_t> { class PipelineStage : public Queue<pipeline_trace_t*> {
protected: protected:
const char* name_; const char* name_;
friend std::ostream &operator<<(std::ostream &, const pipeline_state_t&);
public: public:
PipelineStage(const char* name = nullptr) PipelineStage(const char* name = nullptr)

View File

@@ -33,7 +33,8 @@ Processor::Processor(const ArchDef& arch)
L3_NUM_BANKS, // number of banks L3_NUM_BANKS, // number of banks
L3_NUM_PORTS, // number of ports L3_NUM_PORTS, // number of ports
NUM_CLUSTERS, // request size NUM_CLUSTERS, // request size
true, // write-throught true, // write-through
false, // write response
0, // victim size 0, // victim size
L3_MSHR_SIZE, // mshr L3_MSHR_SIZE, // mshr
2, // pipeline latency 2, // pipeline latency
@@ -74,7 +75,8 @@ Processor::Processor(const ArchDef& arch)
L2_NUM_BANKS, // number of banks L2_NUM_BANKS, // number of banks
L2_NUM_PORTS, // number of ports L2_NUM_PORTS, // number of ports
NUM_CORES, // request size NUM_CORES, // request size
true, // write-throught true, // write-through
false, // write response
0, // victim size 0, // victim size
L2_MSHR_SIZE, // mshr L2_MSHR_SIZE, // mshr
2, // pipeline latency 2, // pipeline latency
@@ -129,7 +131,7 @@ int Processor::run() {
if (core->running()) { if (core->running()) {
running = true; running = true;
} }
if (core->check_ebreak()) { if (core->check_exit()) {
exitcode = core->getIRegValue(3); exitcode = core->getIRegValue(3);
running = false; running = false;
break; break;
@@ -137,5 +139,7 @@ int Processor::run() {
} }
} while (running); } while (running);
std::cout << std::flush;
return exitcode; return exitcode;
} }

View File

@@ -7,6 +7,12 @@ namespace vortex {
class Scoreboard { class Scoreboard {
private: private:
// One pending register dependency, as returned by get_uses().
// Kept as a plain aggregate: get_uses() builds it positionally with
// {type, reg, owner}, so the field order must not change.
struct reg_use_t {
// register file the dependency belongs to (Integer, Float or Vector)
RegType type;
// register index within that file
uint32_t reg;
// id of the trace that reserved the register (value stored by reserve())
uint64_t owner;
};
std::vector<RegMask> in_use_iregs_; std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_; std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_; std::vector<RegMask> in_use_vregs_;
@@ -25,21 +31,21 @@ public:
} }
} }
bool in_use(const pipeline_state_t& state) const { bool in_use(pipeline_trace_t* state) const {
return (state.used_iregs & in_use_iregs_.at(state.wid)) != 0 return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0
|| (state.used_fregs & in_use_fregs_.at(state.wid)) != 0 || (state->used_fregs & in_use_fregs_.at(state->wid)) != 0
|| (state.used_vregs & in_use_vregs_.at(state.wid)) != 0; || (state->used_vregs & in_use_vregs_.at(state->wid)) != 0;
} }
std::vector<uint64_t> owners(const pipeline_state_t& state) const { std::vector<reg_use_t> get_uses(pipeline_trace_t* state) const {
std::vector<uint64_t> out; std::vector<reg_use_t> out;
{ {
uint32_t r = 0; uint32_t r = 0;
auto used_iregs = state.used_iregs & in_use_iregs_.at(state.wid); auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid);
while (used_iregs.any()) { while (used_iregs.any()) {
if (used_iregs.test(0)) { if (used_iregs.test(0)) {
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Integer; uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer;
out.push_back(owners_.at(tag)); out.push_back({RegType::Integer, r, owners_.at(tag)});
} }
used_iregs >>= 1; used_iregs >>= 1;
++r; ++r;
@@ -47,11 +53,11 @@ public:
} }
{ {
uint32_t r = 0; uint32_t r = 0;
auto used_fregs = state.used_fregs & in_use_fregs_.at(state.wid); auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid);
while (used_fregs.any()) { while (used_fregs.any()) {
if (used_fregs.test(0)) { if (used_fregs.test(0)) {
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Float; uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float;
out.push_back(owners_.at(tag)); out.push_back({RegType::Float, r, owners_.at(tag)});
} }
used_fregs >>= 1; used_fregs >>= 1;
++r; ++r;
@@ -59,11 +65,11 @@ public:
} }
{ {
uint32_t r = 0; uint32_t r = 0;
auto used_vregs = state.used_vregs & in_use_vregs_.at(state.wid); auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid);
while (used_vregs.any()) { while (used_vregs.any()) {
if (used_vregs.test(0)) { if (used_vregs.test(0)) {
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Vector; uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector;
out.push_back(owners_.at(tag)); out.push_back({RegType::Vector, r, owners_.at(tag)});
} }
used_vregs >>= 1; used_vregs >>= 1;
++r; ++r;
@@ -72,44 +78,44 @@ public:
return std::move(out); return std::move(out);
} }
void reserve(const pipeline_state_t& state) { void reserve(pipeline_trace_t* state) {
if (!state.wb) if (!state->wb)
return; return;
switch (state.rdest_type) { switch (state->rdest_type) {
case RegType::Integer: case RegType::Integer:
in_use_iregs_.at(state.wid).set(state.rdest); in_use_iregs_.at(state->wid).set(state->rdest);
break; break;
case RegType::Float: case RegType::Float:
in_use_fregs_.at(state.wid).set(state.rdest); in_use_fregs_.at(state->wid).set(state->rdest);
break; break;
case RegType::Vector: case RegType::Vector:
in_use_vregs_.at(state.wid).set(state.rdest); in_use_vregs_.at(state->wid).set(state->rdest);
break; break;
default: default:
break; break;
} }
uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type; uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
assert(owners_.count(tag) == 0); assert(owners_.count(tag) == 0);
owners_[tag] = state.id; owners_[tag] = state->id;
} }
void release(const pipeline_state_t& state) { void release(pipeline_trace_t* state) {
if (!state.wb) if (!state->wb)
return; return;
switch (state.rdest_type) { switch (state->rdest_type) {
case RegType::Integer: case RegType::Integer:
in_use_iregs_.at(state.wid).reset(state.rdest); in_use_iregs_.at(state->wid).reset(state->rdest);
break; break;
case RegType::Float: case RegType::Float:
in_use_fregs_.at(state.wid).reset(state.rdest); in_use_fregs_.at(state->wid).reset(state->rdest);
break; break;
case RegType::Vector: case RegType::Vector:
in_use_vregs_.at(state.wid).reset(state.rdest); in_use_vregs_.at(state->wid).reset(state->rdest);
break; break;
default: default:
break; break;
} }
uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type; uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
owners_.erase(tag); owners_.erase(tag);
} }
}; };

91
sim/simX/tex_unit.cpp Normal file
View File

@@ -0,0 +1,91 @@
#include "tex_unit.h"
#include "core.h"
#include <texturing.h>
#include <VX_config.h>
using namespace vortex;
// Texture filtering modes. TexUnit::read casts the TEX_STATE_FILTER register
// value to this enum, so the enumerator order defines the register encoding.
enum class FilterMode {
// one texel fetched per sample (TexAddressPoint / TexFilterPoint)
Point,
// four texels fetched and blended (TexAddressLinear / TexFilterLinear)
Bilinear,
// declared, but TexUnit::read has no case for it and aborts
Trilinear,
};
TexUnit::TexUnit(Core* core) : core_(core) {}
// Nothing to release: core_ is a non-owning pointer.
TexUnit::~TexUnit() = default;
// Returns the current value of texture state register `state`.
// The lookup is bounds-checked (std::array::at).
uint32_t TexUnit::get_state(uint32_t state) {
  const uint32_t value = states_.at(state);
  return value;
}
// Overwrites texture state register `state` with `value`.
// The lookup is bounds-checked (std::array::at).
void TexUnit::set_state(uint32_t state, uint32_t value) {
  uint32_t& slot = states_.at(state);
  slot = value;
}
uint32_t TexUnit::read(int32_t u,
int32_t v,
int32_t lod,
std::vector<uint64_t>* mem_addrs) {
//--
auto xu = Fixed<TEX_FXD_FRAC>::make(u);
auto xv = Fixed<TEX_FXD_FRAC>::make(v);
uint32_t base_addr = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod));
uint32_t log_width = std::max<int32_t>(states_.at(TEX_STATE_WIDTH) - lod, 0);
uint32_t log_height = std::max<int32_t>(states_.at(TEX_STATE_HEIGHT) - lod, 0);
auto format = (TexFormat)states_.at(TEX_STATE_FORMAT);
auto filter = (FilterMode)states_.at(TEX_STATE_FILTER);
auto wrapu = (WrapMode)states_.at(TEX_STATE_WRAPU);
auto wrapv = (WrapMode)states_.at(TEX_STATE_WRAPV);
auto stride = Stride(format);
switch (filter) {
case FilterMode::Bilinear: {
// addressing
uint32_t offset00, offset01, offset10, offset11;
uint32_t alpha, beta;
TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv,
&offset00, &offset01, &offset10, &offset11, &alpha, &beta);
uint32_t addr00 = base_addr + offset00 * stride;
uint32_t addr01 = base_addr + offset01 * stride;
uint32_t addr10 = base_addr + offset10 * stride;
uint32_t addr11 = base_addr + offset11 * stride;
// memory lookup
uint32_t texel00 = core_->dcache_read(addr00, stride);
uint32_t texel01 = core_->dcache_read(addr01, stride);
uint32_t texel10 = core_->dcache_read(addr10, stride);
uint32_t texel11 = core_->dcache_read(addr11, stride);
mem_addrs->push_back(addr00);
mem_addrs->push_back(addr01);
mem_addrs->push_back(addr10);
mem_addrs->push_back(addr11);
// filtering
auto color = TexFilterLinear(
format, texel00, texel01, texel10, texel11, alpha, beta);
return color;
}
case FilterMode::Point: {
// addressing
uint32_t offset;
TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
uint32_t addr = base_addr + offset * stride;
// memory lookup
uint32_t texel = core_->dcache_read(addr, stride);
mem_addrs->push_back(addr);
// filtering
auto color = TexFilterPoint(format, texel);
return color;
}
default:
std::abort();
return 0;
}
}

26
sim/simX/tex_unit.h Normal file
View File

@@ -0,0 +1,26 @@
#pragma once
#include "types.h"
namespace vortex {
class Core;
// Functional model of the texture unit: holds the texture state registers
// and performs texel lookups through the owning core.
class TexUnit {
public:
// `core` is stored as-is and used by read() for data-cache reads.
TexUnit(Core* core);
~TexUnit();
// Returns the value of state register `state` (bounds-checked).
uint32_t get_state(uint32_t state);
// Sets state register `state` to `value` (bounds-checked).
void set_state(uint32_t state, uint32_t value);
// Samples the configured texture at (u, v) for mip level `lod`; the texel
// addresses that were read are appended to *mem_addrs.
uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<uint64_t>* mem_addrs);
private:
// texture state registers, indexed by the TEX_STATE_* constants
std::array<uint32_t, NUM_TEX_STATES> states_;
// owning core (non-owning pointer)
Core* core_;
};
}

View File

@@ -66,6 +66,7 @@ enum class AluType {
BRANCH, BRANCH,
IMUL, IMUL,
IDIV, IDIV,
CMOV,
}; };
inline std::ostream &operator<<(std::ostream &os, const AluType& type) { inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
@@ -74,6 +75,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
case AluType::BRANCH: os << "BRANCH"; break; case AluType::BRANCH: os << "BRANCH"; break;
case AluType::IMUL: os << "IMUL"; break; case AluType::IMUL: os << "IMUL"; break;
case AluType::IDIV: os << "IDIV"; break; case AluType::IDIV: os << "IDIV"; break;
case AluType::CMOV: os << "CMOV"; break;
} }
return os; return os;
} }
@@ -155,8 +157,6 @@ class Queue {
protected: protected:
std::queue<T> queue_; std::queue<T> queue_;
uint32_t count;
public: public:
Queue() {} Queue() {}
@@ -168,21 +168,16 @@ public:
return queue_.front(); return queue_.front();
} }
void push(const T& value) { T& top() {
++count; return queue_.front();
queue_.push(value);
} }
void pop() { void pop() {
queue_.pop(); queue_.pop();
} }
bool try_pop(T* value) { void push(const T& value) {
if (queue_.empty()) queue_.push(value);
return false;
*value = queue_.front();
queue_.pop();
return true;
} }
}; };
@@ -244,14 +239,6 @@ public:
entry.first = false; entry.first = false;
--capacity_; --capacity_;
} }
void remove(uint32_t index, T* value) {
auto& entry = entries_.at(index);
assert(entry.first);
*value = entry.second;
entry.first = false;
--capacity_;
}
}; };
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@@ -259,18 +246,7 @@ public:
template <typename Req, typename Rsp, uint32_t MaxInputs = 32> template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
class Switch : public SimObject<Switch<Req, Rsp>> { class Switch : public SimObject<Switch<Req, Rsp>> {
private: private:
struct req_batch_t {
std::vector<Req> data;
std::bitset<MaxInputs> valid;
req_batch_t() {}
req_batch_t(uint32_t size)
: data(size)
, valid(0)
{}
};
ArbiterType type_; ArbiterType type_;
std::queue<req_batch_t> reqq_;
uint32_t delay_; uint32_t delay_;
uint32_t cursor_; uint32_t cursor_;
uint32_t tag_shift_; uint32_t tag_shift_;
@@ -295,55 +271,43 @@ public:
{ {
assert(delay_ != 0); assert(delay_ != 0);
assert(num_inputs <= MaxInputs); assert(num_inputs <= MaxInputs);
if (num_inputs == 1) {
// bypass
ReqIn.at(0).bind(&ReqOut);
RspIn.bind(&RspOut.at(0));
}
} }
void step(uint64_t /*cycle*/) { void step(uint64_t /*cycle*/) {
// process incomming requests if (ReqIn.size() == 1)
{ return;
req_batch_t req_batch(ReqIn.size());
for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) { // process incomming requests
Req req; for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
if (ReqIn.at(i).read(&req)) { uint32_t j = (cursor_ + i) % n;
req_batch.data.at(i) = req; auto& req_in = ReqIn.at(j);
req_batch.valid.set(i); if (!req_in.empty()) {
auto& req = req_in.top();
if (tag_shift_) {
req.tag = (req.tag << tag_shift_) | j;
} }
ReqOut.send(req, delay_);
req_in.pop();
this->update_cursor(j);
break;
} }
if (req_batch.valid.any()) {
reqq_.push(req_batch);
}
}
// apply arbitration
if (!reqq_.empty()) {
auto& req_batch = reqq_.front();
for (uint32_t i = 0, n = req_batch.data.size(); i < n; ++i) {
auto j = (cursor_ + i) % n;
if (req_batch.valid.test(j)) {
auto& req = req_batch.data.at(j);
if (tag_shift_) {
req.tag = (req.tag << tag_shift_) | j;
}
ReqOut.send(req, delay_);
req_batch.valid.reset(j);
this->update_cursor(j);
if (!req_batch.valid.any())
reqq_.pop(); // pop when empty
break;
}
}
} }
// process incoming responses // process incoming responses
{ if (!RspIn.empty()) {
Rsp rsp; auto& rsp = RspIn.top();
if (RspIn.read(&rsp)) { uint32_t port_id = 0;
uint32_t port_id = 0; if (tag_shift_) {
if (tag_shift_) { port_id = rsp.tag & ((1 << tag_shift_)-1);
port_id = rsp.tag & ((1 << tag_shift_)-1); rsp.tag >>= tag_shift_;
rsp.tag >>= tag_shift_; }
} RspOut.at(port_id).send(rsp, 1);
RspOut.at(port_id).send(rsp, 1); RspIn.pop();
}
} }
} }

View File

@@ -21,7 +21,7 @@ Warp::Warp(Core *core, Word id)
vRegFile_.resize(core_->arch().num_regs(), std::vector<Byte>(core_->arch().vsize(), 0)); vRegFile_.resize(core_->arch().num_regs(), std::vector<Byte>(core_->arch().vsize(), 0));
} }
void Warp::eval(pipeline_state_t *pipeline_state) { void Warp::eval(pipeline_trace_t *trace) {
assert(tmask_.any()); assert(tmask_.any());
DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask="); DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
@@ -38,18 +38,18 @@ void Warp::eval(pipeline_state_t *pipeline_state) {
std::abort(); std::abort();
} }
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr); DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr << " (#" << trace->id << ")");
// Update state // Update trace
pipeline_state->cid = core_->id(); trace->cid = core_->id();
pipeline_state->wid = id_; trace->wid = id_;
pipeline_state->PC = PC_; trace->PC = PC_;
pipeline_state->tmask = tmask_; trace->tmask = tmask_;
pipeline_state->rdest = instr->getRDest(); trace->rdest = instr->getRDest();
pipeline_state->rdest_type = instr->getRDType(); trace->rdest_type = instr->getRDType();
// Execute // Execute
this->execute(*instr, pipeline_state); this->execute(*instr, trace);
DP(4, "Register state:"); DP(4, "Register state:");
for (int i = 0; i < core_->arch().num_regs(); ++i) { for (int i = 0; i < core_->arch().num_regs(); ++i) {

View File

@@ -9,7 +9,7 @@ namespace vortex {
class Core; class Core;
class Instr; class Instr;
class pipeline_state_t; class pipeline_trace_t;
struct DomStackEntry { struct DomStackEntry {
DomStackEntry(const ThreadMask &tmask, Word PC) DomStackEntry(const ThreadMask &tmask, Word PC)
: tmask(tmask) : tmask(tmask)
@@ -83,11 +83,11 @@ public:
return iRegFile_.at(0).at(reg); return iRegFile_.at(0).at(reg);
} }
void eval(pipeline_state_t *); void eval(pipeline_trace_t *);
private: private:
void execute(const Instr &instr, pipeline_state_t *pipeline_state); void execute(const Instr &instr, pipeline_trace_t *trace);
Word id_; Word id_;
Core *core_; Core *core_;

View File

@@ -24,7 +24,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_FLAGS += $(DBG_TRACE_FLAGS) DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
@@ -51,10 +50,13 @@ CXXFLAGS += $(CONFIGS)
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') #THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
#VL_FLAGS += --threads $(THREADS) #VL_FLAGS += --threads $(THREADS)
# Enable VCD trace
#VCD_TRACE = -DVCD_OUTPUT
# Debugging # Debugging
ifdef DEBUG ifdef DEBUG
VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS) VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS) CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS)
else else
VL_FLAGS += -DNDEBUG VL_FLAGS += -DNDEBUG
CXXFLAGS += -O2 -DNDEBUG CXXFLAGS += -O2 -DNDEBUG

View File

@@ -9,8 +9,8 @@ VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections VX_CFLAGS += -std=c++11 -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw VX_CFLAGS += -DENABLE_SW -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
@@ -21,7 +21,7 @@ CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors
CXXFLAGS += -DLUPNG_USE_ZLIB CXXFLAGS += -DLUPNG_USE_ZLIB
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex -lz LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex -lz
@@ -38,7 +38,7 @@ kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin $(VX_CP) -O binary kernel.elf kernel.bin
kernel.elf: $(VX_SRCS) kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS) $(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

View File

@@ -1,25 +1,27 @@
#ifndef _COMMON_H_ #ifndef _COMMON_H_
#define _COMMON_H_ #define _COMMON_H_
#include <VX_config.h>
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 #define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct { typedef struct {
uint32_t num_tasks; bool use_sw;
uint8_t format; uint32_t num_tasks;
uint8_t filter; uint8_t format;
uint8_t wrap; uint8_t filter;
uint8_t use_sw; uint8_t wrapu;
uint32_t lod; uint8_t wrapv;
uint8_t src_logWidth; uint8_t src_logwidth;
uint8_t src_logHeight; uint8_t src_logheight;
uint8_t src_stride; uint32_t src_addr;
uint8_t src_pitch; float lod;
uint32_t src_ptr; uint32_t mip_offs[TEX_LOD_MAX+1];
uint32_t dst_width; uint32_t dst_width;
uint32_t dst_height; uint32_t dst_height;
uint8_t dst_stride; uint8_t dst_stride;
uint32_t dst_pitch; uint32_t dst_pitch;
uint32_t dst_ptr; uint32_t dst_addr;
} kernel_arg_t; } kernel_arg_t;
#endif #endif

View File

@@ -1,11 +1,9 @@
#include <stdint.h> #include <stdint.h>
#include <vx_intrinsics.h> #include <vx_intrinsics.h>
#include <vx_spawn.h> #include <vx_spawn.h>
#include "common.h" #include <vx_print.h>
#include "texsw.h" #include "texsw.h"
#define ENABLE_SW
typedef struct { typedef struct {
kernel_arg_t* state; kernel_arg_t* state;
uint32_t tile_width; uint32_t tile_width;
@@ -14,29 +12,50 @@ typedef struct {
float deltaY; float deltaY;
} tile_arg_t; } tile_arg_t;
// Compile-time unrolled loop over the half-open range [Start, End).
// Invoking an instance calls the functor once per index, in ascending order,
// by recursively instantiating the template for Start+1 until Start == End.
template <typename T, T Start, T End>
struct static_for_t {
  template <typename Fn>
  inline void operator()(const Fn& callback) const {
    // remaining iterations: [Start+1, End)
    using tail_t = static_for_t<T, Start + 1, End>;
    callback(Start);
    tail_t{}(callback);
  }
};

// Recursion terminator: an empty range invokes nothing.
template <typename T, T N>
struct static_for_t<T, N, N> {
  template <typename Fn>
  inline void operator()(const Fn&) const {
    // nothing left to visit
  }
};
void kernel_body(int task_id, tile_arg_t* arg) { void kernel_body(int task_id, tile_arg_t* arg) {
kernel_arg_t* state = arg->state; kernel_arg_t* state = arg->state;
uint32_t xoffset = 0; uint32_t xoffset = 0;
uint32_t yoffset = task_id * arg->tile_height; uint32_t yoffset = task_id * arg->tile_height;
uint8_t* dst_ptr = (uint8_t*)(state->dst_ptr + xoffset * state->dst_stride + yoffset * state->dst_pitch);
float fv = yoffset * arg->deltaY; uint8_t* dst_ptr = (uint8_t*)(state->dst_addr + xoffset * state->dst_stride + yoffset * state->dst_pitch);
Fixed<16> xlod(state->lod);
/*vx_printf("task_id=%d, deltaX=%f, deltaY=%f, tile_width=%d, tile_height=%d\n",
task_id, arg->deltaX, arg->deltaY, arg->tile_width, arg->tile_height);*/
float fv = (yoffset + 0.5f) * arg->deltaY;
for (uint32_t y = 0; y < arg->tile_height; ++y) { for (uint32_t y = 0; y < arg->tile_height; ++y) {
uint32_t* dst_row = (uint32_t*)dst_ptr; uint32_t* dst_row = (uint32_t*)dst_ptr;
float fu = xoffset * arg->deltaX; float fu = (xoffset + 0.5f) * arg->deltaX;
for (uint32_t x = 0; x < arg->tile_width; ++x) { for (uint32_t x = 0; x < arg->tile_width; ++x) {
int32_t u = (int32_t)(fu * (1<<20)); Fixed<TEX_FXD_FRAC> xu(fu);
int32_t v = (int32_t)(fv * (1<<20)); Fixed<TEX_FXD_FRAC> xv(fv);
uint32_t color;
#ifdef ENABLE_SW #ifdef ENABLE_SW
if (state->use_sw) { if (state->use_sw)
dst_row[x] = (state->filter == 2) ? tex3_sw(state, 0, u, v, state->lod) : tex_sw(state, 0, u, v, state->lod); color = tex_load_sw(state, xu, xv, xlod);
} else { else
#endif
dst_row[x] = (state->filter == 2) ? vx_tex3(0, u, v, state->lod) : vx_tex(0, u, v, state->lod);
#ifdef ENABLE_SW
}
#endif #endif
color = tex_load_hw(state, xu, xv, xlod);
//vx_printf("task_id=%d, x=%d, y=%d, fu=%f, fv=%f, xu=0x%x, xv=0x%x, color=0x%x\n", task_id, x, y, fu, fv, xu.data(), xv.data(), color);
dst_row[x] = color;
fu += arg->deltaX; fu += arg->deltaX;
} }
dst_ptr += state->dst_pitch; dst_ptr += state->dst_pitch;
@@ -48,13 +67,16 @@ int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
// configure texture unit // configure texture unit
vx_csr_write(CSR_TEX_ADDR(0), arg->src_ptr); csr_write(CSR_TEX(0, TEX_STATE_WIDTH), arg->src_logwidth);
vx_csr_write(CSR_TEX_MIPOFF(0), 0); csr_write(CSR_TEX(0, TEX_STATE_HEIGHT), arg->src_logheight);
vx_csr_write(CSR_TEX_WIDTH(0), arg->src_logWidth); csr_write(CSR_TEX(0, TEX_STATE_FORMAT), arg->format);
vx_csr_write(CSR_TEX_HEIGHT(0), arg->src_logHeight); csr_write(CSR_TEX(0, TEX_STATE_WRAPU), arg->wrapu);
vx_csr_write(CSR_TEX_FORMAT(0), arg->format); csr_write(CSR_TEX(0, TEX_STATE_WRAPV), arg->wrapv);
vx_csr_write(CSR_TEX_WRAP(0), (arg->wrap << 2) | arg->wrap); csr_write(CSR_TEX(0, TEX_STATE_FILTER), (arg->filter ? 1 : 0));
vx_csr_write(CSR_TEX_FILTER(0), (arg->filter ? 1 : 0)); csr_write(CSR_TEX(0, TEX_STATE_ADDR), arg->src_addr);
static_for_t<int, 0, TEX_LOD_MAX+1>()([&](int i) {
csr_write(CSR_TEX(0, TEX_STATE_MIPOFF(i)), arg->mip_offs[i]);
});
tile_arg_t targ; tile_arg_t targ;
targ.state = arg; targ.state = arg;
@@ -64,4 +86,9 @@ int main() {
targ.deltaY = 1.0f / arg->dst_height; targ.deltaY = 1.0f / arg->dst_height;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, &targ); vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, &targ);
/*for (uint32_t t=0; t < arg->num_tasks; ++t) {
kernel_body(t, &targ);
}*/
return 0;
} }

View File

@@ -25,10 +25,11 @@ const char* kernel_file = "kernel.bin";
const char* input_file = "palette64.png"; const char* input_file = "palette64.png";
const char* output_file = "output.png"; const char* output_file = "output.png";
int wrap = 0; int wrap = 0;
int filter = 0; int filter = 0; // 0-> point, 1->bilinear, 2->trilinear
float scale = 1.0f; float scale = 1.0f;
int format = 0; int format = 0;
bool use_sw = false; bool use_sw = false;
float lod = 1.0f; // >= 1.0f
ePixelFormat eformat = FORMAT_A8R8G8B8; ePixelFormat eformat = FORMAT_A8R8G8B8;
vx_device_h device = nullptr; vx_device_h device = nullptr;
@@ -36,7 +37,7 @@ vx_buffer_h buffer = nullptr;
static void show_usage() { static void show_usage() {
std::cout << "Vortex Texture Test." << std::endl; std::cout << "Vortex Texture Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-z no_hw] [-h: help]" << std::endl; std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-l lod] [-z no_hw] [-h: help]" << std::endl;
} }
static void parse_args(int argc, char **argv) { static void parse_args(int argc, char **argv) {
@@ -55,6 +56,9 @@ static void parse_args(int argc, char **argv) {
case 'w': case 'w':
wrap = std::atoi(optarg); wrap = std::atoi(optarg);
break; break;
case 'l':
lod = std::stof(optarg, NULL);
break;
case 'z': case 'z':
use_sw = true; use_sw = true;
break; break;
@@ -118,7 +122,7 @@ int run_test(const kernel_arg_t& kernel_arg,
// download destination buffer // download destination buffer
std::cout << "download destination buffer" << std::endl; std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0)); RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_addr, buf_size, 0));
std::vector<uint8_t> dst_pixels(buf_size); std::vector<uint8_t> dst_pixels(buf_size);
auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
@@ -137,25 +141,39 @@ int run_test(const kernel_arg_t& kernel_arg,
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
kernel_arg_t kernel_arg; kernel_arg_t kernel_arg;
std::vector<uint8_t> src_pixels; std::vector<uint8_t> src_pixels;
std::vector<uint32_t> mip_offsets;
uint32_t src_width; uint32_t src_width;
uint32_t src_height; uint32_t src_height;
// parse command arguments // parse command arguments
parse_args(argc, argv); parse_args(argc, argv);
RT_CHECK(LoadImage(input_file, eformat, src_pixels, &src_width, &src_height)); {
std::vector<uint8_t> staging;
RT_CHECK(LoadImage(input_file, eformat, staging, &src_width, &src_height));
RT_CHECK(GenerateMipmaps(src_pixels, mip_offsets, staging, eformat, src_width, src_height));
//uint32_t src_bpp = Format::GetInfo(eformat).BytePerPixel;
//dump_image(src_pixels, src_pixels.size() / src_bpp, 1, src_bpp);
}
// check power of two support // check power of two support
if (!ISPOW2(src_width) || !ISPOW2(src_height)) { if (!ispow2(src_width) || !ispow2(src_height)) {
std::cout << "Error: only power of two textures supported: width=" << src_width << ", heigth=" << src_height << std::endl; std::cout << "Error: only power of two textures supported: width=" << src_width << ", heigth=" << src_height << std::endl;
return -1; return -1;
} }
uint32_t src_bpp = Format::GetInfo(eformat).BytePerPixel; uint32_t src_logwidth = log2ceil(src_width);
uint32_t src_logheight = log2ceil(src_height);
//dump_image(src_pixels, src_width, src_height, src_bpp);
uint32_t src_bufsize = src_bpp * src_width * src_height; uint32_t src_max_lod = std::max(src_logwidth, src_logheight);
if (lod > src_max_lod) {
std::cout << "Error: out-of-bound level-of-detail: lod=" << lod << ", source image=" << src_max_lod << std::endl;
return -1;
}
uint32_t src_bufsize = src_pixels.size();
uint32_t dst_width = (uint32_t)(src_width * scale); uint32_t dst_width = (uint32_t)(src_width * scale);
uint32_t dst_height = (uint32_t)(src_height * scale); uint32_t dst_height = (uint32_t)(src_height * scale);
@@ -183,7 +201,7 @@ int main(int argc, char *argv[]) {
// allocate device memory // allocate device memory
std::cout << "allocate device memory" << std::endl; std::cout << "allocate device memory" << std::endl;
size_t src_addr, dst_addr; uint64_t src_addr, dst_addr;
RT_CHECK(vx_alloc_dev_mem(device, src_bufsize, &src_addr)); RT_CHECK(vx_alloc_dev_mem(device, src_bufsize, &src_addr));
RT_CHECK(vx_alloc_dev_mem(device, dst_bufsize, &dst_addr)); RT_CHECK(vx_alloc_dev_mem(device, dst_bufsize, &dst_addr));
@@ -192,32 +210,37 @@ int main(int argc, char *argv[]) {
// allocate staging shared memory // allocate staging shared memory
std::cout << "allocate shared memory" << std::endl; std::cout << "allocate shared memory" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(sizeof(kernel_arg_t), std::max<uint32_t>(src_bufsize, dst_bufsize)); uint32_t alloc_size = std::max<uint32_t>(sizeof(kernel_arg_t),
std::max<uint32_t>(src_bufsize, dst_bufsize));
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer)); RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer));
// upload kernel argument // upload kernel argument
std::cout << "upload kernel argument" << std::endl; std::cout << "upload kernel argument" << std::endl;
{ {
kernel_arg.use_sw = use_sw;
kernel_arg.num_tasks = std::min<uint32_t>(num_tasks, dst_height); kernel_arg.num_tasks = std::min<uint32_t>(num_tasks, dst_height);
kernel_arg.format = format; kernel_arg.format = format;
kernel_arg.filter = filter; kernel_arg.filter = filter;
kernel_arg.wrap = wrap; kernel_arg.wrapu = wrap;
kernel_arg.use_sw = use_sw; kernel_arg.wrapv = wrap;
kernel_arg.lod = 0x0;
kernel_arg.src_logWidth = (uint32_t)std::log2(src_width); kernel_arg.src_logwidth = src_logwidth;
kernel_arg.src_logHeight = (uint32_t)std::log2(src_height); kernel_arg.src_logheight = src_logheight;
kernel_arg.src_stride = src_bpp; kernel_arg.src_addr = src_addr;
kernel_arg.src_pitch = src_bpp * src_width; kernel_arg.lod = lod;
kernel_arg.src_ptr = src_addr;
for (uint32_t i = 0; i < mip_offsets.size(); ++i) {
assert(i < TEX_LOD_MAX);
kernel_arg.mip_offs[i] = mip_offsets.at(i);
}
kernel_arg.dst_width = dst_width; kernel_arg.dst_width = dst_width;
kernel_arg.dst_height = dst_height; kernel_arg.dst_height = dst_height;
kernel_arg.dst_stride = dst_bpp; kernel_arg.dst_stride = dst_bpp;
kernel_arg.dst_pitch = dst_bpp * dst_width; kernel_arg.dst_pitch = dst_bpp * dst_width;
kernel_arg.dst_ptr = dst_addr; kernel_arg.dst_addr = dst_addr;
auto buf_ptr = (int*)vx_host_ptr(buffer); auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
} }
@@ -225,21 +248,21 @@ int main(int argc, char *argv[]) {
// upload source buffer // upload source buffer
std::cout << "upload source buffer" << std::endl; std::cout << "upload source buffer" << std::endl;
{ {
auto buf_ptr = (int8_t*)vx_host_ptr(buffer); auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < src_bufsize; ++i) { for (uint32_t i = 0; i < src_bufsize; ++i) {
buf_ptr[i] = src_pixels[i]; buf_ptr[i] = src_pixels[i];
} }
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, src_bufsize, 0)); RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_addr, src_bufsize, 0));
} }
// clear destination buffer // clear destination buffer
std::cout << "clear destination buffer" << std::endl; std::cout << "clear destination buffer" << std::endl;
{ {
auto buf_ptr = (int32_t*)vx_host_ptr(buffer); auto buf_ptr = (uint32_t*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < (dst_bufsize/4); ++i) { for (uint32_t i = 0; i < (dst_bufsize/4); ++i) {
buf_ptr[i] = 0xdeadbeef; buf_ptr[i] = 0xdeadbeef;
} }
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, dst_bufsize, 0)); RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_addr, dst_bufsize, 0));
} }
// run tests // run tests

View File

@@ -1,167 +1,122 @@
#ifndef _TEXSW_H_ #pragma once
#include <vx_intrinsics.h>
#include <texturing.h>
#include "common.h" #include "common.h"
#define TEX_LOD_MAX 11 inline uint32_t texel_read(uint8_t* address, uint32_t stride) {
switch (stride) {
#define MIN(x, y) ((x < y) ? (x) : (y)) case 1: return *(uint8_t*)address;
case 2: return *(uint16_t*)address;
#define MAX(x, y) ((x > y) ? (x) : (y)) case 4: return *(uint32_t*)address;
default:
inline int address(int wrap, int value) { std::abort();
switch (wrap) { return 0;
case 1: return value & 0xfffff;
default:
case 0: return MIN(MAX(value, 0), 0xfffff);
} }
} }
inline void unpack(int format, int value, int* l, int* h) { inline uint32_t vx_tex_sw(kernel_arg_t* state,
switch (format) { Fixed<TEX_FXD_FRAC> xu,
case 1: Fixed<TEX_FXD_FRAC> xv,
case 2: uint32_t lod) {
*l = value; uint8_t* base_addr = ((uint8_t*)state->src_addr) + state->mip_offs[lod];
*h = 0; uint32_t log_width = std::max<int32_t>(state->src_logwidth - lod, 0);
break; uint32_t log_height = std::max<int32_t>(state->src_logheight - lod, 0);
case 3: auto format = (TexFormat)state->format;
*l = (value | (value << 8)) & 0x00ff00ff; auto wrapu = (WrapMode)state->wrapu;
*h = 0; auto wrapv = (WrapMode)state->wrapv;
break; auto filter = state->filter;
case 4: auto stride = Stride(format);
*l = (value | (value << 16)) & 0x07e0f81f;
*h = 0;
break;
case 5:
*l = (value | (value << 12)) & 0x0f0f0f0f;
*h = 0;
break;
default:
case 0:
*l = value & 0x00ff00ff;
*h = (value >> 8) & 0x00ff00ff;
break;
}
}
inline void lerp(int al, int ah, int bl, int bh, int frac, int* l, int* h) { uint32_t color;
*l = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff;
*h = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff;
}
inline int pack(int format, int l, int h) {
switch (format) {
case 1:
case 2:
return l;
case 3:
return (l | (l >> 8)) & 0xffff;
case 4:
return (l | (l >> 16)) & 0xffff;
case 5:
return (l | (l >> 12)) & 0xffff;
default:
case 0:
return (h << 8) | l;
}
}
inline int tex_sw(kernel_arg_t* state, int stage, int u, int v, int lod) {
int base_addr = state->src_ptr;
int mip_offset = 0;
int log_width = state->src_logWidth;
int log_height = state->src_logHeight;
int format = state->format;
int wrap = state->wrap;
int filter = state->filter;
int32_t* pBits = ((uint32_t*)base_addr) + mip_offset;
if (filter) { if (filter) {
int u0 = address(wrap, u - (0x80000 >> log_width)); // addressing
int v0 = address(wrap, v - (0x80000 >> log_height)); uint32_t offset00, offset01, offset10, offset11;
int u1 = address(wrap, u + (0x80000 >> log_width)); uint32_t alpha, beta;
int v1 = address(wrap, v + (0x80000 >> log_height)); TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv,
&offset00, &offset01, &offset10, &offset11, &alpha, &beta);
int x0 = u0 >> (20 - log_width); uint8_t* addr00 = base_addr + offset00 * stride;
int y0 = v0 >> (20 - log_height); uint8_t* addr01 = base_addr + offset01 * stride;
int x1 = u1 >> (20 - log_width); uint8_t* addr10 = base_addr + offset10 * stride;
int y1 = v1 >> (20 - log_height); uint8_t* addr11 = base_addr + offset11 * stride;
// memory lookup // memory lookup
uint32_t texel00 = texel_read(addr00, stride);
int c0 = pBits[x0 + (y0 << log_width)]; uint32_t texel01 = texel_read(addr01, stride);
int c1 = pBits[x1 + (y0 << log_width)]; uint32_t texel10 = texel_read(addr10, stride);
int c2 = pBits[x0 + (y1 << log_width)]; uint32_t texel11 = texel_read(addr11, stride);
int c3 = pBits[x1 + (y1 << log_width)];
// filtering // filtering
color = TexFilterLinear(
int alpha = x0 & 0xff; format, texel00, texel01, texel10, texel11, alpha, beta);
int beta = y0 & 0xff;
int c0a, c0b;
int c1a, c1b;
int c01a, c01b;
unpack(format, c0, &c0a, &c0b);
unpack(format, c1, &c1a, &c1b);
lerp(c0a, c0b, c1a, c1b, alpha, &c01a, &c01b);
int c2a, c2b;
int c3a, c3b;
int c23a, c23b;
unpack(format, c2, &c2a, &c2b);
unpack(format, c3, &c3a, &c3b);
lerp(c2a, c2b, c3a, c3b, alpha, &c23a, &c23b);
int c4a, c4b;
lerp(c01a, c01b, c23a, c23b, beta, &c4a, &c4b);
return pack(format, c4a, c4b);
} else { } else {
int u0 = address(wrap, u); // addressing
int v0 = address(wrap, v); uint32_t offset;
TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
uint8_t* addr = base_addr + offset * stride;
// memory lookup
uint32_t texel = texel_read(addr, stride);
int x0 = u0 >> (20 - log_width); // filtering
int y0 = v0 >> (20 - log_height); color = TexFilterPoint(format, texel);
int c0 = pBits[x0 + (y0 <<log_width)];
int c0a, c0b;
unpack(format, c0, &c0a, &c0b);
return pack(format, c0a, c0b);
} }
return color;
} }
inline int vx_tex3(int stage, int u, int v, int lod) { inline uint32_t tex_load_hw(kernel_arg_t* state,
int lodn = MIN(lod + 0x100000, TEX_LOD_MAX); Fixed<TEX_FXD_FRAC> xu,
int a = vx_tex(0, u, v, lod); Fixed<TEX_FXD_FRAC> xv,
int b = vx_tex(0, u, v, lodn); Fixed<16> xlod) {
int al = a & 0x00ff00ff; uint32_t color;
int ah = (a >> 8) & 0x00ff00ff; int32_t ilod = std::max<int32_t>(xlod.data(), Fixed<16>::ONE);
int bl = b & 0x00ff00ff; uint32_t lod = std::min<uint32_t>(log2floor(ilod) - 16, TEX_LOD_MAX);
int bh = (b >> 8) & 0x00ff00ff; if (state->filter == 2) {
int frac = (lod >> 12) & 0xff; uint32_t lod_n = std::min<uint32_t>(lod + 1, TEX_LOD_MAX);
int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; uint32_t frac = ilod >> (lod + 16 - 8);
int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; uint32_t texel0 = vx_tex(0, xu.data(), xv.data(), lod);
int c = al | (ah << 8); uint32_t texel1 = vx_tex(0, xu.data(), xv.data(), lod_n);
return c; uint32_t cl, ch;
{
uint32_t c0l, c0h;
uint32_t c1l, c1h;
Unpack8888(TexFormat::R8G8B8A8, texel0, &c0l, &c0h);
Unpack8888(TexFormat::R8G8B8A8, texel1, &c1l, &c1h);
Lerp8888(c0l, c0h, c1l, c1h, frac, &cl, &ch);
}
color = Pack8888(TexFormat::R8G8B8A8, cl, ch);
} else {
color = vx_tex(0, xu.data(), xv.data(), lod);
}
return color;
} }
inline int tex3_sw(kernel_arg_t* state, int stage, int u, int v, int lod) { inline uint32_t tex_load_sw(kernel_arg_t* state,
int lodn = MIN(lod + 0x10000, TEX_LOD_MAX); Fixed<TEX_FXD_FRAC> xu,
int a = tex_sw(state, 0, u, v, lod); Fixed<TEX_FXD_FRAC> xv,
int b = tex_sw(state, 0, u, v, lodn); Fixed<16> xlod) {
int al = a & 0x00ff00ff; uint32_t color;
int ah = (a >> 8) & 0x00ff00ff; int32_t ilod = std::max<int32_t>(xlod.data(), Fixed<16>::ONE);
uint32_t lod = std::min<uint32_t>(log2floor(ilod) - 16, TEX_LOD_MAX);
int bl = b & 0x00ff00ff; if (state->filter == 2) {
int bh = (b >> 8) & 0x00ff00ff; uint32_t lod_n = std::min<uint32_t>(lod + 1, TEX_LOD_MAX);
int frac = (lod >> 12) & 0xff; uint32_t frac = ilod >> (lod + 16 - 8);
int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; uint32_t texel0 = vx_tex_sw(state, xu, xv, lod);
int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; uint32_t texel1 = vx_tex_sw(state, xu, xv, lod_n);
int c = al | (ah << 8); uint32_t cl, ch;
return c; {
} uint32_t c0l, c0h;
uint32_t c1l, c1h;
#endif Unpack8888(TexFormat::R8G8B8A8, texel0, &c0l, &c0h);
Unpack8888(TexFormat::R8G8B8A8, texel1, &c1l, &c1h);
Lerp8888(c0l, c0h, c1l, c1h, frac, &cl, &ch);
}
color = Pack8888(TexFormat::R8G8B8A8, cl, ch);
} else {
color = vx_tex_sw(state, xu, xv, lod);
}
return color;
}

View File

@@ -191,4 +191,112 @@ int ConvertImage(std::vector<uint8_t>& dst_pixels,
SurfaceDesc dstDesc{dst_format, dst_pixels.data(), width, height, dst_pitch}; SurfaceDesc dstDesc{dst_format, dst_pixels.data(), width, height, dst_pitch};
return CopyBuffers(dstDesc, 0, 0, width, height, srcDesc, 0, 0); return CopyBuffers(dstDesc, 0, 0, width, height, srcDesc, 0, 0);
}
// Builds the complete mipmap chain of an image and packs all levels
// back-to-back (level 0 first) into dst_pixels.
//
//   dst_pixels  - receives the packed chain, encoded in `format`.
//   mip_offsets - resized to one entry per level; entry i is the byte offset
//                 of level i from the start of dst_pixels.
//   src_pixels  - level-0 image, encoded in `format`.
//   format      - pixel format of both src_pixels and dst_pixels.
//   src_width   - level-0 width in texels.
//   src_height  - level-0 height in texels.
//
// Filtering is done on 32-bit A8R8G8B8 texels with a rounded 2x2 box filter;
// other formats are converted to A8R8G8B8 first and converted back at the end.
// Each level halves the previous one, with every dimension clamped at 1.
//
// Returns 0 on success, non-zero on failure.
int GenerateMipmaps(std::vector<uint8_t>& dst_pixels,
                    std::vector<uint32_t>& mip_offsets,
                    const std::vector<uint8_t>& src_pixels,
                    ePixelFormat format,
                    uint32_t src_width,
                    uint32_t src_height) {
  // an empty image has no mip chain
  if (src_width == 0 || src_height == 0)
    return -1;

  std::vector<uint8_t> src_staging, dst_staging;
  const std::vector<uint8_t> *pSrcPixels;
  std::vector<uint8_t> *pDstPixels;

  // convert source image if needed
  bool need_conversion = (format != FORMAT_A8R8G8B8);
  if (need_conversion) {
    // NOTE(review): assumes ConvertImage returns 0 on success, like this
    // function does - confirm against CopyBuffers.
    int ret = ConvertImage(src_staging, src_pixels, src_width, src_height, format, FORMAT_A8R8G8B8);
    if (ret != 0)
      return ret;
    pSrcPixels = &src_staging;
    pDstPixels = &dst_staging;
  } else {
    pSrcPixels = &src_pixels;
    pDstPixels = &dst_pixels;
  }

  // level 0 must hold exactly width x height 32-bit texels; verify this
  // before it is copied so a malformed source cannot overrun the destination
  if (pSrcPixels->size() != static_cast<std::size_t>(4) * src_width * src_height)
    return -1;

  uint32_t src_logwidth  = log2ceil(src_width);
  uint32_t src_logheight = log2ceil(src_height);
  uint32_t max_lod = std::max(src_logwidth, src_logheight) + 1;

  mip_offsets.resize(max_lod);

  // Calculate mipmaps buffer size (in texels) and per-level texel offsets
  uint32_t total_texels = 0;
  for (uint32_t lod = 0, w = src_width, h = src_height; lod < max_lod; ++lod) {
    // at least one dimension must still be non-zero at every level
    assert((w > 0) || (h > 0));
    uint32_t pw = std::max<uint32_t>(w, 1);
    uint32_t ph = std::max<uint32_t>(h, 1);
    mip_offsets.at(lod) = total_texels;
    total_texels += pw * ph;
    w >>= 1;
    h >>= 1;
  }

  // allocate mipmap
  pDstPixels->resize(total_texels * 4);

  // generate mipmaps
  {
    auto pSrc = reinterpret_cast<const uint32_t*>(pSrcPixels->data());
    auto pDst = reinterpret_cast<uint32_t*>(pDstPixels->data());

    // copy level 0
    memcpy(pDst, pSrc, pSrcPixels->size());
    pSrc = pDst;
    pDst += src_width * src_height;

    // each lower level is filtered from the level just written, so track the
    // real dimensions of that previous level for addressing
    uint32_t prev_w = src_width;
    uint32_t prev_h = src_height;
    for (uint32_t lod = 1; lod < max_lod; ++lod) {
      uint32_t pw = std::max<uint32_t>(prev_w >> 1, 1);
      uint32_t ph = std::max<uint32_t>(prev_h >> 1, 1);
      for (uint32_t y = 0; y < ph; ++y) {
        // second source row; collapses onto the first when the previous
        // level is only one texel tall
        uint32_t v0 = 2 * y;
        uint32_t v1 = v0 + ((prev_h > 1) ? 1 : 0);
        auto pSrc0 = pSrc + v0 * prev_w;
        auto pSrc1 = pSrc + v1 * prev_w;
        for (uint32_t x = 0; x < pw; ++x) {
          // second source column; collapses onto the first when the previous
          // level is only one texel wide
          uint32_t u0 = 2 * x;
          uint32_t u1 = u0 + ((prev_w > 1) ? 1 : 0);
          auto c00 = Format::ConvertFrom<FORMAT_A8R8G8B8, false>(pSrc0 + u0);
          auto c01 = Format::ConvertFrom<FORMAT_A8R8G8B8, false>(pSrc0 + u1);
          auto c10 = Format::ConvertFrom<FORMAT_A8R8G8B8, false>(pSrc1 + u0);
          auto c11 = Format::ConvertFrom<FORMAT_A8R8G8B8, false>(pSrc1 + u1);
          // rounded average of the 2x2 footprint
          const ColorARGB color((c00.a + c01.a + c10.a + c11.a+2) >> 2,
                                (c00.r + c01.r + c10.r + c11.r+2) >> 2,
                                (c00.g + c01.g + c10.g + c11.g+2) >> 2,
                                (c00.b + c01.b + c10.b + c11.b+2) >> 2);
          uint32_t ncolor;
          Format::ConvertTo<FORMAT_A8R8G8B8>(&ncolor, color);
          pDst[x + y * pw] = ncolor;
        }
      }
      pSrc = pDst;
      pDst += pw * ph;
      prev_w = pw;
      prev_h = ph;
    }
    assert(static_cast<uint32_t>(pDst - reinterpret_cast<uint32_t*>(pDstPixels->data())) == total_texels);
  }

  // convert destination image if needed: the packed chain is treated as a
  // single row of total_texels texels and written into the caller's buffer
  if (need_conversion) {
    int ret = ConvertImage(dst_pixels, dst_staging, total_texels, 1, FORMAT_A8R8G8B8, format);
    if (ret != 0)
      return ret;
  }

  // offsets were accumulated in texels; callers need bytes in `format`
  uint32_t bpp = Format::GetInfo(format).BytePerPixel;
  for (auto& offset : mip_offsets) {
    offset *= bpp;
  }

  return 0;
}

View File

@@ -1,14 +1,9 @@
#include <cstdint> #include <cstdint>
#include <vector> #include <vector>
#include <iostream> #include <iostream>
#include <bitmanip.h>
#include "surfacedesc.h" #include "surfacedesc.h"
#define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1))))
inline uint32_t ilog2 (uint32_t value) {
return (uint32_t)(sizeof(uint32_t) * 8UL) - (uint32_t)__builtin_clzl((value << 1) - 1UL) - 1;
}
int LoadImage(const char *filename, int LoadImage(const char *filename,
ePixelFormat format, ePixelFormat format,
std::vector<uint8_t> &pixels, std::vector<uint8_t> &pixels,
@@ -37,7 +32,14 @@ int ConvertImage(std::vector<uint8_t>& dst_pixels,
ePixelFormat src_format, ePixelFormat src_format,
ePixelFormat dst_format); ePixelFormat dst_format);
int GenerateMipmaps(std::vector<uint8_t>& dst_pixels,
std::vector<uint32_t>& mip_offsets,
const std::vector<uint8_t>& src_pixels,
ePixelFormat format,
uint32_t src_width,
uint32_t src_height);
void dump_image(const std::vector<uint8_t>& pixels, void dump_image(const std::vector<uint8_t>& pixels,
uint32_t width, uint32_t width,
uint32_t height, uint32_t height,
uint32_t bpp); uint32_t bpp);

View File

@@ -1,4 +1,5 @@
#include <stdio.h> #include <stdio.h>
#include <vx_print.h>
const int Num = 9; const int Num = 9;
const int Ans = 34; const int Ans = 34;
@@ -14,12 +15,12 @@ int main() {
int fib = fibonacci(Num); int fib = fibonacci(Num);
printf("fibonacci(%d) = %d\n", Num, fib); vx_printf("fibonacci(%d) = %d\n", Num, fib);
if (fib == Ans) { if (fib == Ans) {
printf("Passed!\n"); vx_printf("Passed!\n");
} else { } else {
printf("Failed! value=%d, expected=%d\n", fib, Ans); vx_printf("Failed! value=%d, expected=%d\n", fib, Ans);
errors = 1; errors = 1;
} }

View File

@@ -1,8 +1,9 @@
#include <stdio.h> #include <stdio.h>
#include <vx_print.h>
int main() int main()
{ {
printf("Hello World!\n"); vx_printf("Hello World!\n");
return 0; return 0;
} }