Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions

3
hw/.gitignore vendored
View File

@@ -1 +1,2 @@
obj_dir/*
VX_config.h
VX_types.h

View File

@@ -1,12 +1,17 @@
RTL_DIR=./rtl
SCRIPT_DIR=./scripts
all: VX_config.h
all: config
config: VX_config.h VX_types.h
VX_config.h: $(RTL_DIR)/VX_config.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_config.vh -o VX_config.h
clean:
rm -f VX_config.h
VX_types.h: $(RTL_DIR)/VX_types.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_types.vh -o VX_types.h
.PHONY: VX_config.h
clean:
rm -f VX_config.h VX_types.h
.PHONY: VX_config.h VX_types.h

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <math.h>
#include <unordered_map>
@@ -5,167 +18,323 @@
#include <mutex>
#include <iostream>
#include <rvfloats.h>
#include <util.h>
#include "svdpi.h"
#include "verilated_vpi.h"
#include "VX_config.h"
extern "C" {
void dpi_fadd(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fsub(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fmul(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fnmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fnmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_fsub(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_fmul(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_fmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_fmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_fnmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_fnmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_fdiv(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fsqrt(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fdiv(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_fsqrt(bool enable, int dst_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_ftoi(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_ftou(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_itof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_utof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_f2f(bool enable, int dst_fmt, int64_t a, int64_t* result);
void dpi_ftoi(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_ftou(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_itof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_utof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
void dpi_fclss(bool enable, int dst_fmt, int64_t a, int64_t* result);
void dpi_fsgnj(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result);
void dpi_fsgnjn(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result);
void dpi_fsgnjx(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result);
void dpi_fclss(bool enable, int a, int* result);
void dpi_fsgnj(bool enable, int a, int b, int* result);
void dpi_fsgnjn(bool enable, int a, int b, int* result);
void dpi_fsgnjx(bool enable, int a, int b, int* result);
void dpi_flt(bool enable, int a, int b, int* result, svBitVecVal* fflags);
void dpi_fle(bool enable, int a, int b, int* result, svBitVecVal* fflags);
void dpi_feq(bool enable, int a, int b, int* result, svBitVecVal* fflags);
void dpi_fmin(bool enable, int a, int b, int* result, svBitVecVal* fflags);
void dpi_fmax(bool enable, int a, int b, int* result, svBitVecVal* fflags);
void dpi_flt(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags);
void dpi_fle(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags);
void dpi_feq(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags);
void dpi_fmin(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags);
void dpi_fmax(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags);
}
void dpi_fadd(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fadd_s(a, b, (*frm & 0x7), fflags);
// Widens a 32-bit single-precision bit pattern to a 64-bit register image.
// When FPU_RV64F is defined the upper 32 bits are forced to all ones;
// otherwise the value is only zero-extended.
inline uint64_t nan_box(uint32_t value) {
  uint64_t boxed = value;
#ifdef FPU_RV64F
  boxed |= 0xffffffff00000000;
#endif
  return boxed;
}
void dpi_fsub(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fsub_s(a, b, (*frm & 0x7), fflags);
// Reports whether a 64-bit register image carries a boxed 32-bit value.
// With FPU_RV64F defined this is true only when the upper 32 bits are all
// ones; without it every value is accepted.
inline bool is_nan_boxed(uint64_t value) {
#ifdef FPU_RV64F
  const uint32_t upper_half = uint32_t(value >> 32);
  return upper_half == 0xffffffff;
#else
  __unused (value);
  return true;
#endif
}
void dpi_fmul(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fmul_s(a, b, (*frm & 0x7), fflags);
// Returns `a` unchanged when is_nan_boxed() accepts it; otherwise substitutes
// the boxed form of the bit pattern 0x7fc00000 (NaN).
inline int64_t check_boxing(int64_t a) {
  if (is_nan_boxed(a)) {
    return a;
  }
  const uint64_t boxed_nan = nan_box(0x7fc00000); // NaN
  return boxed_nan;
}
void dpi_fmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fmadd_s(a, b, c, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fadd_d(a, b, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fadd_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
}
}
void dpi_fmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
void dpi_fsub(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fmsub_s(a, b, c, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fsub_d(a, b, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fsub_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
}
}
void dpi_fnmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
void dpi_fmul(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fnmadd_s(a, b, c, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fmul_d(a, b, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fmul_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
}
}
void dpi_fnmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
void dpi_fmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fnmsub_s(a, b, c, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fmadd_d(a, b, c, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fmadd_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
}
}
void dpi_fdiv(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
void dpi_fmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fdiv_s(a, b, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fmsub_d(a, b, c, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fmsub_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
}
}
void dpi_fsqrt(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
void dpi_fnmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fsqrt_s(a, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fnmadd_d(a, b, c, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fnmadd_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
}
}
void dpi_ftoi(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
void dpi_fnmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_ftoi_s(a, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fnmsub_d(a, b, c, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fnmsub_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
}
}
void dpi_ftou(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
void dpi_fdiv(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_ftou_s(a, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fdiv_d(a, b, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fdiv_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
}
}
void dpi_itof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
void dpi_fsqrt(bool enable, int dst_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_itof_s(a, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fsqrt_d(a, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fsqrt_s(check_boxing(a), (*frm & 0x7), fflags));
}
}
void dpi_utof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
void dpi_ftoi(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_utof_s(a, (*frm & 0x7), fflags);
if (dst_fmt) {
if (src_fmt) {
*result = rv_ftol_d(a, (*frm & 0x7), fflags);
} else {
*result = rv_ftol_s(check_boxing(a), (*frm & 0x7), fflags);
}
} else {
if (src_fmt) {
*result = sext<uint64_t>(rv_ftoi_d(a, (*frm & 0x7), fflags), 32);
} else {
*result = sext<uint64_t>(rv_ftoi_s(check_boxing(a), (*frm & 0x7), fflags), 32);
}
}
}
void dpi_flt(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
void dpi_ftou(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_flt_s(a, b, fflags);
if (dst_fmt) {
if (src_fmt) {
*result = rv_ftolu_d(a, (*frm & 0x7), fflags);
} else {
*result = rv_ftolu_s(check_boxing(a), (*frm & 0x7), fflags);
}
} else {
if (src_fmt) {
*result = sext<uint64_t>(rv_ftou_d(a, (*frm & 0x7), fflags), 32);
} else {
*result = sext<uint64_t>(rv_ftou_s(check_boxing(a), (*frm & 0x7), fflags), 32);
}
}
}
void dpi_fle(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
void dpi_itof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fle_s(a, b, fflags);
if (dst_fmt) {
if (src_fmt) {
*result = rv_ltof_d(a, (*frm & 0x7), fflags);
} else {
*result = rv_itof_d(a, (*frm & 0x7), fflags);
}
} else {
if (src_fmt) {
*result = nan_box(rv_ltof_s(a, (*frm & 0x7), fflags));
} else {
*result = nan_box(rv_itof_s(a, (*frm & 0x7), fflags));
}
}
}
void dpi_feq(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
void dpi_utof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_feq_s(a, b, fflags);
if (dst_fmt) {
if (src_fmt) {
*result = rv_lutof_d(a, (*frm & 0x7), fflags);
} else {
*result = rv_utof_d(a, (*frm & 0x7), fflags);
}
} else {
if (src_fmt) {
*result = nan_box(rv_lutof_s(a, (*frm & 0x7), fflags));
} else {
*result = nan_box(rv_utof_s(a, (*frm & 0x7), fflags));
}
}
}
void dpi_fmin(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
void dpi_f2f(bool enable, int dst_fmt, int64_t a, int64_t* result) {
if (!enable)
return;
*result = rv_fmin_s(a, b, fflags);
if (dst_fmt) {
*result = rv_ftod((int32_t)check_boxing(a));
} else {
*result = nan_box(rv_dtof(a));
}
}
void dpi_fmax(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
void dpi_fclss(bool enable, int dst_fmt, int64_t a, int64_t* result) {
if (!enable)
return;
*result = rv_fmax_s(a, b, fflags);
if (dst_fmt) {
*result = rv_fclss_d(a);
} else {
*result = rv_fclss_s(check_boxing(a));
}
}
void dpi_fclss(bool enable, int a, int* result) {
void dpi_fsgnj(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
if (!enable)
return;
*result = rv_fclss_s(a);
if (dst_fmt) {
*result = rv_fsgnj_d(a, b);
} else {
*result = nan_box(rv_fsgnj_s(check_boxing(a), check_boxing(b)));
}
}
void dpi_fsgnj(bool enable, int a, int b, int* result) {
void dpi_fsgnjn(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
if (!enable)
return;
*result = rv_fsgnj_s(a, b);
if (dst_fmt) {
*result = rv_fsgnjn_d(a, b);
} else {
*result = nan_box(rv_fsgnjn_s(check_boxing(a), check_boxing(b)));
}
}
void dpi_fsgnjn(bool enable, int a, int b, int* result) {
void dpi_fsgnjx(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
if (!enable)
return;
*result = rv_fsgnjn_s(a, b);
if (dst_fmt) {
*result = rv_fsgnjx_d(a, b);
} else {
*result = nan_box(rv_fsgnjx_s(check_boxing(a), check_boxing(b)));
}
}
void dpi_fsgnjx(bool enable, int a, int b, int* result) {
void dpi_flt(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
if (!enable)
return;
*result = rv_fsgnjx_s(a, b);
if (dst_fmt) {
*result = rv_flt_d(a, b, fflags);
} else {
*result = rv_flt_s(check_boxing(a), check_boxing(b), fflags);
}
}
// DPI entry point for the floating-point "less than or equal" comparison.
//   enable  : when false the call is a no-op and *result / fflags are untouched.
//   dst_fmt : non-zero selects the double-precision routine (rv_fle_d);
//             zero selects the single-precision routine (rv_fle_s).
//   a, b    : operand bit patterns; in single-precision mode each one is
//             filtered through check_boxing() first.
//   result  : receives the value returned by the rv_fle_* routine.
//   fflags  : forwarded to the rv_fle_* routine.
void dpi_fle(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
  if (!enable) {
    return;
  }
  if (!dst_fmt) {
    // single-precision path: operands must pass the boxing check
    *result = rv_fle_s(check_boxing(a), check_boxing(b), fflags);
    return;
  }
  *result = rv_fle_d(a, b, fflags);
}
// DPI entry point for the floating-point equality comparison.
//   enable  : when false the call is a no-op and *result / fflags are untouched.
//   dst_fmt : non-zero selects the double-precision routine (rv_feq_d);
//             zero selects the single-precision routine (rv_feq_s).
//   a, b    : operand bit patterns; in single-precision mode each one is
//             filtered through check_boxing() first.
//   result  : receives the value returned by the rv_feq_* routine.
//   fflags  : forwarded to the rv_feq_* routine.
void dpi_feq(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
  if (!enable) {
    return;
  }
  if (!dst_fmt) {
    // single-precision path: operands must pass the boxing check
    *result = rv_feq_s(check_boxing(a), check_boxing(b), fflags);
    return;
  }
  *result = rv_feq_d(a, b, fflags);
}
// DPI entry point for the floating-point minimum.
//   enable  : when false the call is a no-op and *result / fflags are untouched.
//   dst_fmt : non-zero selects the double-precision routine (rv_fmin_d);
//             zero selects the single-precision routine (rv_fmin_s).
//   a, b    : operand bit patterns; in single-precision mode each one is
//             filtered through check_boxing() first.
//   result  : receives the rv_fmin_* result; the single-precision result is
//             passed through nan_box() before being stored.
//   fflags  : forwarded to the rv_fmin_* routine.
void dpi_fmin(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
  if (!enable) {
    return;
  }
  if (!dst_fmt) {
    // single-precision path: check operand boxing, then box the result
    *result = nan_box(rv_fmin_s(check_boxing(a), check_boxing(b), fflags));
    return;
  }
  *result = rv_fmin_d(a, b, fflags);
}
// DPI entry point for the floating-point maximum.
//   enable  : when false the call is a no-op and *result / fflags are untouched.
//   dst_fmt : non-zero selects the double-precision routine (rv_fmax_d);
//             zero selects the single-precision routine (rv_fmax_s).
//   a, b    : operand bit patterns; in single-precision mode each one is
//             filtered through check_boxing() first.
//   result  : receives the rv_fmax_* result; the single-precision result is
//             passed through nan_box() before being stored.
//   fflags  : forwarded to the rv_fmax_* routine.
void dpi_fmax(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
  if (!enable) {
    return;
  }
  if (!dst_fmt) {
    // single-precision path: check operand boxing, then box the result
    *result = nan_box(rv_fmax_s(check_boxing(a), check_boxing(b), fflags));
    return;
  }
  *result = rv_fmax_d(a, b, fflags);
}

View File

@@ -1,31 +1,47 @@
`ifndef FLOAT_DPI
`define FLOAT_DPI
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import "DPI-C" function void dpi_fadd(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fsub(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmul(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmadd(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmsub(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fnmadd(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fnmsub(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
`ifndef FLOAT_DPI_VH
`define FLOAT_DPI_VH
import "DPI-C" function void dpi_fdiv(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fsqrt(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
`include "VX_config.vh"
import "DPI-C" function void dpi_ftoi(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_ftou(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_itof(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_utof(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fadd(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fsub(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmul(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmadd(input logic enable, input int dst_fmt, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmsub(input logic enable, input int dst_fmt, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fnmadd(input logic enable, input int dst_fmt, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fnmsub(input logic enable, input int dst_fmt, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fclss(input logic enable, input int a, output int result);
import "DPI-C" function void dpi_fsgnj(input logic enable, input int a, input int b, output int result);
import "DPI-C" function void dpi_fsgnjn(input logic enable, input int a, input int b, output int result);
import "DPI-C" function void dpi_fsgnjx(input logic enable, input int a, input int b, output int result);
import "DPI-C" function void dpi_fdiv(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fsqrt(input logic enable, input int dst_fmt, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_flt(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fle(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_feq(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmin(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmax(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" function void dpi_ftoi(input logic enable, input int dst_fmt, input int src_fmt, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_ftou(input logic enable, input int dst_fmt, input int src_fmt, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_itof(input logic enable, input int dst_fmt, input int src_fmt, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_utof(input logic enable, input int dst_fmt, input int src_fmt, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_f2f(input logic enable, input int dst_fmt, input longint a, output longint result);
`endif
import "DPI-C" function void dpi_fclss(input logic enable, input int dst_fmt, input longint a, output longint result);
import "DPI-C" function void dpi_fsgnj(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result);
import "DPI-C" function void dpi_fsgnjn(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result);
import "DPI-C" function void dpi_fsgnjx(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result);
import "DPI-C" function void dpi_flt(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fle(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_feq(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmin(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmax(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result, output bit[4:0] fflags);
`endif

View File

@@ -1,23 +1,57 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <math.h>
#include <unordered_map>
#include <vector>
#include <mutex>
#include <iostream>
#include "svdpi.h"
#include "verilated_vpi.h"
#include "VX_config.h"
#include "uuid_gen.h"
#ifdef XLEN_64
#define iword_t int64_t
#define uword_t uint64_t
#define idword_t __int128_t
#define udword_t __uint128_t
#else
#define iword_t int32_t
#define uword_t uint32_t
#define idword_t int64_t
#define udword_t uint64_t
#endif
#ifndef DEBUG_LEVEL
#define DEBUG_LEVEL 3
#endif
extern "C" {
void dpi_imul(bool enable, int a, int b, bool is_signed_a, bool is_signed_b, int* resultl, int* resulth);
void dpi_idiv(bool enable, int a, int b, bool is_signed, int* quotient, int* remainder);
void dpi_imul(bool enable, bool is_signed_a, bool is_signed_b, iword_t a, iword_t b, iword_t* resultl, iword_t* resulth);
void dpi_idiv(bool enable, bool is_signed, iword_t a, iword_t b, iword_t* quotient, iword_t* remainder);
int dpi_register();
void dpi_assert(int inst, bool cond, int delay);
void dpi_trace(const char* format, ...);
void dpi_trace(int level, const char* format, ...);
void dpi_trace_start();
void dpi_trace_stop();
uint64_t dpi_uuid_gen(bool reset, int wid, uint64_t PC);
}
bool sim_trace_enabled();
@@ -93,49 +127,54 @@ void dpi_assert(int inst, bool cond, int delay) {
}
}
void dpi_imul(bool enable, int a, int b, bool is_signed_a, bool is_signed_b, int* resultl, int* resulth) {
///////////////////////////////////////////////////////////////////////////////
void dpi_imul(bool enable, bool is_signed_a, bool is_signed_b, iword_t a, iword_t b, iword_t* resultl, iword_t* resulth) {
if (!enable)
return;
udword_t first = *(uword_t*)&a;
udword_t second = *(uword_t*)&b;
udword_t mask = udword_t(-1) << (8 * sizeof(iword_t));
uint64_t first = *(uint32_t*)&a;
uint64_t second = *(uint32_t*)&b;
if (is_signed_a && (first & 0x80000000)) {
first |= 0xFFFFFFFF00000000;
if (is_signed_a && a < 0) {
first |= mask;
}
if (is_signed_b && (second & 0x80000000)) {
second |= 0xFFFFFFFF00000000;
if (is_signed_b && b < 0) {
second |= mask;
}
uint64_t result;
udword_t result;
if (is_signed_a || is_signed_b) {
result = (int64_t)first * (int64_t)second;
result = idword_t(first) * idword_t(second);
} else {
result = first * second;
}
*resultl = result & 0xFFFFFFFF;
*resulth = (result >> 32) & 0xFFFFFFFF;
}
*resultl = iword_t(result);
*resulth = iword_t(result >> (8 * sizeof(iword_t)));
}
void dpi_idiv(bool enable, int a, int b, bool is_signed, int* quotient, int* remainder) {
void dpi_idiv(bool enable, bool is_signed, iword_t a, iword_t b, iword_t* quotient, iword_t* remainder) {
if (!enable)
return;
uint32_t dividen = *(uint32_t*)&a;
uint32_t divisor = *(uint32_t*)&b;
uword_t dividen = a;
uword_t divisor = b;
auto inf_neg = uword_t(1) << (XLEN-1);
if (is_signed) {
if (b == 0) {
*quotient = -1;
*remainder = dividen;
} else if (dividen == 0x80000000 && divisor == 0xffffffff) {
} else if (dividen == inf_neg && divisor == -1) {
*remainder = 0;
*quotient = dividen;
} else {
*quotient = (int32_t)dividen / (int32_t)divisor;
*remainder = (int32_t)dividen % (int32_t)divisor;
*quotient = (iword_t)dividen / (iword_t)divisor;
*remainder = (iword_t)dividen % (iword_t)divisor;
}
} else {
if (b == 0) {
@@ -148,7 +187,11 @@ void dpi_idiv(bool enable, int a, int b, bool is_signed, int* quotient, int* rem
}
}
void dpi_trace(const char* format, ...) {
///////////////////////////////////////////////////////////////////////////////
void dpi_trace(int level, const char* format, ...) {
if (level > DEBUG_LEVEL)
return;
if (!sim_trace_enabled())
return;
va_list va;
@@ -163,4 +206,28 @@ void dpi_trace_start() {
void dpi_trace_stop() {
sim_trace_enable(false);
}
///////////////////////////////////////////////////////////////////////////////
std::unordered_map<uint32_t, std::shared_ptr<vortex::UUIDGenerator>> g_uuid_gens;
uint64_t dpi_uuid_gen(bool reset, int wid, uint64_t PC) {
if (reset) {
g_uuid_gens.clear();
return 0;
}
std::shared_ptr<vortex::UUIDGenerator> uuid_gen;
auto it = g_uuid_gens.find(wid);
if (it == g_uuid_gens.end()) {
uuid_gen = std::make_shared<vortex::UUIDGenerator>();
g_uuid_gens.emplace(wid, uuid_gen);
} else {
uuid_gen = it->second;
}
uint32_t instr_uuid = uuid_gen->get_uuid(PC);
uint32_t instr_id = instr_uuid & 0xffff;
uint32_t instr_ref = instr_uuid >> 16;
uint64_t uuid = (uint64_t(instr_ref) << 32) | (wid << 16) | instr_id;
return uuid;
}

View File

@@ -1,14 +1,37 @@
`ifndef UTIL_DPI
`define UTIL_DPI
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import "DPI-C" function void dpi_imul(input logic enable, input int a, input int b, input logic is_signed_a, input logic is_signed_b, output int resultl, output int resulth);
import "DPI-C" function void dpi_idiv(input logic enable, input int a, input int b, input logic is_signed, output int quotient, output int remainder);
`ifndef UTIL_DPI_VH
`define UTIL_DPI_VH
`include "VX_config.vh"
`ifdef XLEN_64
`define INT_TYPE longint
`else
`define INT_TYPE int
`endif
import "DPI-C" function void dpi_imul(input logic enable, input logic is_signed_a, input logic is_signed_b, input `INT_TYPE a, input `INT_TYPE b, output `INT_TYPE resultl, output `INT_TYPE resulth);
import "DPI-C" function void dpi_idiv(input logic enable, input logic is_signed, input `INT_TYPE a, input `INT_TYPE b, output `INT_TYPE quotient, output `INT_TYPE remainder);
import "DPI-C" function int dpi_register();
import "DPI-C" function void dpi_assert(int inst, input logic cond, input int delay);
import "DPI-C" function void dpi_trace(input string format /*verilator sformat*/);
import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/);
import "DPI-C" function void dpi_trace_start();
import "DPI-C" function void dpi_trace_stop();
`endif
import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid, input longint PC);
`endif

1
hw/rtl/.gitignore vendored
View File

@@ -1 +0,0 @@
/VX_user_config.vh

View File

@@ -1,235 +0,0 @@
`include "VX_define.vh"
// VX_alu_unit: integer ALU execution unit.
//
// For every thread lane it computes the add, subtract/compare, right-shift and
// logic/left-shift results of an incoming ALU request, selects one of them by
// operation class, resolves branch direction and target, and forwards the
// result through a pipeline register to the commit interface. When
// EXT_M_ENABLE is defined, a VX_muldiv instance shares the same request and
// commit interfaces, with the ALU result taking priority on the commit port.
//
// Parameters:
//   CORE_ID - only used in the debug trace message.
// Ports:
//   alu_req_if    - incoming request (operands, op type/modifier, PC, rd, ...).
//   branch_ctl_if - branch outcome (valid, taken, wid, dest).
//   alu_commit_if - result handed to the commit stage.
module VX_alu_unit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// Inputs
VX_alu_req_if.slave alu_req_if,
// Outputs
VX_branch_ctl_if.master branch_ctl_if,
VX_commit_if.master alu_commit_if
);
`UNUSED_PARAM (CORE_ID)
// Per-thread intermediate results; sub_result keeps one extra bit (bit 32)
// that is used below as the "less than" flag.
reg [`NUM_THREADS-1:0][31:0] alu_result;
wire [`NUM_THREADS-1:0][31:0] add_result;
wire [`NUM_THREADS-1:0][32:0] sub_result;
wire [`NUM_THREADS-1:0][31:0] shr_result;
reg [`NUM_THREADS-1:0][31:0] msc_result;
wire ready_in;
`UNUSED_VAR (alu_req_if.op_mod)
// op_type is reinterpreted as an ALU opcode and as a branch opcode; op_mod
// tells which interpretation applies.
wire is_br_op = `INST_ALU_IS_BR(alu_req_if.op_mod);
wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_BITS'(alu_req_if.op_type);
wire [`INST_BR_BITS-1:0] br_op = `INST_BR_BITS'(alu_req_if.op_type);
wire alu_signed = `INST_ALU_SIGNED(alu_op);
wire [1:0] alu_op_class = `INST_ALU_OP_CLASS(alu_op);
wire is_sub = (alu_op == `INST_ALU_SUB);
wire [`NUM_THREADS-1:0][31:0] alu_in1 = alu_req_if.rs1_data;
wire [`NUM_THREADS-1:0][31:0] alu_in2 = alu_req_if.rs2_data;
// Operand selection: the PC and the immediate are broadcast to all threads.
// alu_in2_less feeds the subtract/compare path and ignores the immediate for
// branch ops, so branches compare rs1 against rs2 while the adder below
// still uses the immediate.
wire [`NUM_THREADS-1:0][31:0] alu_in1_PC = alu_req_if.use_PC ? {`NUM_THREADS{alu_req_if.PC}} : alu_in1;
wire [`NUM_THREADS-1:0][31:0] alu_in2_imm = alu_req_if.use_imm ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
wire [`NUM_THREADS-1:0][31:0] alu_in2_less = (alu_req_if.use_imm && ~is_br_op) ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
// Adder: also produces the branch destination (see br_dest).
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
end
// Subtractor/comparator: operands are widened to 33 bits, sign-extended when
// alu_signed is set and zero-extended otherwise, so bit 32 of the difference
// indicates in1 < in2 for both signed and unsigned compares.
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [32:0] sub_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]};
wire [32:0] sub_in2 = {alu_signed & alu_in2_less[i][31], alu_in2_less[i]};
assign sub_result[i] = sub_in1 - sub_in2;
end
// Right shifter: a single arithmetic shift on the 33-bit extended operand
// serves both SRL (extension bit is 0) and SRA (extension bit is the sign).
// Only the low 5 bits of the second operand are used as the shift amount.
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [32:0] shr_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]};
assign shr_result[i] = 32'($signed(shr_in1) >>> alu_in2_imm[i][4:0]);
end
// Bitwise logic and left shift; the default arm implements SLL.
for (genvar i = 0; i < `NUM_THREADS; i++) begin
always @(*) begin
case (alu_op)
`INST_ALU_AND: msc_result[i] = alu_in1[i] & alu_in2_imm[i];
`INST_ALU_OR: msc_result[i] = alu_in1[i] | alu_in2_imm[i];
`INST_ALU_XOR: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i];
//`INST_ALU_SLL,
default: msc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0];
endcase
end
end
// Final result mux, selected by the 2-bit operation class.
for (genvar i = 0; i < `NUM_THREADS; i++) begin
always @(*) begin
case (alu_op_class)
2'b00: alu_result[i] = add_result[i]; // ADD, LUI, AUIPC
2'b01: alu_result[i] = {31'b0, sub_result[i][32]}; // SLTU, SLT
2'b10: alu_result[i] = is_sub ? sub_result[i][31:0] // SUB
: shr_result[i]; // SRL, SRA
// 2'b11,
default: alu_result[i] = msc_result[i]; // AND, OR, XOR, SLL
endcase
end
end
// branch
// JAL/JALR write next_PC to rd instead of the ALU result. The branch target
// and the comparison flags are taken from the lane selected by
// alu_req_if.tid.
wire is_jal = is_br_op && (br_op == `INST_BR_JAL || br_op == `INST_BR_JALR);
wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result;
wire [31:0] br_dest = add_result[alu_req_if.tid];
wire [32:0] cmp_result = sub_result[alu_req_if.tid];
wire is_less = cmp_result[32];
wire is_equal = ~(| cmp_result[31:0]);
// output
wire alu_valid_in;
wire alu_ready_in;
wire alu_valid_out;
wire alu_ready_out;
wire [`UUID_BITS-1:0] alu_uuid;
wire [`NW_BITS-1:0] alu_wid;
wire [`NUM_THREADS-1:0] alu_tmask;
wire [31:0] alu_PC;
wire [`NR_BITS-1:0] alu_rd;
wire alu_wb;
wire [`NUM_THREADS-1:0][31:0] alu_data;
wire [`INST_BR_BITS-1:0] br_op_r;
wire [31:0] br_dest_r;
wire is_less_r;
wire is_equal_r;
wire is_br_op_r;
// The output stage accepts new data when its current content is being
// consumed or when it holds nothing valid.
assign alu_ready_in = alu_ready_out || ~alu_valid_out;
// Output stage. DATAW lists the field widths in the same order as the
// data_in/data_out concatenations below; keep the three in sync.
VX_pipe_register #(
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (alu_ready_in),
.data_in ({alu_valid_in, alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}),
.data_out ({alu_valid_out, alu_uuid, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r})
);
`UNUSED_VAR (br_op_r)
// Branch resolution from the registered flags: pick the "less" or "equal"
// flag, optionally invert it, and force taken for static branches.
wire br_neg = `INST_BR_NEG(br_op_r);
wire br_less = `INST_BR_LESS(br_op_r);
wire br_static = `INST_BR_STATIC(br_op_r);
// The branch is reported only in the cycle the registered result is valid
// and accepted downstream (alu_ready_out).
assign branch_ctl_if.valid = alu_valid_out && alu_ready_out && is_br_op_r;
assign branch_ctl_if.taken = ((br_less ? is_less_r : is_equal_r) ^ br_neg) | br_static;
assign branch_ctl_if.wid = alu_wid;
assign branch_ctl_if.dest = br_dest_r;
`ifdef EXT_M_ENABLE
// M extension: multiply/divide requests are steered to VX_muldiv, which
// shares the request and commit interfaces with the ALU datapath above.
wire mul_valid_in;
wire mul_ready_in;
wire mul_valid_out;
wire mul_ready_out;
wire [`UUID_BITS-1:0] mul_uuid;
wire [`NW_BITS-1:0] mul_wid;
wire [`NUM_THREADS-1:0] mul_tmask;
wire [31:0] mul_PC;
wire [`NR_BITS-1:0] mul_rd;
wire mul_wb;
wire [`NUM_THREADS-1:0][31:0] mul_data;
wire [`INST_MUL_BITS-1:0] mul_op = `INST_MUL_BITS'(alu_req_if.op_type);
VX_muldiv muldiv (
.clk (clk),
.reset (reset),
// Inputs
.alu_op (mul_op),
.uuid_in (alu_req_if.uuid),
.wid_in (alu_req_if.wid),
.tmask_in (alu_req_if.tmask),
.PC_in (alu_req_if.PC),
.rd_in (alu_req_if.rd),
.wb_in (alu_req_if.wb),
.alu_in1 (alu_req_if.rs1_data),
.alu_in2 (alu_req_if.rs2_data),
// Outputs
.wid_out (mul_wid),
.uuid_out (mul_uuid),
.tmask_out (mul_tmask),
.PC_out (mul_PC),
.rd_out (mul_rd),
.wb_out (mul_wb),
.data_out (mul_data),
// handshake
.valid_in (mul_valid_in),
.ready_in (mul_ready_in),
.valid_out (mul_valid_out),
.ready_out (mul_ready_out)
);
// Route the request to exactly one of the two datapaths based on op_mod.
wire is_mul_op = `INST_ALU_IS_MUL(alu_req_if.op_mod);
assign ready_in = is_mul_op ? mul_ready_in : alu_ready_in;
assign alu_valid_in = alu_req_if.valid && ~is_mul_op;
assign mul_valid_in = alu_req_if.valid && is_mul_op;
// Commit mux: when both outputs are valid the ALU result is presented and
// the MULDIV output is held back (mul_ready_out is gated by ~alu_valid_out).
assign alu_commit_if.valid = alu_valid_out || mul_valid_out;
assign alu_commit_if.uuid = alu_valid_out ? alu_uuid : mul_uuid;
assign alu_commit_if.wid = alu_valid_out ? alu_wid : mul_wid;
assign alu_commit_if.tmask = alu_valid_out ? alu_tmask : mul_tmask;
assign alu_commit_if.PC = alu_valid_out ? alu_PC : mul_PC;
assign alu_commit_if.rd = alu_valid_out ? alu_rd : mul_rd;
assign alu_commit_if.wb = alu_valid_out ? alu_wb : mul_wb;
assign alu_commit_if.data = alu_valid_out ? alu_data : mul_data;
assign alu_ready_out = alu_commit_if.ready;
assign mul_ready_out = alu_commit_if.ready & ~alu_valid_out; // ALU takes priority
`else
// Without the M extension the ALU output drives the commit port directly.
assign ready_in = alu_ready_in;
assign alu_valid_in = alu_req_if.valid;
assign alu_commit_if.valid = alu_valid_out;
assign alu_commit_if.uuid = alu_uuid;
assign alu_commit_if.wid = alu_wid;
assign alu_commit_if.tmask = alu_tmask;
assign alu_commit_if.PC = alu_PC;
assign alu_commit_if.rd = alu_rd;
assign alu_commit_if.wb = alu_wb;
assign alu_commit_if.data = alu_data;
assign alu_ready_out = alu_commit_if.ready;
`endif
// eop is tied high for every commit from this unit.
assign alu_commit_if.eop = 1'b1;
// can accept new request?
assign alu_req_if.ready = ready_in;
`ifdef DBG_TRACE_CORE_PIPELINE
// Debug-only: log every resolved branch.
always @(posedge clk) begin
if (branch_ctl_if.valid) begin
dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h (#%0d)\n",
$time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest, alu_uuid);
end
end
`endif
endmodule

View File

@@ -1,159 +0,0 @@
`include "VX_define.vh"
// VX_cache_arb: merges NUM_REQS multi-lane cache request ports into a single
// request port and routes the shared response port back to the originating
// requester.
//
// When NUM_REQS > 1 the requester index is carried inside the outgoing tag
// (LOG_NUM_REQS bits at bit position TAG_SEL_IDX) so that the response can be
// demultiplexed by looking at the same tag bits. When NUM_REQS == 1 the module
// is a pure pass-through.
//
// Parameters:
//   NUM_REQS     - number of request ports to arbitrate.
//   LANES        - lanes per request port.
//   DATA_SIZE    - data word size in bytes (DATA_WIDTH = 8 * DATA_SIZE).
//   TAG_IN_WIDTH - tag width on the requester side.
//   TAG_SEL_IDX  - bit position of the requester index inside the output tag.
//   BUFFERED_REQ / BUFFERED_RSP - forwarded as BUFFERED to the request arbiter
//                  and the response demux respectively.
//   TYPE         - forwarded to VX_stream_arbiter.
module VX_cache_arb #(
parameter NUM_REQS = 1,
parameter LANES = 1,
parameter DATA_SIZE = 1,
parameter TAG_IN_WIDTH = 1,
parameter TAG_SEL_IDX = 0,
parameter BUFFERED_REQ = 0,
parameter BUFFERED_RSP = 0,
parameter TYPE = "R",
// Word address width: 32 bits minus the byte-offset bits of one data word.
localparam ADDR_WIDTH = (32-`CLOG2(DATA_SIZE)),
localparam DATA_WIDTH = (8 * DATA_SIZE),
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS),
// The output tag is widened by the requester index.
localparam TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS
) (
input wire clk,
input wire reset,
// input requests
input wire [NUM_REQS-1:0][LANES-1:0] req_valid_in,
input wire [NUM_REQS-1:0][LANES-1:0] req_rw_in,
input wire [NUM_REQS-1:0][LANES-1:0][DATA_SIZE-1:0] req_byteen_in,
input wire [NUM_REQS-1:0][LANES-1:0][ADDR_WIDTH-1:0] req_addr_in,
input wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] req_data_in,
input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in,
output wire [NUM_REQS-1:0][LANES-1:0] req_ready_in,
// output request
output wire [LANES-1:0] req_valid_out,
output wire [LANES-1:0] req_rw_out,
output wire [LANES-1:0][DATA_SIZE-1:0] req_byteen_out,
output wire [LANES-1:0][ADDR_WIDTH-1:0] req_addr_out,
output wire [LANES-1:0][DATA_WIDTH-1:0] req_data_out,
output wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out,
input wire [LANES-1:0] req_ready_out,
// input response
input wire rsp_valid_in,
input wire [LANES-1:0] rsp_tmask_in,
input wire [LANES-1:0][DATA_WIDTH-1:0] rsp_data_in,
input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in,
output wire rsp_ready_in,
// output responses
output wire [NUM_REQS-1:0] rsp_valid_out,
output wire [NUM_REQS-1:0][LANES-1:0] rsp_tmask_out,
output wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] rsp_data_out,
output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out,
input wire [NUM_REQS-1:0] rsp_ready_out
);
// Per-lane request payload: {tag, addr, rw, byteen, data}.
localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
// Whole-response payload: {tmask[LANES], tag, data[LANES]}.
localparam RSP_DATAW = LANES * (1 + DATA_WIDTH) + TAG_IN_WIDTH;
if (NUM_REQS > 1) begin
wire [NUM_REQS-1:0][LANES-1:0][REQ_DATAW-1:0] req_data_in_merged;
wire [LANES-1:0][REQ_DATAW-1:0] req_data_out_merged;
// Pack each requester/lane into one payload word, with the requester index
// i passed to VX_bits_insert as the value to place into the tag at
// TAG_SEL_IDX.
for (genvar i = 0; i < NUM_REQS; i++) begin
for (genvar j = 0; j < LANES; ++j) begin
wire [TAG_OUT_WIDTH-1:0] req_tag_in_w;
VX_bits_insert #(
.N (TAG_IN_WIDTH),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_insert (
.data_in (req_tag_in[i][j]),
.sel_in (LOG_NUM_REQS'(i)),
.data_out (req_tag_in_w)
);
assign req_data_in_merged[i][j] = {req_tag_in_w, req_addr_in[i][j], req_rw_in[i][j], req_byteen_in[i][j], req_data_in[i][j]};
end
end
// Request path: arbitrate the packed payloads onto the single output port.
VX_stream_arbiter #(
.NUM_REQS (NUM_REQS),
.LANES (LANES),
.DATAW (REQ_DATAW),
.BUFFERED (BUFFERED_REQ),
.TYPE (TYPE)
) req_arb (
.clk (clk),
.reset (reset),
.valid_in (req_valid_in),
.data_in (req_data_in_merged),
.ready_in (req_ready_in),
.valid_out (req_valid_out),
.data_out (req_data_out_merged),
.ready_out (req_ready_out)
);
// Unpack the winning payload; field order must match the packing above.
for (genvar i = 0; i < LANES; ++i) begin
assign {req_tag_out[i], req_addr_out[i], req_rw_out[i], req_byteen_out[i], req_data_out[i]} = req_data_out_merged[i];
end
///////////////////////////////////////////////////////////////////////
// Response path: the requester index is read from the tag bits at
// TAG_SEL_IDX, and VX_bits_remove produces the TAG_IN_WIDTH-wide tag that is
// returned to the requester.
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out_merged;
wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[TAG_SEL_IDX +: LOG_NUM_REQS];
wire [TAG_IN_WIDTH-1:0] rsp_tag_in_w;
VX_bits_remove #(
.N (TAG_OUT_WIDTH),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_remove (
.data_in (rsp_tag_in),
.data_out (rsp_tag_in_w)
);
// The response is demultiplexed as a single item (LANES = 1) that carries
// all lanes' data plus the thread mask.
VX_stream_demux #(
.NUM_REQS (NUM_REQS),
.LANES (1),
.DATAW (RSP_DATAW),
.BUFFERED (BUFFERED_RSP)
) rsp_demux (
.clk (clk),
.reset (reset),
.sel_in (rsp_sel),
.valid_in (rsp_valid_in),
.data_in ({rsp_tmask_in, rsp_tag_in_w, rsp_data_in}),
.ready_in (rsp_ready_in),
.valid_out (rsp_valid_out),
.data_out (rsp_data_out_merged),
.ready_out (rsp_ready_out)
);
// Unpack per-requester responses; field order must match data_in above.
for (genvar i = 0; i < NUM_REQS; i++) begin
assign {rsp_tmask_out[i], rsp_tag_out[i], rsp_data_out[i]} = rsp_data_out_merged[i];
end
end else begin
// Single requester: no arbitration and no tag rewriting (TAG_OUT_WIDTH
// equals TAG_IN_WIDTH here), so every signal is wired straight through.
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
assign req_valid_out = req_valid_in;
assign req_tag_out = req_tag_in;
assign req_addr_out = req_addr_in;
assign req_rw_out = req_rw_in;
assign req_byteen_out = req_byteen_in;
assign req_data_out = req_data_in;
assign req_ready_in = req_ready_out;
assign rsp_valid_out = rsp_valid_in;
assign rsp_tmask_out = rsp_tmask_in;
assign rsp_tag_out = rsp_tag_in;
assign rsp_data_out = rsp_data_in;
assign rsp_ready_in = rsp_ready_out;
end
endmodule

View File

@@ -1,195 +1,155 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_cluster #(
module VX_cluster import VX_gpu_pkg::*; #(
parameter CLUSTER_ID = 0
) (
`SCOPE_IO_VX_cluster
`SCOPE_IO_DECL
// Clock
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [`L2_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`L2_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`L2_MEM_DATA_WIDTH-1:0] mem_req_data,
output wire [`L2_MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
`ifdef PERF_ENABLE
VX_mem_perf_if.master mem_perf_if,
VX_mem_perf_if.slave perf_memsys_total_if,
`endif
// Memory response
input wire mem_rsp_valid,
input wire [`L2_MEM_DATA_WIDTH-1:0] mem_rsp_data,
input wire [`L2_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
VX_dcr_bus_if.slave dcr_bus_if,
// Memory
VX_mem_bus_if.master mem_bus_if,
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status
output wire busy
);
`STATIC_ASSERT((`L2_ENABLE == 0 || `NUM_CORES > 1), ("invalid parameter"))
output wire busy
);
wire [`NUM_CORES-1:0] per_core_mem_req_valid;
wire [`NUM_CORES-1:0] per_core_mem_req_rw;
wire [`NUM_CORES-1:0][`DCACHE_MEM_BYTEEN_WIDTH-1:0] per_core_mem_req_byteen;
wire [`NUM_CORES-1:0][`DCACHE_MEM_ADDR_WIDTH-1:0] per_core_mem_req_addr;
wire [`NUM_CORES-1:0][`DCACHE_MEM_DATA_WIDTH-1:0] per_core_mem_req_data;
wire [`NUM_CORES-1:0][`L1_MEM_TAG_WIDTH-1:0] per_core_mem_req_tag;
wire [`NUM_CORES-1:0] per_core_mem_req_ready;
`ifdef SCOPE
localparam scope_socket = 0;
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
`endif
wire [`NUM_CORES-1:0] per_core_mem_rsp_valid;
wire [`NUM_CORES-1:0][`DCACHE_MEM_DATA_WIDTH-1:0] per_core_mem_rsp_data;
wire [`NUM_CORES-1:0][`L1_MEM_TAG_WIDTH-1:0] per_core_mem_rsp_tag;
wire [`NUM_CORES-1:0] per_core_mem_rsp_ready;
`ifdef GBAR_ENABLE
wire [`NUM_CORES-1:0] per_core_busy;
VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS]();
VX_gbar_bus_if gbar_bus_if();
for (genvar i = 0; i < `NUM_CORES; i++) begin
`RESET_RELAY (gbar_reset, reset);
`RESET_RELAY (core_reset);
// Merge the per-socket global-barrier buses onto the single gbar_bus_if.
VX_gbar_arb #(
.NUM_REQS (`NUM_SOCKETS),
.OUT_REG ((`NUM_SOCKETS > 2) ? 1 : 0) // gbar_unit has no backpressure
) gbar_arb (
.clk (clk),
.reset (gbar_reset),
.bus_in_if (per_socket_gbar_bus_if),
.bus_out_if (gbar_bus_if)
);
VX_core #(
.CORE_ID(i + (CLUSTER_ID * `NUM_CORES))
) core (
`SCOPE_BIND_VX_cluster_core(i)
VX_gbar_unit #(
.INSTANCE_ID ($sformatf("gbar%0d", CLUSTER_ID))
) gbar_unit (
.clk (clk),
.reset (gbar_reset),
.gbar_bus_if (gbar_bus_if)
);
`endif
.clk (clk),
.reset (core_reset),
.mem_req_valid (per_core_mem_req_valid[i]),
.mem_req_rw (per_core_mem_req_rw [i]),
.mem_req_byteen (per_core_mem_req_byteen[i]),
.mem_req_addr (per_core_mem_req_addr [i]),
.mem_req_data (per_core_mem_req_data [i]),
.mem_req_tag (per_core_mem_req_tag [i]),
.mem_req_ready (per_core_mem_req_ready[i]),
.mem_rsp_valid (per_core_mem_rsp_valid[i]),
.mem_rsp_data (per_core_mem_rsp_data [i]),
.mem_rsp_tag (per_core_mem_rsp_tag [i]),
.mem_rsp_ready (per_core_mem_rsp_ready[i]),
.busy (per_core_busy [i])
);
end
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_ARB_TAG_WIDTH)
) per_socket_dcache_bus_if[`NUM_SOCKETS * DCACHE_NUM_REQS]();
assign busy = (| per_core_busy);
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_WORD_SIZE),
.TAG_WIDTH (ICACHE_ARB_TAG_WIDTH)
) per_socket_icache_bus_if[`NUM_SOCKETS]();
`RESET_RELAY (mem_unit_reset, reset);
VX_mem_unit #(
.CLUSTER_ID (CLUSTER_ID)
) mem_unit (
.clk (clk),
.reset (mem_unit_reset),
if (`L2_ENABLE) begin
`ifdef PERF_ENABLE
VX_perf_cache_if perf_l2cache_if();
.mem_perf_if (mem_perf_if),
`endif
`RESET_RELAY (l2_reset);
.dcache_bus_if (per_socket_dcache_bus_if),
.icache_bus_if (per_socket_icache_bus_if),
VX_cache #(
.CACHE_ID (`L2_CACHE_ID),
.CACHE_SIZE (`L2_CACHE_SIZE),
.CACHE_LINE_SIZE (`L2_CACHE_LINE_SIZE),
.NUM_BANKS (`L2_NUM_BANKS),
.NUM_PORTS (`L2_NUM_PORTS),
.WORD_SIZE (`L2_WORD_SIZE),
.NUM_REQS (`L2_NUM_REQS),
.CREQ_SIZE (`L2_CREQ_SIZE),
.CRSQ_SIZE (`L2_CRSQ_SIZE),
.MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE),
.MREQ_SIZE (`L2_MREQ_SIZE),
.WRITE_ENABLE (1),
.CORE_TAG_WIDTH (`L1_MEM_TAG_WIDTH),
.CORE_TAG_ID_BITS (0),
.MEM_TAG_WIDTH (`L2_MEM_TAG_WIDTH),
.NC_ENABLE (1)
) l2cache (
`SCOPE_BIND_VX_cluster_l2cache
.clk (clk),
.reset (l2_reset),
.mem_bus_if (mem_bus_if)
);
///////////////////////////////////////////////////////////////////////////
wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
wire [`NUM_SOCKETS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_socket_sim_wb_value;
assign sim_ebreak = per_socket_sim_ebreak[0];
assign sim_wb_value = per_socket_sim_wb_value[0];
`UNUSED_VAR (per_socket_sim_ebreak)
`UNUSED_VAR (per_socket_sim_wb_value)
VX_dcr_bus_if socket_dcr_bus_tmp_if();
assign socket_dcr_bus_tmp_if.write_valid = dcr_bus_if.write_valid && (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
assign socket_dcr_bus_tmp_if.write_addr = dcr_bus_if.write_addr;
assign socket_dcr_bus_tmp_if.write_data = dcr_bus_if.write_data;
wire [`NUM_SOCKETS-1:0] per_socket_busy;
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, socket_dcr_bus_tmp_if, (`NUM_SOCKETS > 1));
// Generate all sockets
for (genvar i = 0; i < `NUM_SOCKETS; ++i) begin
`RESET_RELAY (socket_reset, reset);
VX_socket #(
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
) socket (
`SCOPE_IO_BIND (scope_socket+i)
.clk (clk),
.reset (socket_reset),
`ifdef PERF_ENABLE
.perf_cache_if (perf_l2cache_if),
.mem_perf_if (perf_memsys_total_if),
`endif
.dcr_bus_if (socket_dcr_bus_if),
.dcache_bus_if (per_socket_dcache_bus_if[i * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
.icache_bus_if (per_socket_icache_bus_if[i]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_socket_gbar_bus_if[i]),
`endif
// Core request
.core_req_valid (per_core_mem_req_valid),
.core_req_rw (per_core_mem_req_rw),
.core_req_byteen (per_core_mem_req_byteen),
.core_req_addr (per_core_mem_req_addr),
.core_req_data (per_core_mem_req_data),
.core_req_tag (per_core_mem_req_tag),
.core_req_ready (per_core_mem_req_ready),
// Core response
.core_rsp_valid (per_core_mem_rsp_valid),
.core_rsp_data (per_core_mem_rsp_data),
.core_rsp_tag (per_core_mem_rsp_tag),
.core_rsp_ready (per_core_mem_rsp_ready),
`UNUSED_PIN (core_rsp_tmask),
// Memory request
.mem_req_valid (mem_req_valid),
.mem_req_rw (mem_req_rw),
.mem_req_byteen (mem_req_byteen),
.mem_req_addr (mem_req_addr),
.mem_req_data (mem_req_data),
.mem_req_tag (mem_req_tag),
.mem_req_ready (mem_req_ready),
// Memory response
.mem_rsp_valid (mem_rsp_valid),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_data (mem_rsp_data),
.mem_rsp_ready (mem_rsp_ready)
.sim_ebreak (per_socket_sim_ebreak[i]),
.sim_wb_value (per_socket_sim_wb_value[i]),
.busy (per_socket_busy[i])
);
end else begin
`RESET_RELAY (mem_arb_reset);
VX_mem_arb #(
.NUM_REQS (`NUM_CORES),
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
.ADDR_WIDTH (`DCACHE_MEM_ADDR_WIDTH),
.TAG_IN_WIDTH (`L1_MEM_TAG_WIDTH),
.TYPE ("R"),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
// Core request
.req_valid_in (per_core_mem_req_valid),
.req_rw_in (per_core_mem_req_rw),
.req_byteen_in (per_core_mem_req_byteen),
.req_addr_in (per_core_mem_req_addr),
.req_data_in (per_core_mem_req_data),
.req_tag_in (per_core_mem_req_tag),
.req_ready_in (per_core_mem_req_ready),
// Memory request
.req_valid_out (mem_req_valid),
.req_rw_out (mem_req_rw),
.req_byteen_out (mem_req_byteen),
.req_addr_out (mem_req_addr),
.req_data_out (mem_req_data),
.req_tag_out (mem_req_tag),
.req_ready_out (mem_req_ready),
// Core response
.rsp_valid_out (per_core_mem_rsp_valid),
.rsp_data_out (per_core_mem_rsp_data),
.rsp_tag_out (per_core_mem_rsp_tag),
.rsp_ready_out (per_core_mem_rsp_ready),
// Memory response
.rsp_valid_in (mem_rsp_valid),
.rsp_tag_in (mem_rsp_tag),
.rsp_data_in (mem_rsp_data),
.rsp_ready_in (mem_rsp_ready)
);
end
`BUFFER_BUSY (busy, (| per_socket_busy), (`NUM_SOCKETS > 1));
endmodule

View File

@@ -1,138 +0,0 @@
`include "VX_define.vh"
// VX_commit: commit stage.
//
// Collects the commit interfaces of all execution units, reports the number of
// threads committed in each cycle to the CSR unit, and forwards the
// result-producing commits to VX_writeback. Store commits are always accepted
// here and are not passed to the writeback stage.
//
// Parameters:
//   CORE_ID - forwarded to VX_writeback and used in debug trace messages.
module VX_commit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_commit_if.slave alu_commit_if,
VX_commit_if.slave ld_commit_if,
VX_commit_if.slave st_commit_if,
VX_commit_if.slave csr_commit_if,
`ifdef EXT_F_ENABLE
VX_commit_if.slave fpu_commit_if,
`endif
VX_commit_if.slave gpu_commit_if,
// outputs
VX_writeback_if.master writeback_if,
VX_cmt_to_csr_if.master cmt_to_csr_if
);
// CSRs update
// A unit "fires" in a cycle where its commit is both valid and accepted.
wire alu_commit_fire = alu_commit_if.valid && alu_commit_if.ready;
wire ld_commit_fire = ld_commit_if.valid && ld_commit_if.ready;
wire st_commit_fire = st_commit_if.valid && st_commit_if.ready;
wire csr_commit_fire = csr_commit_if.valid && csr_commit_if.ready;
`ifdef EXT_F_ENABLE
wire fpu_commit_fire = fpu_commit_if.valid && fpu_commit_if.ready;
`endif
wire gpu_commit_fire = gpu_commit_if.valid && gpu_commit_if.ready;
wire commit_fire = alu_commit_fire
|| ld_commit_fire
|| st_commit_fire
|| csr_commit_fire
`ifdef EXT_F_ENABLE
|| fpu_commit_fire
`endif
|| gpu_commit_fire;
// One thread-mask slice per unit (6 units with the FPU, 5 without).
`ifdef EXT_F_ENABLE
wire [(6*`NUM_THREADS)-1:0] commit_tmask;
`else
wire [(5*`NUM_THREADS)-1:0] commit_tmask;
`endif
// Wide enough to hold any count from 0 up to the number of mask bits.
wire [$clog2($bits(commit_tmask)+1)-1:0] commit_size;
// Each unit contributes its thread mask only in the cycle it fires.
assign commit_tmask = {
{`NUM_THREADS{alu_commit_fire}} & alu_commit_if.tmask,
{`NUM_THREADS{ld_commit_fire}} & ld_commit_if.tmask,
{`NUM_THREADS{st_commit_fire}} & st_commit_if.tmask,
{`NUM_THREADS{csr_commit_fire}} & csr_commit_if.tmask,
`ifdef EXT_F_ENABLE
{`NUM_THREADS{fpu_commit_fire}} & fpu_commit_if.tmask,
`endif
{`NUM_THREADS{gpu_commit_fire}} & gpu_commit_if.tmask
};
// NOTE(review): POP_COUNT presumably sets commit_size to the number of set
// bits in commit_tmask - confirm against the macro definition.
`POP_COUNT(commit_size, commit_tmask);
// The fire flag and the commit size go to the CSR unit through an
// always-enabled pipeline register.
VX_pipe_register #(
.DATAW (1 + $bits(commit_size)),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({commit_fire, commit_size}),
.data_out ({cmt_to_csr_if.valid, cmt_to_csr_if.commit_size})
);
// Writeback
// Every commit interface except the store one is handed to VX_writeback.
VX_writeback #(
.CORE_ID(CORE_ID)
) writeback (
.clk (clk),
.reset (reset),
.alu_commit_if (alu_commit_if),
.ld_commit_if (ld_commit_if),
.csr_commit_if (csr_commit_if),
`ifdef EXT_F_ENABLE
.fpu_commit_if (fpu_commit_if),
`endif
.gpu_commit_if (gpu_commit_if),
.writeback_if (writeback_if)
);
// Store commits are not connected to the writeback stage, so they are always
// accepted here. GPU commits, in contrast, are forwarded to VX_writeback
// above, which is the only user of gpu_commit_if besides the fire logic.
assign st_commit_if.ready = 1'b1;
`ifdef DBG_TRACE_CORE_PIPELINE
// Debug-only: log every accepted commit, one message per execution unit.
always @(posedge clk) begin
if (alu_commit_if.valid && alu_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd);
`TRACE_ARRAY1D(alu_commit_if.data, `NUM_THREADS);
dpi_trace(" (#%0d)\n", alu_commit_if.uuid);
end
if (ld_commit_if.valid && ld_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.wb, ld_commit_if.rd);
`TRACE_ARRAY1D(ld_commit_if.data, `NUM_THREADS);
dpi_trace(" (#%0d)\n", ld_commit_if.uuid);
end
// Stores are logged without a data field.
if (st_commit_if.valid && st_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d (#%0d)\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd, st_commit_if.uuid);
end
if (csr_commit_if.valid && csr_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=CSR, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.wb, csr_commit_if.rd);
`TRACE_ARRAY1D(csr_commit_if.data, `NUM_THREADS);
dpi_trace(" (#%0d)\n", csr_commit_if.uuid);
end
`ifdef EXT_F_ENABLE
if (fpu_commit_if.valid && fpu_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.wb, fpu_commit_if.rd);
`TRACE_ARRAY1D(fpu_commit_if.data, `NUM_THREADS);
dpi_trace(" (#%0d)\n", fpu_commit_if.uuid);
end
`endif
if (gpu_commit_if.valid && gpu_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=GPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.wb, gpu_commit_if.rd);
`TRACE_ARRAY1D(gpu_commit_if.data, `NUM_THREADS);
dpi_trace(" (#%0d)\n", gpu_commit_if.uuid);
end
end
`endif
endmodule

View File

@@ -1,7 +1,49 @@
`ifndef VX_CONFIG
`define VX_CONFIG
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef XLEN
`ifndef VX_CONFIG_VH
`define VX_CONFIG_VH
// Helper expression macros. Each is guarded so that an earlier definition
// takes precedence. Arguments are expanded textually and may be evaluated
// more than once, so pass side-effect-free expressions only.

// MIN(x, y): the smaller of x and y.
`ifndef MIN
`define MIN(x, y) (((x) < (y)) ? (x) : (y))
`endif
// MAX(x, y): the larger of x and y.
`ifndef MAX
`define MAX(x, y) (((x) > (y)) ? (x) : (y))
`endif
// CLAMP(x, lo, hi): x limited to the range [lo, hi]; the upper bound is
// checked first.
`ifndef CLAMP
`define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
`endif
// UP(x): x, or 1 when x is zero (keeps derived sizes non-zero).
`ifndef UP
`define UP(x) (((x) != 0) ? (x) : 1)
`endif
///////////////////////////////////////////////////////////////////////////////
// Register width selection. XLEN_32 is the default: it is defined here only
// when neither XLEN_32 nor XLEN_64 was supplied externally.
`ifndef XLEN_32
`ifndef XLEN_64
`define XLEN_32
`endif
`endif
// Derive the numeric XLEN from the selected width flag.
`ifdef XLEN_64
`define XLEN 64
`endif
`ifdef XLEN_32
`define XLEN 32
`endif
@@ -25,54 +67,127 @@
`define NUM_BARRIERS 4
`endif
`ifndef L2_ENABLE
`define L2_ENABLE 0
`ifndef SOCKET_SIZE
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
`endif
`ifndef L3_ENABLE
`define L3_ENABLE 0
`ifdef L2_ENABLE
`define L2_ENABLED 1
`else
`define L2_ENABLED 0
`endif
`ifndef SM_ENABLE
`define SM_ENABLE 1
`ifdef L3_ENABLE
`define L3_ENABLED 1
`else
`define L3_ENABLED 0
`endif
`ifdef L1_DISABLE
`define ICACHE_DISABLE
`define DCACHE_DISABLE
`endif
`ifndef MEM_BLOCK_SIZE
`define MEM_BLOCK_SIZE 64
`endif
`ifndef L1_BLOCK_SIZE
`define L1_BLOCK_SIZE ((`L2_ENABLE || `L3_ENABLE) ? 16 : `MEM_BLOCK_SIZE)
`ifndef MEM_ADDR_WIDTH
`ifdef XLEN_64
`define MEM_ADDR_WIDTH 48
`else
`define MEM_ADDR_WIDTH 32
`endif
`endif
`ifndef L1_LINE_SIZE
`ifdef L1_DISABLE
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 4 : `MEM_BLOCK_SIZE)
`else
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 16 : `MEM_BLOCK_SIZE)
`endif
`endif
`ifdef XLEN_64
`ifndef STARTUP_ADDR
`define STARTUP_ADDR 64'h180000000
`endif
`ifndef STACK_BASE_ADDR
`define STACK_BASE_ADDR 64'h1FF000000
`endif
`else
`ifndef STARTUP_ADDR
`define STARTUP_ADDR 32'h80000000
`endif
`ifndef IO_BASE_ADDR
`define IO_BASE_ADDR 32'hFF000000
`ifndef STACK_BASE_ADDR
`define STACK_BASE_ADDR 32'hFF000000
`endif
`ifndef IO_ADDR_SIZE
`define IO_ADDR_SIZE (32'hFFFFFFFF - `IO_BASE_ADDR + 1)
`endif
`ifndef IO_COUT_ADDR
`define IO_COUT_ADDR (32'hFFFFFFFF - `MEM_BLOCK_SIZE + 1)
`endif
`ifndef IO_COUT_SIZE
`define IO_COUT_SIZE `MEM_BLOCK_SIZE
`endif
`ifndef IO_CSR_ADDR
`define IO_CSR_ADDR `IO_BASE_ADDR
`endif
`ifndef SMEM_BASE_ADDR
`define SMEM_BASE_ADDR `IO_BASE_ADDR
`define SMEM_BASE_ADDR `STACK_BASE_ADDR
`endif
`ifndef SMEM_LOG_SIZE
`define SMEM_LOG_SIZE 14
`endif
`ifndef IO_BASE_ADDR
`define IO_BASE_ADDR (`SMEM_BASE_ADDR + (1 << `SMEM_LOG_SIZE))
`endif
`ifndef IO_COUT_ADDR
`define IO_COUT_ADDR `IO_BASE_ADDR
`endif
`define IO_COUT_SIZE `MEM_BLOCK_SIZE
`ifndef IO_CSR_ADDR
`define IO_CSR_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE)
`endif
`define IO_CSR_SIZE (4 * 64 * `NUM_CORES * `NUM_CLUSTERS)
`ifndef STACK_LOG2_SIZE
`define STACK_LOG2_SIZE 13
`endif
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
`define RESET_DELAY 8
// Default stall timeout, overridable by defining STALL_TIMEOUT externally.
// NOTE(review): 1 ** n evaluates to 1 for every n, so this expression is
// always 100000 regardless of L2_ENABLED / L3_ENABLED. If the timeout was
// meant to grow with the number of enabled cache levels, the base of the
// power needs to be larger than 1 - confirm the intended scaling.
`ifndef STALL_TIMEOUT
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
`endif
`ifndef FPU_FPNEW
`ifndef FPU_DSP
`ifndef FPU_DPI
`ifdef SYNTHESIS
`define FPU_DSP
`else
`define FPU_DPI
`endif
`endif
`endif
`endif
`ifndef SYNTHESIS
`ifndef DPI_DISABLE
`define IMUL_DPI
`define IDIV_DPI
`endif
`endif
`ifndef DEBUG_LEVEL
`define DEBUG_LEVEL 3
`endif
// ISA Extensions /////////////////////////////////////////////////////////////
`ifndef EXT_M_DISABLE
`define EXT_M_ENABLE
`endif
@@ -81,230 +196,278 @@
`define EXT_F_ENABLE
`endif
// Device identification
`ifdef EXT_D_ENABLE
`define FLEN_64
`else
`define FLEN_32
`endif
`ifdef FLEN_64
`define FLEN 64
`endif
`ifdef FLEN_32
`define FLEN 32
`endif
`ifdef XLEN_64
`ifdef FLEN_32
`define FPU_RV64F
`endif
`endif
`define ISA_STD_A 0
`define ISA_STD_C 2
`define ISA_STD_D 3
`define ISA_STD_E 4
`define ISA_STD_F 5
`define ISA_STD_H 7
`define ISA_STD_I 8
`define ISA_STD_N 13
`define ISA_STD_Q 16
`define ISA_STD_S 18
`define ISA_STD_U 20
`define ISA_EXT_TEX 0
`define ISA_EXT_RASTER 1
`define ISA_EXT_ROP 2
`ifdef EXT_A_ENABLE
`define EXT_A_ENABLED 1
`else
`define EXT_A_ENABLED 0
`endif
`ifdef EXT_C_ENABLE
`define EXT_C_ENABLED 1
`else
`define EXT_C_ENABLED 0
`endif
`ifdef EXT_D_ENABLE
`define EXT_D_ENABLED 1
`else
`define EXT_D_ENABLED 0
`endif
`ifdef EXT_F_ENABLE
`define EXT_F_ENABLED 1
`else
`define EXT_F_ENABLED 0
`endif
`ifdef EXT_M_ENABLE
`define EXT_M_ENABLED 1
`else
`define EXT_M_ENABLED 0
`endif
`define ISA_X_ENABLED 0
`define MISA_EXT 0
// MISA_STD: bit mask of implemented standard extensions, one bit per letter
// (bit 0 = 'A' ... bit 25 = 'Z'). Bits for A, C, D, F and M follow the
// EXT_*_ENABLED flags, X follows ISA_X_ENABLED, I and U are always set, and
// all remaining bits are 0.
`define MISA_STD (`EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \
| (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \
| (`EXT_C_ENABLED << 2) /* C - Compressed extension */ \
| (`EXT_D_ENABLED << 3) /* D - Double precision floating-point extension */ \
| (0 << 4) /* E - RV32E base ISA */ \
| (`EXT_F_ENABLED << 5) /* F - Single precision floating-point extension */ \
| (0 << 6) /* G - Additional standard extensions present */ \
| (0 << 7) /* H - Hypervisor mode implemented */ \
| (1 << 8) /* I - RV32I/64I/128I base ISA */ \
| (0 << 9) /* J - Reserved */ \
| (0 << 10) /* K - Reserved */ \
| (0 << 11) /* L - Tentatively reserved for Decimal Floating-Point extension */ \
| (`EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \
| (0 << 13) /* N - User level interrupts supported */ \
| (0 << 14) /* O - Reserved */ \
| (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \
| (0 << 16) /* Q - Quad-precision floating-point extension */ \
| (0 << 17) /* R - Reserved */ \
| (0 << 18) /* S - Supervisor mode implemented */ \
| (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
| (1 << 20) /* U - User mode implemented */ \
| (0 << 21) /* V - Tentatively reserved for Vector extension */ \
| (0 << 22) /* W - Reserved */ \
| (`ISA_X_ENABLED << 23) /* X - Non-standard extensions present */ \
| (0 << 24) /* Y - Reserved */ \
| (0 << 25) /* Z - Reserved */
// Device identification //////////////////////////////////////////////////////
`define VENDOR_ID 0
`define ARCHITECTURE_ID 0
`define IMPLEMENTATION_ID 0
///////////////////////////////////////////////////////////////////////////////
// Pipeline Configuration /////////////////////////////////////////////////////
`ifndef LATENCY_IMUL
`define LATENCY_IMUL 3
// Issue width
`ifndef ISSUE_WIDTH
`define ISSUE_WIDTH `MIN(`NUM_WARPS, 4)
`endif
`ifndef LATENCY_FNCP
`define LATENCY_FNCP 2
// Number of ALU units
`ifndef NUM_ALU_LANES
`define NUM_ALU_LANES `UP(`NUM_THREADS / 2)
`endif
`ifndef NUM_ALU_BLOCKS
`define NUM_ALU_BLOCKS `UP(`ISSUE_WIDTH / 1)
`endif
`ifndef LATENCY_FMA
`define LATENCY_FMA 4
// Number of FPU units
`ifndef NUM_FPU_LANES
`define NUM_FPU_LANES `UP(`NUM_THREADS / 2)
`endif
`ifndef NUM_FPU_BLOCKS
`define NUM_FPU_BLOCKS `UP(`ISSUE_WIDTH / 1)
`endif
`ifndef LATENCY_FDIV
`ifdef ALTERA_S10
`define LATENCY_FDIV 34
`else
`define LATENCY_FDIV 15
`endif
// Number of LSU units
`ifndef NUM_LSU_LANES
`define NUM_LSU_LANES `MIN(`NUM_THREADS, 4)
`endif
`ifndef LATENCY_FSQRT
`ifdef ALTERA_S10
`define LATENCY_FSQRT 25
`else
`define LATENCY_FSQRT 10
// Number of SFU units
`ifndef NUM_SFU_LANES
`define NUM_SFU_LANES `MIN(`NUM_THREADS, 4)
`endif
`endif
`ifndef LATENCY_FDIVSQRT
`define LATENCY_FDIVSQRT 32
`endif
`ifndef LATENCY_FCVT
`define LATENCY_FCVT 5
`endif
`define RESET_DELAY 6
// CSR Addresses //////////////////////////////////////////////////////////////
// All addresses are 12-bit CSR numbers.
// User Floating-Point CSRs
`define CSR_FFLAGS 12'h001
`define CSR_FRM 12'h002
`define CSR_FCSR 12'h003
// Supervisor and machine-mode CSRs (numbers as assigned by the RISC-V privileged specification)
`define CSR_SATP 12'h180
`define CSR_PMPCFG0 12'h3A0
`define CSR_PMPADDR0 12'h3B0
`define CSR_MSTATUS 12'h300
`define CSR_MISA 12'h301
`define CSR_MEDELEG 12'h302
`define CSR_MIDELEG 12'h303
`define CSR_MIE 12'h304
`define CSR_MTVEC 12'h305
`define CSR_MEPC 12'h341
// Machine Performance-monitoring counters
// Every counter has two addresses: the base address reads bits [31:0] and the
// matching `_H` address (base + 12'h080) reads the bits above 31.
`define CSR_MPM_BASE 12'hB00
`define CSR_MPM_BASE_H 12'hB80
// PERF: pipeline
`define CSR_MCYCLE 12'hB00
`define CSR_MCYCLE_H 12'hB80
`define CSR_MPM_RESERVED 12'hB01
`define CSR_MPM_RESERVED_H 12'hB81
`define CSR_MINSTRET 12'hB02
`define CSR_MINSTRET_H 12'hB82
`define CSR_MPM_IBUF_ST 12'hB03
`define CSR_MPM_IBUF_ST_H 12'hB83
`define CSR_MPM_SCRB_ST 12'hB04
`define CSR_MPM_SCRB_ST_H 12'hB84
`define CSR_MPM_ALU_ST 12'hB05
`define CSR_MPM_ALU_ST_H 12'hB85
`define CSR_MPM_LSU_ST 12'hB06
`define CSR_MPM_LSU_ST_H 12'hB86
`define CSR_MPM_CSR_ST 12'hB07
`define CSR_MPM_CSR_ST_H 12'hB87
`define CSR_MPM_FPU_ST 12'hB08
`define CSR_MPM_FPU_ST_H 12'hB88
`define CSR_MPM_GPU_ST 12'hB09
`define CSR_MPM_GPU_ST_H 12'hB89
// PERF: decode
`define CSR_MPM_LOADS 12'hB0A
`define CSR_MPM_LOADS_H 12'hB8A
`define CSR_MPM_STORES 12'hB0B
`define CSR_MPM_STORES_H 12'hB8B
`define CSR_MPM_BRANCHES 12'hB0C
`define CSR_MPM_BRANCHES_H 12'hB8C
// PERF: icache
`define CSR_MPM_ICACHE_READS 12'hB0D // total reads
`define CSR_MPM_ICACHE_READS_H 12'hB8D
`define CSR_MPM_ICACHE_MISS_R 12'hB0E // read misses
`define CSR_MPM_ICACHE_MISS_R_H 12'hB8E
// PERF: dcache
`define CSR_MPM_DCACHE_READS 12'hB0F // total reads
`define CSR_MPM_DCACHE_READS_H 12'hB8F
`define CSR_MPM_DCACHE_WRITES 12'hB10 // total writes
`define CSR_MPM_DCACHE_WRITES_H 12'hB90
`define CSR_MPM_DCACHE_MISS_R 12'hB11 // read misses
`define CSR_MPM_DCACHE_MISS_R_H 12'hB91
`define CSR_MPM_DCACHE_MISS_W 12'hB12 // write misses
`define CSR_MPM_DCACHE_MISS_W_H 12'hB92
`define CSR_MPM_DCACHE_BANK_ST 12'hB13 // bank conflicts
`define CSR_MPM_DCACHE_BANK_ST_H 12'hB93
`define CSR_MPM_DCACHE_MSHR_ST 12'hB14 // MSHR stalls
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB94
// PERF: smem
`define CSR_MPM_SMEM_READS 12'hB15 // total reads
`define CSR_MPM_SMEM_READS_H 12'hB95
`define CSR_MPM_SMEM_WRITES 12'hB16 // total writes
`define CSR_MPM_SMEM_WRITES_H 12'hB96
`define CSR_MPM_SMEM_BANK_ST 12'hB17 // bank conflicts
`define CSR_MPM_SMEM_BANK_ST_H 12'hB97
// PERF: memory
`define CSR_MPM_MEM_READS 12'hB18 // memory reads
`define CSR_MPM_MEM_READS_H 12'hB98
`define CSR_MPM_MEM_WRITES 12'hB19 // memory writes
`define CSR_MPM_MEM_WRITES_H 12'hB99
`define CSR_MPM_MEM_LAT 12'hB1A // memory latency
`define CSR_MPM_MEM_LAT_H 12'hB9A
// PERF: texunit
`define CSR_MPM_TEX_READS 12'hB1B // texture accesses
`define CSR_MPM_TEX_READS_H 12'hB9B
`define CSR_MPM_TEX_LAT 12'hB1C // texture latency
`define CSR_MPM_TEX_LAT_H 12'hB9C
// Machine Information Registers
`define CSR_MVENDORID 12'hF11
`define CSR_MARCHID 12'hF12
`define CSR_MIMPID 12'hF13
`define CSR_MHARTID 12'hF14
// User SIMT CSRs
// Note: CSR_GWID is an alias of CSR_MHARTID (same address), and CSR_TMASK
// (12'hCC4) sits numerically between CSR_LWID and CSR_GCID.
`define CSR_WTID 12'hCC0
`define CSR_LTID 12'hCC1
`define CSR_GTID 12'hCC2
`define CSR_LWID 12'hCC3
`define CSR_GWID `CSR_MHARTID
`define CSR_GCID 12'hCC5
`define CSR_TMASK 12'hCC4
// Machine SIMT CSRs
`define CSR_NT 12'hFC0
`define CSR_NW 12'hFC1
`define CSR_NC 12'hFC2
////////// Texture Units //////////////////////////////////////////////////////
`define NUM_TEX_UNITS 2
`define TEX_SUBPIXEL_BITS 8
`define TEX_DIM_BITS 15
`define TEX_LOD_MAX `TEX_DIM_BITS
`define TEX_LOD_BITS 4
`define TEX_FXD_BITS 32
// Fractional bit count is the sum of the dimension and sub-pixel bit counts.
`define TEX_FXD_FRAC (`TEX_DIM_BITS+`TEX_SUBPIXEL_BITS)
// Texture state indices. Fixed fields occupy 0..6; the mip offset for level
// `lod` is stored at index 7+lod.
`define TEX_STATE_ADDR 0
`define TEX_STATE_WIDTH 1
`define TEX_STATE_HEIGHT 2
`define TEX_STATE_FORMAT 3
`define TEX_STATE_FILTER 4
`define TEX_STATE_WRAPU 5
`define TEX_STATE_WRAPV 6
`define TEX_STATE_MIPOFF(lod) (7+(lod))
// Total state count: the last index is TEX_STATE_MIPOFF(TEX_LOD_MAX), so one
// mip offset exists for each level 0..TEX_LOD_MAX inclusive.
`define NUM_TEX_STATES (`TEX_STATE_MIPOFF(`TEX_LOD_MAX)+1)
// Texture CSR addresses. CSR_TEX_UNIT is followed by one CSR per texture state:
// CSR address = CSR_TEX_STATE_BEGIN + state index.
`define CSR_TEX_UNIT 12'hFD0
`define CSR_TEX_STATE_BEGIN 12'hFD1
`define CSR_TEX_ADDR (`CSR_TEX_STATE_BEGIN+`TEX_STATE_ADDR)
`define CSR_TEX_WIDTH (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WIDTH)
`define CSR_TEX_HEIGHT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_HEIGHT)
`define CSR_TEX_FORMAT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FORMAT)
`define CSR_TEX_FILTER (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FILTER)
`define CSR_TEX_WRAPU (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPU)
`define CSR_TEX_WRAPV (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPV)
`define CSR_TEX_MIPOFF(lod) (`CSR_TEX_STATE_BEGIN+`TEX_STATE_MIPOFF(lod))
// Exclusive upper bound of the texture state CSR range (BEGIN + NUM_TEX_STATES).
`define CSR_TEX_STATE_END (`CSR_TEX_STATE_BEGIN+`NUM_TEX_STATES)
// Converts a texture state CSR address back into its state index.
`define CSR_TEX_STATE(addr) ((addr) - `CSR_TEX_STATE_BEGIN)
// Pipeline Queues ////////////////////////////////////////////////////////////
// Size of Instruction Buffer
`ifndef IBUF_SIZE
`define IBUF_SIZE 2
`define IBUF_SIZE (2 * (`NUM_WARPS / `ISSUE_WIDTH))
`endif
// Size of LSU Request Queue
`ifndef LSUQ_SIZE
`define LSUQ_SIZE (`NUM_WARPS * 2)
`define LSUQ_SIZE (2 * (`NUM_THREADS / `NUM_LSU_LANES))
`endif
// LSU Duplicate Address Check
`ifdef LSU_DUP
`define LSU_DUP_ENABLED 1
`else
`define LSU_DUP_ENABLED 0
`endif
`ifdef GBAR_ENABLE
`define GBAR_ENABLED 1
`else
`define GBAR_ENABLED 0
`endif
`ifndef LATENCY_IMUL
`ifdef VIVADO
`define LATENCY_IMUL 4
`endif
`ifdef QUARTUS
`define LATENCY_IMUL 3
`endif
`ifndef LATENCY_IMUL
`define LATENCY_IMUL 4
`endif
`endif
// Floating-Point Units ///////////////////////////////////////////////////////
// Size of FPU Request Queue
`ifndef FPUQ_SIZE
`define FPUQ_SIZE 8
`ifndef FPU_REQ_QUEUE_SIZE
`define FPU_REQ_QUEUE_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
`endif
// Texture Unit Request Queue
`ifndef TEXQ_SIZE
`define TEXQ_SIZE (`NUM_WARPS * 2)
// FNCP Latency
`ifndef LATENCY_FNCP
`define LATENCY_FNCP 2
`endif
// FMA Latency
`ifndef LATENCY_FMA
`ifdef FPU_DPI
`define LATENCY_FMA 4
`endif
`ifdef FPU_FPNEW
`define LATENCY_FMA 4
`endif
`ifdef FPU_DSP
`ifdef QUARTUS
`define LATENCY_FMA 4
`endif
`ifdef VIVADO
`define LATENCY_FMA 16
`endif
`ifndef LATENCY_FMA
`define LATENCY_FMA 4
`endif
`endif
`endif
// FDIV Latency
`ifndef LATENCY_FDIV
`ifdef FPU_DPI
`define LATENCY_FDIV 15
`endif
`ifdef FPU_FPNEW
`define LATENCY_FDIV 16
`endif
`ifdef FPU_DSP
`ifdef QUARTUS
`define LATENCY_FDIV 15
`endif
`ifdef VIVADO
`define LATENCY_FDIV 28
`endif
`ifndef LATENCY_FDIV
`define LATENCY_FDIV 16
`endif
`endif
`endif
// FSQRT Latency
`ifndef LATENCY_FSQRT
`ifdef FPU_DPI
`define LATENCY_FSQRT 10
`endif
`ifdef FPU_FPNEW
`define LATENCY_FSQRT 16
`endif
`ifdef FPU_DSP
`ifdef QUARTUS
`define LATENCY_FSQRT 10
`endif
`ifdef VIVADO
`define LATENCY_FSQRT 28
`endif
`ifndef LATENCY_FSQRT
`define LATENCY_FSQRT 16
`endif
`endif
`endif
// FCVT Latency
`ifndef LATENCY_FCVT
`define LATENCY_FCVT 5
`endif
// Icache Configurable Knobs //////////////////////////////////////////////////
// Size of cache in bytes
`ifndef ICACHE_SIZE
`define ICACHE_SIZE 16384
// Cache Enable
`ifndef ICACHE_DISABLE
`define ICACHE_ENABLE
`endif
`ifdef ICACHE_ENABLE
`define ICACHE_ENABLED 1
`else
`define ICACHE_ENABLED 0
`define NUM_ICACHES 0
`endif
// Core Request Queue Size
`ifndef ICACHE_CREQ_SIZE
`define ICACHE_CREQ_SIZE 0
// Number of Cache Units
`ifndef NUM_ICACHES
`define NUM_ICACHES `UP(`NUM_CORES / 4)
`endif
// Cache Size
`ifndef ICACHE_SIZE
`define ICACHE_SIZE 16384
`endif
// Core Response Queue Size
@@ -314,7 +477,7 @@
// Miss Handling Register Size
`ifndef ICACHE_MSHR_SIZE
`define ICACHE_MSHR_SIZE `NUM_WARPS
`define ICACHE_MSHR_SIZE 16
`endif
// Memory Request Queue Size
@@ -327,26 +490,38 @@
`define ICACHE_MRSQ_SIZE 0
`endif
// Number of Associative Ways
`ifndef ICACHE_NUM_WAYS
`define ICACHE_NUM_WAYS 2
`endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
// Size of cache in bytes
// Cache Enable
`ifndef DCACHE_DISABLE
`define DCACHE_ENABLE
`endif
`ifdef DCACHE_ENABLE
`define DCACHE_ENABLED 1
`else
`define DCACHE_ENABLED 0
`define NUM_DCACHES 0
`define DCACHE_NUM_BANKS 1
`endif
// Number of Cache Units
`ifndef NUM_DCACHES
`define NUM_DCACHES `UP(`NUM_CORES / 4)
`endif
// Cache Size
`ifndef DCACHE_SIZE
`define DCACHE_SIZE 16384
`endif
// Number of banks
// Number of Banks
`ifndef DCACHE_NUM_BANKS
`define DCACHE_NUM_BANKS `NUM_THREADS
`endif
// Number of ports per bank
`ifndef DCACHE_NUM_PORTS
`define DCACHE_NUM_PORTS 1
`endif
// Core Request Queue Size
`ifndef DCACHE_CREQ_SIZE
`define DCACHE_CREQ_SIZE 0
`define DCACHE_NUM_BANKS (`NUM_LSU_LANES)
`endif
// Core Response Queue Size
@@ -356,7 +531,7 @@
// Miss Handling Register Size
`ifndef DCACHE_MSHR_SIZE
`define DCACHE_MSHR_SIZE `LSUQ_SIZE
`define DCACHE_MSHR_SIZE 16
`endif
// Memory Request Queue Size
@@ -369,54 +544,42 @@
`define DCACHE_MRSQ_SIZE 0
`endif
// Number of Associative Ways
`ifndef DCACHE_NUM_WAYS
`define DCACHE_NUM_WAYS 2
`endif
// SM Configurable Knobs //////////////////////////////////////////////////////
// per thread stack size
`ifndef STACK_LOG2_SIZE
`define STACK_LOG2_SIZE 10
`ifndef SM_DISABLE
`define SM_ENABLE
`endif
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
// Size of cache in bytes
`ifndef SMEM_SIZE
`define SMEM_SIZE (`STACK_SIZE * `NUM_WARPS * `NUM_THREADS)
`ifdef SM_ENABLE
`define SM_ENABLED 1
`else
`define SM_ENABLED 0
`define SMEM_NUM_BANKS 1
`endif
// Number of banks
// Number of Banks
`ifndef SMEM_NUM_BANKS
`define SMEM_NUM_BANKS `NUM_THREADS
`endif
// Core Request Queue Size
`ifndef SMEM_CREQ_SIZE
`define SMEM_CREQ_SIZE 2
`endif
// Core Response Queue Size
`ifndef SMEM_CRSQ_SIZE
`define SMEM_CRSQ_SIZE 2
`define SMEM_NUM_BANKS (`NUM_LSU_LANES)
`endif
// L2cache Configurable Knobs /////////////////////////////////////////////////
// Size of cache in bytes
// Cache Size
`ifndef L2_CACHE_SIZE
`define L2_CACHE_SIZE 131072
`ifdef ALTERA_S10
`define L2_CACHE_SIZE 2097152
`else
`define L2_CACHE_SIZE 1048576
`endif
`endif
// Number of banks
// Number of Banks
`ifndef L2_NUM_BANKS
`define L2_NUM_BANKS ((`NUM_CORES < 4) ? `NUM_CORES : 4)
`endif
// Number of ports per bank
`ifndef L2_NUM_PORTS
`define L2_NUM_PORTS 1
`endif
// Core Request Queue Size
`ifndef L2_CREQ_SIZE
`define L2_CREQ_SIZE 0
`define L2_NUM_BANKS 2
`endif
// Core Response Queue Size
@@ -439,26 +602,25 @@
`define L2_MRSQ_SIZE 0
`endif
// Number of Associative Ways
`ifndef L2_NUM_WAYS
`define L2_NUM_WAYS 4
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Size of cache in bytes
// Cache Size
`ifndef L3_CACHE_SIZE
`ifdef ALTERA_S10
`define L3_CACHE_SIZE 2097152
`else
`define L3_CACHE_SIZE 1048576
`endif
`endif
// Number of banks
// Number of Banks
`ifndef L3_NUM_BANKS
`define L3_NUM_BANKS ((`NUM_CLUSTERS < 4) ? `NUM_CORES : 4)
`endif
// Number of ports per bank
`ifndef L3_NUM_PORTS
`define L3_NUM_PORTS 1
`endif
// Core Request Queue Size
`ifndef L3_CREQ_SIZE
`define L3_CREQ_SIZE 0
`define L3_NUM_BANKS `MIN(4, `NUM_CLUSTERS)
`endif
// Core Response Queue Size
@@ -481,4 +643,9 @@
`define L3_MRSQ_SIZE 0
`endif
`endif
// Number of Associative Ways
`ifndef L3_NUM_WAYS
`define L3_NUM_WAYS 4
`endif
`endif // VX_CONFIG_VH

View File

@@ -1,156 +0,0 @@
`include "VX_define.vh"
// Core wrapper: instantiates the execution pipeline and the memory unit,
// connects them through the icache/dcache request/response interfaces, and
// exposes the memory unit's memory-side interface as flat ports.
module VX_core #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_VX_core

    // Clock
    input wire clk,
    input wire reset,

    // Memory request
    output wire mem_req_valid,
    output wire mem_req_rw,
    output wire [`DCACHE_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
    output wire [`DCACHE_MEM_ADDR_WIDTH-1:0] mem_req_addr,
    output wire [`DCACHE_MEM_DATA_WIDTH-1:0] mem_req_data,
    output wire [`L1_MEM_TAG_WIDTH-1:0] mem_req_tag,
    input wire mem_req_ready,

    // Memory response
    input wire mem_rsp_valid,
    input wire [`DCACHE_MEM_DATA_WIDTH-1:0] mem_rsp_data,
    input wire [`L1_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
    output wire mem_rsp_ready,

    // Status
    output wire busy
);

`ifdef PERF_ENABLE
    // Memory-system performance counters, shared by mem_unit and pipeline.
    VX_perf_memsys_if perf_memsys_if();
`endif

    // ---- Pipeline <-> cache interfaces -----------------------------------

    VX_icache_req_if #(
        .WORD_SIZE (`ICACHE_WORD_SIZE),
        .TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
    ) icache_req_if();

    VX_icache_rsp_if #(
        .WORD_SIZE (`ICACHE_WORD_SIZE),
        .TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
    ) icache_rsp_if();

    VX_dcache_req_if #(
        .NUM_REQS  (`DCACHE_NUM_REQS),
        .WORD_SIZE (`DCACHE_WORD_SIZE),
        .TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
    ) dcache_req_if();

    VX_dcache_rsp_if #(
        .NUM_REQS  (`DCACHE_NUM_REQS),
        .WORD_SIZE (`DCACHE_WORD_SIZE),
        .TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
    ) dcache_rsp_if();

    // ---- Memory unit <-> external memory interfaces ----------------------

    VX_mem_req_if #(
        .DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
        .ADDR_WIDTH (`DCACHE_MEM_ADDR_WIDTH),
        .TAG_WIDTH  (`L1_MEM_TAG_WIDTH)
    ) mem_req_if();

    VX_mem_rsp_if #(
        .DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
        .TAG_WIDTH  (`L1_MEM_TAG_WIDTH)
    ) mem_rsp_if();

    // Response channel: flat ports drive the interface, ready flows back out.
    assign mem_rsp_if.valid = mem_rsp_valid;
    assign mem_rsp_if.data  = mem_rsp_data;
    assign mem_rsp_if.tag   = mem_rsp_tag;
    assign mem_rsp_ready    = mem_rsp_if.ready;

    // Request channel: the interface drives the flat ports, ready flows back in.
    assign mem_req_if.ready = mem_req_ready;
    assign mem_req_valid    = mem_req_if.valid;
    assign mem_req_rw       = mem_req_if.rw;
    assign mem_req_byteen   = mem_req_if.byteen;
    assign mem_req_addr     = mem_req_if.addr;
    assign mem_req_data     = mem_req_if.data;
    assign mem_req_tag      = mem_req_if.tag;

    // ---- Execution pipeline ----------------------------------------------

    VX_pipeline #(
        .CORE_ID (CORE_ID)
    ) pipeline (
        `SCOPE_BIND_VX_core_pipeline
    `ifdef PERF_ENABLE
        .perf_memsys_if     (perf_memsys_if),
    `endif

        .clk                (clk),
        .reset              (reset),

        // Icache core request
        .icache_req_valid   (icache_req_if.valid),
        .icache_req_addr    (icache_req_if.addr),
        .icache_req_tag     (icache_req_if.tag),
        .icache_req_ready   (icache_req_if.ready),

        // Icache core response
        .icache_rsp_valid   (icache_rsp_if.valid),
        .icache_rsp_data    (icache_rsp_if.data),
        .icache_rsp_tag     (icache_rsp_if.tag),
        .icache_rsp_ready   (icache_rsp_if.ready),

        // Dcache core request
        .dcache_req_valid   (dcache_req_if.valid),
        .dcache_req_rw      (dcache_req_if.rw),
        .dcache_req_byteen  (dcache_req_if.byteen),
        .dcache_req_addr    (dcache_req_if.addr),
        .dcache_req_data    (dcache_req_if.data),
        .dcache_req_tag     (dcache_req_if.tag),
        .dcache_req_ready   (dcache_req_if.ready),

        // Dcache core response
        .dcache_rsp_valid   (dcache_rsp_if.valid),
        .dcache_rsp_tmask   (dcache_rsp_if.tmask),
        .dcache_rsp_data    (dcache_rsp_if.data),
        .dcache_rsp_tag     (dcache_rsp_if.tag),
        .dcache_rsp_ready   (dcache_rsp_if.ready),

        // Status
        .busy               (busy)
    );

    // ---- Memory unit ------------------------------------------------------

    VX_mem_unit #(
        .CORE_ID (CORE_ID)
    ) mem_unit (
        `SCOPE_BIND_VX_core_mem_unit
    `ifdef PERF_ENABLE
        .perf_memsys_if (perf_memsys_if),
    `endif

        .clk            (clk),
        .reset          (reset),

        // Core <-> Icache
        .icache_req_if  (icache_req_if),
        .icache_rsp_if  (icache_rsp_if),

        // Core <-> Dcache
        .dcache_req_if  (dcache_req_if),
        .dcache_rsp_if  (dcache_rsp_if),

        // Memory
        .mem_req_if     (mem_req_if),
        .mem_rsp_if     (mem_rsp_if)
    );

endmodule

View File

@@ -1,265 +0,0 @@
`include "VX_define.vh"
// CSR storage and address decode for one core.
//
// Write side (clocked): updates the per-warp floating-point status
// (fflags/frm), the machine CSRs kept here (satp, mstatus, medeleg, mideleg,
// mie, mtvec, mepc, pmpcfg0, pmpaddr0), and forwards every write to the
// texture CSR interface when EXT_TEX_ENABLE is defined.
//
// Read side (combinational): decodes `read_addr` into `read_data`. Addresses
// that are not recognised trip the runtime assertion while `read_enable` is
// high.
//
// Changes relative to the previous revision:
//  * the machine CSRs are cleared on reset, so a read before the first write
//    returns 0 instead of X;
//  * reads of addresses that are accepted as valid but have no backing value
//    here (unimplemented performance counters, texture CSRs) return 0
//    instead of X.
module VX_csr_data #(
    parameter CORE_ID = 0
) (
    input wire clk,
    input wire reset,

`ifdef PERF_ENABLE
`ifdef EXT_TEX_ENABLE
    VX_perf_tex_if.slave perf_tex_if,
`endif
    VX_perf_memsys_if.slave perf_memsys_if,
    VX_perf_pipeline_if.slave perf_pipeline_if,
`endif

    VX_cmt_to_csr_if.slave cmt_to_csr_if,
    VX_fetch_to_csr_if.slave fetch_to_csr_if,

`ifdef EXT_F_ENABLE
    VX_fpu_to_csr_if.slave fpu_to_csr_if,
`endif

`ifdef EXT_TEX_ENABLE
    VX_tex_csr_if.master tex_csr_if,
`endif

    // Read port (combinational)
    input wire read_enable,
    input wire [`UUID_BITS-1:0] read_uuid,
    input wire[`CSR_ADDR_BITS-1:0] read_addr,
    input wire[`NW_BITS-1:0] read_wid,
    output wire[31:0] read_data,

    // Write port (registered on clk)
    input wire write_enable,
    input wire [`UUID_BITS-1:0] write_uuid,
    input wire[`CSR_ADDR_BITS-1:0] write_addr,
    input wire[`NW_BITS-1:0] write_wid,
    input wire[31:0] write_data,

    // Cycle counter only advances while this is high
    input wire busy
);
    import fpu_types::*;

    // Machine CSRs stored in this module
    reg [`CSR_WIDTH-1:0] csr_satp;
    reg [`CSR_WIDTH-1:0] csr_mstatus;
    reg [`CSR_WIDTH-1:0] csr_medeleg;
    reg [`CSR_WIDTH-1:0] csr_mideleg;
    reg [`CSR_WIDTH-1:0] csr_mie;
    reg [`CSR_WIDTH-1:0] csr_mtvec;
    reg [`CSR_WIDTH-1:0] csr_mepc;
    reg [`CSR_WIDTH-1:0] csr_pmpcfg [0:0];
    reg [`CSR_WIDTH-1:0] csr_pmpaddr [0:0];

    // Free-running counters
    reg [63:0] csr_cycle;
    reg [63:0] csr_instret;

    // Per-warp floating-point CSR: {frm, fflags}
    reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr;

    always @(posedge clk) begin
        if (reset) begin
            fcsr           <= '0;
            // Give the machine CSRs a defined value after reset.
            csr_satp       <= '0;
            csr_mstatus    <= '0;
            csr_medeleg    <= '0;
            csr_mideleg    <= '0;
            csr_mie        <= '0;
            csr_mtvec      <= '0;
            csr_mepc       <= '0;
            csr_pmpcfg[0]  <= '0;
            csr_pmpaddr[0] <= '0;
        end else begin
        `ifdef EXT_F_ENABLE
            // Flags reported by the FPU are OR-ed into the warp's fflags.
            if (fpu_to_csr_if.write_enable) begin
                fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
                                                                 | fpu_to_csr_if.write_fflags;
            end
        `endif
            // An explicit CSR write is placed after the FPU update, so it
            // takes precedence when both target the same bits in a cycle.
            if (write_enable) begin
                case (write_addr)
                    `CSR_FFLAGS:   fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0];
                    `CSR_FRM:      fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0];
                    `CSR_FCSR:     fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0];
                    `CSR_SATP:     csr_satp <= write_data[`CSR_WIDTH-1:0];
                    `CSR_MSTATUS:  csr_mstatus <= write_data[`CSR_WIDTH-1:0];
                    `CSR_MEDELEG:  csr_medeleg <= write_data[`CSR_WIDTH-1:0];
                    `CSR_MIDELEG:  csr_mideleg <= write_data[`CSR_WIDTH-1:0];
                    `CSR_MIE:      csr_mie <= write_data[`CSR_WIDTH-1:0];
                    `CSR_MTVEC:    csr_mtvec <= write_data[`CSR_WIDTH-1:0];
                    `CSR_MEPC:     csr_mepc <= write_data[`CSR_WIDTH-1:0];
                    `CSR_PMPCFG0:  csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0];
                    `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0];
                    default: begin
                    `ifdef EXT_TEX_ENABLE
                        // Texture CSRs are stored behind tex_csr_if; any other
                        // address is an error.
                        `ASSERT((write_addr == `CSR_TEX_UNIT)
                             || (write_addr >= `CSR_TEX_STATE_BEGIN
                              && write_addr < `CSR_TEX_STATE_END),
                                ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
                    `else
                        // write_enable is known to be high here, so this always fires.
                        `ASSERT(~write_enable, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
                    `endif
                    end
                endcase
            end
        end
    end

    `UNUSED_VAR (write_data)

    // TEX CSRs
`ifdef EXT_TEX_ENABLE
    // Every write is forwarded unfiltered to the texture CSR interface.
    assign tex_csr_if.write_enable = write_enable;
    assign tex_csr_if.write_addr   = write_addr;
    assign tex_csr_if.write_data   = write_data;
    assign tex_csr_if.write_uuid   = write_uuid;
`endif

    // Cycle and retired-instruction counters
    always @(posedge clk) begin
        if (reset) begin
            csr_cycle   <= 0;
            csr_instret <= 0;
        end else begin
            if (busy) begin
                csr_cycle <= csr_cycle + 1;
            end
            if (cmt_to_csr_if.valid) begin
                csr_instret <= csr_instret + 64'(cmt_to_csr_if.commit_size);
            end
        end
    end

    reg [31:0] read_data_r;
    reg read_addr_valid_r;

    // Read decode. Data stays 'x only for addresses that are flagged invalid.
    always @(*) begin
        read_data_r       = 'x;
        read_addr_valid_r = 1;
        case (read_addr)
            `CSR_FFLAGS     : read_data_r = 32'(fcsr[read_wid][`FFLAGS_BITS-1:0]);
            `CSR_FRM        : read_data_r = 32'(fcsr[read_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS]);
            `CSR_FCSR       : read_data_r = 32'(fcsr[read_wid]);

            // Only the warp-level part of the ID is produced here.
            `CSR_WTID       ,
            `CSR_LTID       ,
            `CSR_LWID       : read_data_r = 32'(read_wid);
            `CSR_GTID       ,
            /*`CSR_MHARTID  ,*/
            `CSR_GWID       : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
            `CSR_GCID       : read_data_r = CORE_ID;

            `CSR_TMASK      : read_data_r = 32'(fetch_to_csr_if.thread_masks[read_wid]);

            `CSR_NT         : read_data_r = `NUM_THREADS;
            `CSR_NW         : read_data_r = `NUM_WARPS;
            `CSR_NC         : read_data_r = `NUM_CORES * `NUM_CLUSTERS;

            `CSR_MCYCLE     : read_data_r = csr_cycle[31:0];
            `CSR_MCYCLE_H   : read_data_r = 32'(csr_cycle[`PERF_CTR_BITS-1:32]);
            `CSR_MINSTRET   : read_data_r = csr_instret[31:0];
            `CSR_MINSTRET_H : read_data_r = 32'(csr_instret[`PERF_CTR_BITS-1:32]);

        `ifdef PERF_ENABLE
            // PERF: pipeline
            `CSR_MPM_IBUF_ST    : read_data_r = perf_pipeline_if.ibf_stalls[31:0];
            `CSR_MPM_IBUF_ST_H  : read_data_r = 32'(perf_pipeline_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_SCRB_ST    : read_data_r = perf_pipeline_if.scb_stalls[31:0];
            `CSR_MPM_SCRB_ST_H  : read_data_r = 32'(perf_pipeline_if.scb_stalls[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_ALU_ST     : read_data_r = perf_pipeline_if.alu_stalls[31:0];
            `CSR_MPM_ALU_ST_H   : read_data_r = 32'(perf_pipeline_if.alu_stalls[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_LSU_ST     : read_data_r = perf_pipeline_if.lsu_stalls[31:0];
            `CSR_MPM_LSU_ST_H   : read_data_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_CSR_ST     : read_data_r = perf_pipeline_if.csr_stalls[31:0];
            `CSR_MPM_CSR_ST_H   : read_data_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]);
        `ifdef EXT_F_ENABLE
            `CSR_MPM_FPU_ST     : read_data_r = perf_pipeline_if.fpu_stalls[31:0];
            `CSR_MPM_FPU_ST_H   : read_data_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]);
        `else
            `CSR_MPM_FPU_ST     : read_data_r = '0;
            `CSR_MPM_FPU_ST_H   : read_data_r = '0;
        `endif
            `CSR_MPM_GPU_ST     : read_data_r = perf_pipeline_if.gpu_stalls[31:0];
            `CSR_MPM_GPU_ST_H   : read_data_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]);
            // PERF: decode
            `CSR_MPM_LOADS      : read_data_r = perf_pipeline_if.loads[31:0];
            `CSR_MPM_LOADS_H    : read_data_r = 32'(perf_pipeline_if.loads[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_STORES     : read_data_r = perf_pipeline_if.stores[31:0];
            `CSR_MPM_STORES_H   : read_data_r = 32'(perf_pipeline_if.stores[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_BRANCHES   : read_data_r = perf_pipeline_if.branches[31:0];
            `CSR_MPM_BRANCHES_H : read_data_r = 32'(perf_pipeline_if.branches[`PERF_CTR_BITS-1:32]);
            // PERF: icache
            `CSR_MPM_ICACHE_READS     : read_data_r = perf_memsys_if.icache_reads[31:0];
            `CSR_MPM_ICACHE_READS_H   : read_data_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_ICACHE_MISS_R    : read_data_r = perf_memsys_if.icache_read_misses[31:0];
            `CSR_MPM_ICACHE_MISS_R_H  : read_data_r = 32'(perf_memsys_if.icache_read_misses[`PERF_CTR_BITS-1:32]);
            // PERF: dcache
            `CSR_MPM_DCACHE_READS     : read_data_r = perf_memsys_if.dcache_reads[31:0];
            `CSR_MPM_DCACHE_READS_H   : read_data_r = 32'(perf_memsys_if.dcache_reads[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_DCACHE_WRITES    : read_data_r = perf_memsys_if.dcache_writes[31:0];
            `CSR_MPM_DCACHE_WRITES_H  : read_data_r = 32'(perf_memsys_if.dcache_writes[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_DCACHE_MISS_R    : read_data_r = perf_memsys_if.dcache_read_misses[31:0];
            `CSR_MPM_DCACHE_MISS_R_H  : read_data_r = 32'(perf_memsys_if.dcache_read_misses[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_DCACHE_MISS_W    : read_data_r = perf_memsys_if.dcache_write_misses[31:0];
            `CSR_MPM_DCACHE_MISS_W_H  : read_data_r = 32'(perf_memsys_if.dcache_write_misses[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_DCACHE_BANK_ST   : read_data_r = perf_memsys_if.dcache_bank_stalls[31:0];
            `CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_DCACHE_MSHR_ST   : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0];
            `CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]);
            // PERF: smem
            `CSR_MPM_SMEM_READS       : read_data_r = perf_memsys_if.smem_reads[31:0];
            `CSR_MPM_SMEM_READS_H     : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_SMEM_WRITES      : read_data_r = perf_memsys_if.smem_writes[31:0];
            `CSR_MPM_SMEM_WRITES_H    : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_SMEM_BANK_ST     : read_data_r = perf_memsys_if.smem_bank_stalls[31:0];
            `CSR_MPM_SMEM_BANK_ST_H   : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
            // PERF: memory
            `CSR_MPM_MEM_READS        : read_data_r = perf_memsys_if.mem_reads[31:0];
            `CSR_MPM_MEM_READS_H      : read_data_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_MEM_WRITES       : read_data_r = perf_memsys_if.mem_writes[31:0];
            `CSR_MPM_MEM_WRITES_H     : read_data_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_MEM_LAT          : read_data_r = perf_memsys_if.mem_latency[31:0];
            `CSR_MPM_MEM_LAT_H        : read_data_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]);
        `ifdef EXT_TEX_ENABLE
            // PERF: texunit
            `CSR_MPM_TEX_READS        : read_data_r = perf_tex_if.mem_reads[31:0];
            `CSR_MPM_TEX_READS_H      : read_data_r = 32'(perf_tex_if.mem_reads[`PERF_CTR_BITS-1:32]);
            `CSR_MPM_TEX_LAT          : read_data_r = perf_tex_if.mem_latency[31:0];
            `CSR_MPM_TEX_LAT_H        : read_data_r = 32'(perf_tex_if.mem_latency[`PERF_CTR_BITS-1:32]);
        `endif
            // PERF: reserved
            `CSR_MPM_RESERVED         : read_data_r = '0;
            `CSR_MPM_RESERVED_H       : read_data_r = '0;
        `endif

            `CSR_SATP      : read_data_r = 32'(csr_satp);

            `CSR_MSTATUS   : read_data_r = 32'(csr_mstatus);
            `CSR_MISA      : read_data_r = `ISA_CODE;
            `CSR_MEDELEG   : read_data_r = 32'(csr_medeleg);
            `CSR_MIDELEG   : read_data_r = 32'(csr_mideleg);
            `CSR_MIE       : read_data_r = 32'(csr_mie);
            `CSR_MTVEC     : read_data_r = 32'(csr_mtvec);

            `CSR_MEPC      : read_data_r = 32'(csr_mepc);

            `CSR_PMPCFG0   : read_data_r = 32'(csr_pmpcfg[0]);
            `CSR_PMPADDR0  : read_data_r = 32'(csr_pmpaddr[0]);

            `CSR_MVENDORID : read_data_r = `VENDOR_ID;
            `CSR_MARCHID   : read_data_r = `ARCHITECTURE_ID;
            `CSR_MIMPID    : read_data_r = `IMPLEMENTATION_ID;

            default: begin
                // Any address inside the 32-entry performance counter windows
                // is accepted; counters without a backing value read as zero.
                if ((read_addr >= `CSR_MPM_BASE   && read_addr < (`CSR_MPM_BASE + 32))
                 || (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32))) begin
                    read_addr_valid_r = 1;
                    read_data_r       = '0;
                end else
            `ifdef EXT_TEX_ENABLE
                // Texture CSRs are accepted but hold no readable state here.
                if ((read_addr == `CSR_TEX_UNIT)
                 || (read_addr >= `CSR_TEX_STATE_BEGIN
                  && read_addr < `CSR_TEX_STATE_END)) begin
                    read_addr_valid_r = 1;
                    read_data_r       = '0;
                end else
            `endif
                    read_addr_valid_r = 0;
            end
        endcase
    end

    `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: %0h (#%0d)", $time, read_addr, read_uuid))

    assign read_data = read_data_r;

`ifdef EXT_F_ENABLE
    // Rounding mode of the warp selected by the FPU
    assign fpu_to_csr_if.read_frm = fcsr[fpu_to_csr_if.read_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS];
`endif

endmodule

View File

@@ -1,151 +0,0 @@
`include "VX_define.vh"
// CSR execution unit.
//
// Stage s0 (combinational): reads the addressed CSR from VX_csr_data and
// computes the value to write back according to the operation type
// (read-write, read-set, read-clear).
// Stage s1 (after pipe_reg): presents the read value on csr_commit_if and
// performs the CSR write while the commit is valid.
//
// Change relative to the previous revision: write-to-read forwarding is now
// applied only when the instruction in s1 actually writes (csr_we_s1). With
// no write pending, the value read from VX_csr_data is already current.
// Forwarding the s1 snapshot in that case could return a stale value for CSRs
// that change over time (e.g. cycle/instret counters).
module VX_csr_unit #(
    parameter CORE_ID = 0
) (
    input wire clk,
    input wire reset,

`ifdef PERF_ENABLE
`ifdef EXT_TEX_ENABLE
    VX_perf_tex_if.slave perf_tex_if,
`endif
    VX_perf_memsys_if.slave perf_memsys_if,
    VX_perf_pipeline_if.slave perf_pipeline_if,
`endif

    VX_cmt_to_csr_if.slave cmt_to_csr_if,
    VX_fetch_to_csr_if.slave fetch_to_csr_if,
    VX_csr_req_if.slave csr_req_if,
    VX_commit_if.master csr_commit_if,

`ifdef EXT_F_ENABLE
    VX_fpu_to_csr_if.slave fpu_to_csr_if,
    input wire[`NUM_WARPS-1:0] fpu_pending,
`endif

`ifdef EXT_TEX_ENABLE
    VX_tex_csr_if.master tex_csr_if,
`endif

    // One bit per warp: set while that warp has a CSR request in flight
    output wire[`NUM_WARPS-1:0] pending,
    input wire busy
);
    wire csr_we_s1;
    wire [`CSR_ADDR_BITS-1:0] csr_addr_s1;
    wire [31:0] csr_read_data;
    wire [31:0] csr_read_data_s1;
    wire [31:0] csr_updated_data_s1;

    // The CSR file is written whenever the s1 instruction is valid and
    // carries a write.
    wire write_enable = csr_commit_if.valid && csr_we_s1;

    // Operand: zero-extended immediate or rs1 value
    wire [31:0] csr_req_data = csr_req_if.use_imm ? 32'(csr_req_if.imm) : csr_req_if.rs1_data;

    VX_csr_data #(
        .CORE_ID(CORE_ID)
    ) csr_data (
        .clk            (clk),
        .reset          (reset),
    `ifdef PERF_ENABLE
    `ifdef EXT_TEX_ENABLE
        .perf_tex_if    (perf_tex_if),
    `endif
        .perf_memsys_if (perf_memsys_if),
        .perf_pipeline_if(perf_pipeline_if),
    `endif
        .cmt_to_csr_if  (cmt_to_csr_if),
        .fetch_to_csr_if(fetch_to_csr_if),
    `ifdef EXT_F_ENABLE
        .fpu_to_csr_if  (fpu_to_csr_if),
    `endif
    `ifdef EXT_TEX_ENABLE
        .tex_csr_if     (tex_csr_if),
    `endif
        .read_enable    (csr_req_if.valid),
        .read_uuid      (csr_req_if.uuid),
        .read_addr      (csr_req_if.addr),
        .read_wid       (csr_req_if.wid),
        .read_data      (csr_read_data),
        .write_enable   (write_enable),
        .write_uuid     (csr_commit_if.uuid),
        .write_addr     (csr_addr_s1),
        .write_wid      (csr_commit_if.wid),
        .write_data     (csr_updated_data_s1),
        .busy           (busy)
    );

    // Forward the value being written in s1 when the incoming request reads
    // the same CSR of the same warp. Qualified with csr_we_s1 so that an s1
    // instruction that does not write never overrides the live CSR value.
    wire write_hazard = (csr_addr_s1 == csr_req_if.addr)
                     && (csr_commit_if.wid == csr_req_if.wid)
                     && csr_commit_if.valid
                     && csr_we_s1;

    wire [31:0] csr_read_data_qual = write_hazard ? csr_updated_data_s1 : csr_read_data;

    reg [31:0] csr_updated_data;
    reg csr_we_s0_unqual;

    always @(*) begin
        // Set/clear operations only write when the operand is non-zero.
        csr_we_s0_unqual = (csr_req_data != 0);
        case (csr_req_if.op_type)
            `INST_CSR_RW: begin
                csr_updated_data = csr_req_data;
                csr_we_s0_unqual = 1; // read-write always writes
            end
            `INST_CSR_RS: begin
                csr_updated_data = csr_read_data_qual | csr_req_data;
            end
            //`INST_CSR_RC
            default: begin
                csr_updated_data = csr_read_data_qual & ~csr_req_data;
            end
        endcase
    end

`ifdef EXT_F_ENABLE
    // Hold the request while the FPU reports work pending for this warp.
    wire stall_in = fpu_pending[csr_req_if.wid];
`else
    wire stall_in = 0;
`endif

    wire csr_req_valid = csr_req_if.valid && !stall_in;

    // Back-pressure from the commit stage
    wire stall_out = ~csr_commit_if.ready && csr_commit_if.valid;

    VX_pipe_register #(
        .DATAW  (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32),
        .RESETW (1)
    ) pipe_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (!stall_out),
        .data_in  ({csr_req_valid, csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}),
        .data_out ({csr_commit_if.valid, csr_commit_if.uuid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1})
    );

    // Per-thread result. Thread-ID CSRs are expanded here: CSR_WTID returns
    // the lane index, CSR_LTID/CSR_GTID return read_value * NUM_THREADS + lane.
    // Every other CSR returns the same value on all lanes.
    for (genvar i = 0; i < `NUM_THREADS; i++) begin
        assign csr_commit_if.data[i] = (csr_addr_s1 == `CSR_WTID) ? i :
                                       (csr_addr_s1 == `CSR_LTID
                                     || csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
                                                                     csr_read_data_s1;
    end

    assign csr_commit_if.eop = 1'b1;

    // can accept new request?
    assign csr_req_if.ready = ~(stall_out || stall_in);

    // pending request
    // Cleared on commit handshake, set on request handshake. The set is
    // assigned last, so it wins if both hit the same warp in one cycle.
    reg [`NUM_WARPS-1:0] pending_r;
    always @(posedge clk) begin
        if (reset) begin
            pending_r <= 0;
        end else begin
            if (csr_commit_if.valid && csr_commit_if.ready) begin
                pending_r[csr_commit_if.wid] <= 0;
            end
            if (csr_req_if.valid && csr_req_if.ready) begin
                pending_r[csr_req_if.wid] <= 1;
            end
        end
    end
    assign pending = pending_r;

endmodule

View File

@@ -1,495 +0,0 @@
`include "VX_define.vh"
`ifdef DBG_TRACE_CORE_PIPELINE
`include "VX_trace_instr.vh"
`endif
// Helper macros for the decoder: `USED_IREG(x)` / `USED_FREG(x)` expand to an
// assignment of the instruction register field `x` to the variable `x_r`.
// With EXT_F_ENABLE the target gets one extra MSB: 0 for USED_IREG and 1 for
// USED_FREG.
`ifdef EXT_F_ENABLE
`define USED_IREG(r) \
r``_r = {1'b0, ``r}
`define USED_FREG(r) \
r``_r = {1'b1, ``r}
`else
// Without EXT_F_ENABLE the field is assigned unchanged and USED_FREG is not
// defined.
`define USED_IREG(r) \
r``_r = ``r
`endif
// VX_decode: purely combinational instruction decoder.
//
// Takes one fetched 32-bit instruction from ifetch_rsp_if and produces, on
// decode_if, the execution unit (ex_type), operation (op_type), operation
// modifier (op_mod), register indices, immediate and operand-select flags.
// It also raises join_if for the JOIN instruction and reports on wstall_if
// whether the accepted instruction stalls its warp (is_wstall).
//
// clk/reset are only used by the optional perf counters and debug trace.
module VX_decode #(
    parameter CORE_ID = 0
) (
    input wire clk,
    input wire reset,

`ifdef PERF_ENABLE
    VX_perf_pipeline_if.decode perf_decode_if,
`endif

    // inputs
    VX_ifetch_rsp_if.slave ifetch_rsp_if,

    // outputs
    VX_decode_if.master decode_if,
    VX_wstall_if.master wstall_if,
    VX_join_if.master join_if
);
    `UNUSED_PARAM (CORE_ID)
    `UNUSED_VAR (clk)
    `UNUSED_VAR (reset)

    // decoded fields (driven by the combinational block below)
    reg [`EX_BITS-1:0] ex_type;
    reg [`INST_OP_BITS-1:0] op_type;
    reg [`INST_MOD_BITS-1:0] op_mod;
    reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
    reg [31:0] imm;
    reg use_rd, use_PC, use_imm;
    reg is_join, is_wstall;

    // raw instruction fields
    wire [31:0] instr = ifetch_rsp_if.data;
    wire [6:0] opcode = instr[6:0];
    wire [1:0] func2 = instr[26:25];
    wire [2:0] func3 = instr[14:12];
    wire [6:0] func7 = instr[31:25];
    wire [11:0] u_12 = instr[31:20];

    wire [4:0] rd = instr[11:7];
    wire [4:0] rs1 = instr[19:15];
    wire [4:0] rs2 = instr[24:20];
    wire [4:0] rs3 = instr[31:27];

    // immediates, one per instruction format
    wire [19:0] upper_imm = {func7, rs2, rs1, func3};
    // shift-immediate forms (func3 == x01) only use the 5-bit shamt in rs2
    wire [11:0] alu_imm = (func3[0] && ~func3[1]) ? {{7{1'b0}}, rs2} : u_12;
    wire [11:0] s_imm = {func7, rd};
    wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
    wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};

    `UNUSED_VAR (rs3)

    always @(*) begin
        // defaults: no unit selected, no registers used
        ex_type = 0;
        op_type = 'x;
        op_mod = 0;
        rd_r = 0;
        rs1_r = 0;
        rs2_r = 0;
        rs3_r = 0;
        imm = 'x;
        use_imm = 0;
        use_PC = 0;
        use_rd = 0;
        is_join = 0;
        is_wstall = 0;

        case (opcode)
            `INST_I: begin
                ex_type = `EX_ALU;
                case (func3)
                    3'h0: op_type = `INST_OP_BITS'(`INST_ALU_ADD);
                    3'h1: op_type = `INST_OP_BITS'(`INST_ALU_SLL);
                    3'h2: op_type = `INST_OP_BITS'(`INST_ALU_SLT);
                    3'h3: op_type = `INST_OP_BITS'(`INST_ALU_SLTU);
                    3'h4: op_type = `INST_OP_BITS'(`INST_ALU_XOR);
                    3'h5: op_type = (func7[5]) ? `INST_OP_BITS'(`INST_ALU_SRA) : `INST_OP_BITS'(`INST_ALU_SRL);
                    3'h6: op_type = `INST_OP_BITS'(`INST_ALU_OR);
                    3'h7: op_type = `INST_OP_BITS'(`INST_ALU_AND);
                    default:;
                endcase
                use_rd = 1;
                use_imm = 1;
                imm = {{20{alu_imm[11]}}, alu_imm};
                `USED_IREG (rd);
                `USED_IREG (rs1);
            end
            `INST_R: begin
                ex_type = `EX_ALU;
            // Multiply/divide decode belongs to the M extension, so it is
            // gated by EXT_M_ENABLE (the same switch that sets the M bit of
            // ISA_CODE), not by the floating-point extension switch.
            `ifdef EXT_M_ENABLE
                if (func7[0]) begin
                    case (func3)
                        3'h0: op_type = `INST_OP_BITS'(`INST_MUL_MUL);
                        3'h1: op_type = `INST_OP_BITS'(`INST_MUL_MULH);
                        3'h2: op_type = `INST_OP_BITS'(`INST_MUL_MULHSU);
                        3'h3: op_type = `INST_OP_BITS'(`INST_MUL_MULHU);
                        3'h4: op_type = `INST_OP_BITS'(`INST_MUL_DIV);
                        3'h5: op_type = `INST_OP_BITS'(`INST_MUL_DIVU);
                        3'h6: op_type = `INST_OP_BITS'(`INST_MUL_REM);
                        3'h7: op_type = `INST_OP_BITS'(`INST_MUL_REMU);
                        default:;
                    endcase
                    op_mod = 2; // op_mod[1] marks a MUL-class op (INST_ALU_IS_MUL)
                end else
            `endif
                begin
                    case (func3)
                        3'h0: op_type = (func7[5]) ? `INST_OP_BITS'(`INST_ALU_SUB) : `INST_OP_BITS'(`INST_ALU_ADD);
                        3'h1: op_type = `INST_OP_BITS'(`INST_ALU_SLL);
                        3'h2: op_type = `INST_OP_BITS'(`INST_ALU_SLT);
                        3'h3: op_type = `INST_OP_BITS'(`INST_ALU_SLTU);
                        3'h4: op_type = `INST_OP_BITS'(`INST_ALU_XOR);
                        3'h5: op_type = (func7[5]) ? `INST_OP_BITS'(`INST_ALU_SRA) : `INST_OP_BITS'(`INST_ALU_SRL);
                        3'h6: op_type = `INST_OP_BITS'(`INST_ALU_OR);
                        3'h7: op_type = `INST_OP_BITS'(`INST_ALU_AND);
                        default:;
                    endcase
                end
                use_rd = 1;
                `USED_IREG (rd);
                `USED_IREG (rs1);
                `USED_IREG (rs2);
            end
            `INST_LUI: begin
                ex_type = `EX_ALU;
                op_type = `INST_OP_BITS'(`INST_ALU_LUI);
                use_rd = 1;
                use_imm = 1;
                imm = {upper_imm, 12'(0)};
                `USED_IREG (rd);
                rs1_r = 0;
            end
            `INST_AUIPC: begin
                ex_type = `EX_ALU;
                op_type = `INST_OP_BITS'(`INST_ALU_AUIPC);
                use_rd = 1;
                use_imm = 1;
                use_PC = 1;
                imm = {upper_imm, 12'(0)};
                `USED_IREG (rd);
            end
            `INST_JAL: begin
                ex_type = `EX_ALU;
                op_type = `INST_OP_BITS'(`INST_BR_JAL);
                op_mod = 1; // op_mod[0] marks a branch-class op (INST_ALU_IS_BR)
                use_rd = 1;
                use_imm = 1;
                use_PC = 1;
                is_wstall = 1;
                imm = {{11{jal_imm[20]}}, jal_imm};
                `USED_IREG (rd);
            end
            `INST_JALR: begin
                ex_type = `EX_ALU;
                op_type = `INST_OP_BITS'(`INST_BR_JALR);
                op_mod = 1;
                use_rd = 1;
                use_imm = 1;
                is_wstall = 1;
                imm = {{20{u_12[11]}}, u_12};
                `USED_IREG (rd);
                `USED_IREG (rs1);
            end
            `INST_B: begin
                ex_type = `EX_ALU;
                case (func3)
                    3'h0: op_type = `INST_OP_BITS'(`INST_BR_EQ);
                    3'h1: op_type = `INST_OP_BITS'(`INST_BR_NE);
                    3'h4: op_type = `INST_OP_BITS'(`INST_BR_LT);
                    3'h5: op_type = `INST_OP_BITS'(`INST_BR_GE);
                    3'h6: op_type = `INST_OP_BITS'(`INST_BR_LTU);
                    3'h7: op_type = `INST_OP_BITS'(`INST_BR_GEU);
                    default:;
                endcase
                op_mod = 1;
                use_imm = 1;
                use_PC = 1;
                is_wstall = 1;
                imm = {{19{b_imm[12]}}, b_imm};
                `USED_IREG (rs1);
                `USED_IREG (rs2);
            end
            `INST_FENCE: begin
                ex_type = `EX_LSU;
                op_mod = `INST_MOD_BITS'(1); // matches INST_LSU_IS_FENCE
            end
            `INST_SYS : begin
                if (func3[1:0] != 0) begin
                    // CSR access: address in imm[CSR_ADDR_BITS-1:0], optional
                    // 5-bit immediate operand placed right above it
                    ex_type = `EX_CSR;
                    op_type = `INST_OP_BITS'(func3[1:0]);
                    use_rd = 1;
                    use_imm = func3[2];
                    imm[`CSR_ADDR_BITS-1:0] = u_12; // addr
                    `USED_IREG (rd);
                    if (func3[2]) begin
                        imm[`CSR_ADDR_BITS +: `NRI_BITS] = rs1; // imm
                    end else begin
                        `USED_IREG (rs1);
                    end
                end else begin
                    ex_type = `EX_ALU;
                    case (u_12)
                        12'h000: op_type = `INST_OP_BITS'(`INST_BR_ECALL);
                        12'h001: op_type = `INST_OP_BITS'(`INST_BR_EBREAK);
                        12'h002: op_type = `INST_OP_BITS'(`INST_BR_URET);
                        12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET);
                        12'h302: op_type = `INST_OP_BITS'(`INST_BR_MRET);
                        default:;
                    endcase
                    op_mod = 1;
                    use_rd = 1;
                    use_imm = 1;
                    use_PC = 1;
                    is_wstall = 1;
                    imm = 32'd4;
                    `USED_IREG (rd);
                end
            end
        `ifdef EXT_F_ENABLE
            `INST_FL,
        `endif
            `INST_L: begin
                ex_type = `EX_LSU;
                op_type = `INST_OP_BITS'({1'b0, func3});
                use_rd = 1;
                imm = {{20{u_12[11]}}, u_12};
            `ifdef EXT_F_ENABLE
                // opcode[2] distinguishes INST_FL from INST_L
                if (opcode[2]) begin
                    `USED_FREG (rd);
                end else
            `endif
                `USED_IREG (rd);
                `USED_IREG (rs1);
            end
        `ifdef EXT_F_ENABLE
            `INST_FS,
        `endif
            `INST_S: begin
                ex_type = `EX_LSU;
                op_type = `INST_OP_BITS'({1'b1, func3});
                imm = {{20{s_imm[11]}}, s_imm};
                `USED_IREG (rs1);
            `ifdef EXT_F_ENABLE
                // opcode[2] distinguishes INST_FS from INST_S
                if (opcode[2]) begin
                    `USED_FREG (rs2);
                end else
            `endif
                `USED_IREG (rs2);
            end
        `ifdef EXT_F_ENABLE
            `INST_FMADD,
            `INST_FMSUB,
            `INST_FNMSUB,
            `INST_FNMADD: begin
                ex_type = `EX_FPU;
                // opcode[3:0] equals the INST_FPU_MADD/MSUB/NMSUB/NMADD codes
                op_type = `INST_OP_BITS'(opcode[3:0]);
                op_mod = func3;
                use_rd = 1;
                `USED_FREG (rd);
                `USED_FREG (rs1);
                `USED_FREG (rs2);
                `USED_FREG (rs3);
            end
            `INST_FCI: begin
                ex_type = `EX_FPU;
                op_mod = func3;
                use_rd = 1;
                case (func7)
                    7'h00, // FADD
                    7'h04, // FSUB
                    7'h08, // FMUL
                    7'h0C: begin // FDIV
                        op_type = `INST_OP_BITS'(func7[3:0]);
                        `USED_FREG (rd);
                        `USED_FREG (rs1);
                        `USED_FREG (rs2);
                    end
                    7'h2C: begin
                        op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
                        `USED_FREG (rd);
                        `USED_FREG (rs1);
                    end
                    7'h50: begin
                        op_type = `INST_OP_BITS'(`INST_FPU_CMP);
                        `USED_IREG (rd);
                        `USED_FREG (rs1);
                        `USED_FREG (rs2);
                    end
                    7'h60: begin
                        op_type = (instr[20]) ? `INST_OP_BITS'(`INST_FPU_CVTWUS) : `INST_OP_BITS'(`INST_FPU_CVTWS);
                        `USED_IREG (rd);
                        `USED_FREG (rs1);
                    end
                    7'h68: begin
                        op_type = (instr[20]) ? `INST_OP_BITS'(`INST_FPU_CVTSWU) : `INST_OP_BITS'(`INST_FPU_CVTSW);
                        `USED_FREG (rd);
                        `USED_IREG (rs1);
                    end
                    7'h10: begin
                        // FSGNJ=0, FSGNJN=1, FSGNJX=2
                        op_type = `INST_OP_BITS'(`INST_FPU_MISC);
                        op_mod = {1'b0, func3[1:0]};
                        `USED_FREG (rd);
                        `USED_FREG (rs1);
                        `USED_FREG (rs2);
                    end
                    7'h14: begin
                        // FMIN=3, FMAX=4
                        op_type = `INST_OP_BITS'(`INST_FPU_MISC);
                        op_mod = func3[0] ? 4 : 3;
                        `USED_FREG (rd);
                        `USED_FREG (rs1);
                        `USED_FREG (rs2);
                    end
                    7'h70: begin
                        if (func3[0]) begin
                            // FCLASS
                            op_type = `INST_OP_BITS'(`INST_FPU_CLASS);
                        end else begin
                            // FMV.X.W=5
                            op_type = `INST_OP_BITS'(`INST_FPU_MISC);
                            op_mod = 5;
                        end
                        `USED_IREG (rd);
                        `USED_FREG (rs1);
                    end
                    7'h78: begin
                        // FMV.W.X=6
                        op_type = `INST_OP_BITS'(`INST_FPU_MISC);
                        op_mod = 6;
                        `USED_FREG (rd);
                        `USED_IREG (rs1);
                    end
                    default:;
                endcase
            end
        `endif
            `INST_GPGPU: begin
                ex_type = `EX_GPU;
                case (func3)
                    3'h0: begin
                        op_type = rs2[0] ? `INST_OP_BITS'(`INST_GPU_PRED) : `INST_OP_BITS'(`INST_GPU_TMC);
                        is_wstall = 1;
                        `USED_IREG (rs1);
                    end
                    3'h1: begin
                        op_type = `INST_OP_BITS'(`INST_GPU_WSPAWN);
                        `USED_IREG (rs1);
                        `USED_IREG (rs2);
                    end
                    3'h2: begin
                        op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
                        is_wstall = 1;
                        `USED_IREG (rs1);
                    end
                    3'h3: begin
                        op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
                        is_join = 1;
                    end
                    3'h4: begin
                        op_type = `INST_OP_BITS'(`INST_GPU_BAR);
                        is_wstall = 1;
                        `USED_IREG (rs1);
                        `USED_IREG (rs2);
                    end
                    3'h5: begin
                        // routed to the LSU with op_mod 2 (matches INST_LSU_IS_PREFETCH)
                        ex_type = `EX_LSU;
                        op_type = `INST_OP_BITS'(`INST_LSU_LW);
                        op_mod = `INST_MOD_BITS'(2);
                        `USED_IREG (rs1);
                    end
                    default:;
                endcase
            end
            `INST_GPU: begin
                case (func3)
                `ifdef EXT_TEX_ENABLE
                    3'h0: begin
                        ex_type = `EX_GPU;
                        op_type = `INST_OP_BITS'(`INST_GPU_TEX);
                        op_mod = `INST_MOD_BITS'(func2);
                        use_rd = 1;
                        `USED_IREG (rd);
                        `USED_IREG (rs1);
                        `USED_IREG (rs2);
                        `USED_IREG (rs3);
                    end
                `endif
                    default:;
                endcase
            end
            default:;
        endcase
    end

    `UNUSED_VAR (func2)

    // disable write to integer register r0
    wire wb = use_rd && (| rd_r);

    // forward the fetch metadata together with the decoded fields
    assign decode_if.valid = ifetch_rsp_if.valid;
    assign decode_if.uuid = ifetch_rsp_if.uuid;
    assign decode_if.wid = ifetch_rsp_if.wid;
    assign decode_if.tmask = ifetch_rsp_if.tmask;
    assign decode_if.PC = ifetch_rsp_if.PC;
    assign decode_if.ex_type = ex_type;
    assign decode_if.op_type = op_type;
    assign decode_if.op_mod = op_mod;
    assign decode_if.wb = wb;
    assign decode_if.rd = rd_r;
    assign decode_if.rs1 = rs1_r;
    assign decode_if.rs2 = rs2_r;
    assign decode_if.rs3 = rs3_r;
    assign decode_if.imm = imm;
    assign decode_if.use_PC = use_PC;
    assign decode_if.use_imm = use_imm;

    ///////////////////////////////////////////////////////////////////////////

    // an instruction is consumed when fetch is valid and decode accepts it
    wire ifetch_rsp_fire = ifetch_rsp_if.valid && ifetch_rsp_if.ready;

    assign join_if.valid = ifetch_rsp_fire && is_join;
    assign join_if.wid = ifetch_rsp_if.wid;

    assign wstall_if.valid = ifetch_rsp_fire;
    assign wstall_if.wid = ifetch_rsp_if.wid;
    assign wstall_if.stalled = is_wstall;

    // back-pressure passes straight through from the decode consumer
    assign ifetch_rsp_if.ready = decode_if.ready;

`ifdef PERF_ENABLE
    // per-thread counts of loads, stores and branches for accepted instructions
    wire [$clog2(`NUM_THREADS+1)-1:0] perf_loads_per_cycle;
    wire [$clog2(`NUM_THREADS+1)-1:0] perf_stores_per_cycle;
    wire [$clog2(`NUM_THREADS+1)-1:0] perf_branches_per_cycle;

    // LSU memory ops are split into loads/stores by the writeback flag
    wire [`NUM_THREADS-1:0] perf_loads_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && decode_if.wb}};
    wire [`NUM_THREADS-1:0] perf_stores_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && ~decode_if.wb}};
    wire [`NUM_THREADS-1:0] perf_branches_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_ALU && `INST_ALU_IS_BR(decode_if.op_mod)}};

    `POP_COUNT(perf_loads_per_cycle, perf_loads_per_mask);
    `POP_COUNT(perf_stores_per_cycle, perf_stores_per_mask);
    `POP_COUNT(perf_branches_per_cycle, perf_branches_per_mask);

    reg [`PERF_CTR_BITS-1:0] perf_loads;
    reg [`PERF_CTR_BITS-1:0] perf_stores;
    reg [`PERF_CTR_BITS-1:0] perf_branches;

    always @(posedge clk) begin
        if (reset) begin
            perf_loads <= 0;
            perf_stores <= 0;
            perf_branches <= 0;
        end else begin
            if (decode_if.valid && decode_if.ready) begin
                perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_loads_per_cycle);
                perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_stores_per_cycle);
                perf_branches <= perf_branches + `PERF_CTR_BITS'(perf_branches_per_cycle);
            end
        end
    end

    assign perf_decode_if.loads = perf_loads;
    assign perf_decode_if.stores = perf_stores;
    assign perf_decode_if.branches = perf_branches;
`endif

`ifdef DBG_TRACE_CORE_PIPELINE
    // print one trace line per instruction handed to the next stage
    always @(posedge clk) begin
        if (decode_if.valid && decode_if.ready) begin
            dpi_trace("%d: core%0d-decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_if.wid, decode_if.PC);
            trace_ex_type(decode_if.ex_type);
            dpi_trace(", op=");
            trace_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
            dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b (#%0d)\n",
                decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm, decode_if.uuid);
        end
    end
`endif

endmodule

View File

@@ -1,416 +1,425 @@
// Include guard for the shared core definitions header.
`ifndef VX_DEFINE
`define VX_DEFINE
`include "VX_platform.vh"
`include "VX_config.vh"
///////////////////////////////////////////////////////////////////////////////
// Index widths derived from the configured counts via LOG2UP
// (LOG2UP is not defined in this file; presumably it comes from VX_platform.vh).
`define NW_BITS `LOG2UP(`NUM_WARPS)
`define NT_BITS `LOG2UP(`NUM_THREADS)
`define NC_BITS `LOG2UP(`NUM_CORES)
`define NB_BITS `LOG2UP(`NUM_BARRIERS)
// Integer register file size and its index width.
`define NUM_IREGS 32
`define NRI_BITS `LOG2UP(`NUM_IREGS)
`define NTEX_BITS `LOG2UP(`NUM_TEX_UNITS)
// Total register count: doubled when the F extension is enabled.
`ifdef EXT_F_ENABLE
`define NUM_REGS (2 * `NUM_IREGS)
`else
`define NUM_REGS `NUM_IREGS
`endif
// Width of a register index covering NUM_REGS entries.
`define NR_BITS `LOG2UP(`NUM_REGS)
// Fixed widths for CSR addresses, performance counters and instruction UUIDs.
`define CSR_ADDR_BITS 12
`define CSR_WIDTH 12
`define PERF_CTR_BITS 44
`define UUID_BITS 44
///////////////////////////////////////////////////////////////////////////////
// Execution unit selectors (EX_BITS wide).
`define EX_NOP 3'h0
`define EX_ALU 3'h1
`define EX_LSU 3'h2
`define EX_CSR 3'h3
`define EX_FPU 3'h4
`define EX_GPU 3'h5
`define EX_BITS 3
///////////////////////////////////////////////////////////////////////////////
// 7-bit major opcodes (instruction bits [6:0]).
`define INST_LUI 7'b0110111
`define INST_AUIPC 7'b0010111
`define INST_JAL 7'b1101111
`define INST_JALR 7'b1100111
`define INST_B 7'b1100011 // branch instructions
`define INST_L 7'b0000011 // load instructions
`define INST_S 7'b0100011 // store instructions
`define INST_I 7'b0010011 // immediate instructions
`define INST_R 7'b0110011 // register instructions
`define INST_FENCE 7'b0001111 // Fence instructions
`define INST_SYS 7'b1110011 // system instructions
`define INST_FL 7'b0000111 // float load instruction
`define INST_FS 7'b0100111 // float store instruction
`define INST_FMADD 7'b1000011
`define INST_FMSUB 7'b1000111
`define INST_FNMSUB 7'b1001011
`define INST_FNMADD 7'b1001111
`define INST_FCI 7'b1010011 // float common instructions
// Opcodes used for the GPU-specific instruction groups.
`define INST_GPGPU 7'b1101011
`define INST_GPU 7'b1011011
`define INST_TEX 7'b0101011
///////////////////////////////////////////////////////////////////////////////
// Floating-point rounding mode encodings (INST_FRM_BITS wide).
`define INST_FRM_RNE 3'b000 // round to nearest even
`define INST_FRM_RTZ 3'b001 // round to zero
`define INST_FRM_RDN 3'b010 // round to -inf
`define INST_FRM_RUP 3'b011 // round to +inf
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
`define INST_FRM_DYN 3'b111 // dynamic mode
`define INST_FRM_BITS 3
///////////////////////////////////////////////////////////////////////////////
// Widths of the decoded operation (op_type) and modifier (op_mod) fields.
`define INST_OP_BITS 4
`define INST_MOD_BITS 3
///////////////////////////////////////////////////////////////////////////////
// ALU operation encodings.
`define INST_ALU_ADD 4'b0000
`define INST_ALU_LUI 4'b0010
`define INST_ALU_AUIPC 4'b0011
`define INST_ALU_SLTU 4'b0100
`define INST_ALU_SLT 4'b0101
`define INST_ALU_SRL 4'b1000
`define INST_ALU_SRA 4'b1001
`define INST_ALU_SUB 4'b1011
`define INST_ALU_AND 4'b1100
`define INST_ALU_OR 4'b1101
`define INST_ALU_XOR 4'b1110
`define INST_ALU_SLL 4'b1111
`define INST_ALU_OTHER 4'b0111
`define INST_ALU_BITS 4
// Field accessors on an ALU op code (x is the op) ...
`define INST_ALU_OP(x) x[`INST_ALU_BITS-1:0]
`define INST_ALU_OP_CLASS(x) x[3:2]
`define INST_ALU_SIGNED(x) x[0]
// ... and on the op modifier: bit 0 flags a branch op, bit 1 a multiply op.
`define INST_ALU_IS_BR(x) x[0]
`define INST_ALU_IS_MUL(x) x[1]
// Branch/jump/system operation encodings (used when the modifier flags a branch).
`define INST_BR_EQ 4'b0000
`define INST_BR_NE 4'b0010
`define INST_BR_LTU 4'b0100
`define INST_BR_GEU 4'b0110
`define INST_BR_LT 4'b0101
`define INST_BR_GE 4'b0111
`define INST_BR_JAL 4'b1000
`define INST_BR_JALR 4'b1001
`define INST_BR_ECALL 4'b1010
`define INST_BR_EBREAK 4'b1011
`define INST_BR_URET 4'b1100
`define INST_BR_SRET 4'b1101
`define INST_BR_MRET 4'b1110
`define INST_BR_OTHER 4'b1111
`define INST_BR_BITS 4
// Bit accessors on a branch op code.
`define INST_BR_NEG(x) x[1]
`define INST_BR_LESS(x) x[2]
`define INST_BR_STATIC(x) x[3]
// Multiply/divide operation encodings.
`define INST_MUL_MUL 3'h0
`define INST_MUL_MULH 3'h1
`define INST_MUL_MULHSU 3'h2
`define INST_MUL_MULHU 3'h3
`define INST_MUL_DIV 3'h4
`define INST_MUL_DIVU 3'h5
`define INST_MUL_REM 3'h6
`define INST_MUL_REMU 3'h7
`define INST_MUL_BITS 3
// Bit 2 is set for the DIV/DIVU/REM/REMU encodings above.
`define INST_MUL_IS_DIV(x) x[2]
// Memory access format encodings (low 3 bits of the LSU op codes below).
`define INST_FMT_B 3'b000
`define INST_FMT_H 3'b001
`define INST_FMT_W 3'b010
`define INST_FMT_BU 3'b100
`define INST_FMT_HU 3'b101
// LSU operation encodings: bit 3 is clear for loads and set for stores.
`define INST_LSU_LB 4'b0000
`define INST_LSU_LH 4'b0001
`define INST_LSU_LW 4'b0010
`define INST_LSU_LBU 4'b0100
`define INST_LSU_LHU 4'b0101
`define INST_LSU_SB 4'b1000
`define INST_LSU_SH 4'b1001
`define INST_LSU_SW 4'b1010
`define INST_LSU_BITS 4
// Field accessors on an LSU op code.
`define INST_LSU_FMT(x) x[2:0]
`define INST_LSU_WSIZE(x) x[1:0]
// LSU request kind, tested on the op modifier: 0 = memory, 1 = fence, 2 = prefetch.
`define INST_LSU_IS_MEM(x) (3'h0 == x)
`define INST_LSU_IS_FENCE(x) (3'h1 == x)
`define INST_LSU_IS_PREFETCH(x) (3'h2 == x)
// Fence type encodings.
`define INST_FENCE_BITS 1
`define INST_FENCE_D 1'h0
`define INST_FENCE_I 1'h1
// CSR operation encodings.
`define INST_CSR_RW 2'h1
`define INST_CSR_RS 2'h2
`define INST_CSR_RC 2'h3
`define INST_CSR_OTHER 2'h0
`define INST_CSR_BITS 2
// FPU operation encodings.
`define INST_FPU_ADD 4'h0
`define INST_FPU_SUB 4'h4
`define INST_FPU_MUL 4'h8
`define INST_FPU_DIV 4'hC
`define INST_FPU_CVTWS 4'h1 // FCVT.W.S
`define INST_FPU_CVTWUS 4'h5 // FCVT.WU.S
`define INST_FPU_CVTSW 4'h9 // FCVT.S.W
`define INST_FPU_CVTSWU 4'hD // FCVT.S.WU
`define INST_FPU_SQRT 4'h2
`define INST_FPU_CLASS 4'h6
`define INST_FPU_CMP 4'hA
`define INST_FPU_MISC 4'hE // SGNJ, SGNJN, SGNJX, FMIN, FMAX, MVXW, MVWX
`define INST_FPU_MADD 4'h3
`define INST_FPU_MSUB 4'h7
`define INST_FPU_NMSUB 4'hB
`define INST_FPU_NMADD 4'hF
`define INST_FPU_BITS 4
// GPU unit operation encodings.
`define INST_GPU_TMC 4'h0
`define INST_GPU_WSPAWN 4'h1
`define INST_GPU_SPLIT 4'h2
`define INST_GPU_JOIN 4'h3
`define INST_GPU_BAR 4'h4
`define INST_GPU_PRED 4'h5
`define INST_GPU_TEX 4'h6
`define INST_GPU_BITS 4
///////////////////////////////////////////////////////////////////////////////
// Optional extension bits contributed to ISA_CODE: bit 12 for M and bit 5
// for F, each set only when the corresponding extension is enabled.
`ifdef EXT_M_ENABLE
`define ISA_EXT_M (1 << 12)
`else
`define ISA_EXT_M 0
`endif
`ifdef EXT_F_ENABLE
`define ISA_EXT_F (1 << 5)
`else
`define ISA_EXT_F 0
`endif
// Bitmask of supported extensions, one bit per letter (bit 0 = 'A' ... bit 25 = 'Z').
`define ISA_CODE (0 << 0) // A - Atomic Instructions extension \
| (0 << 1) // B - Tentatively reserved for Bit operations extension \
| (0 << 2) // C - Compressed extension \
| (0 << 3) // D - Double precision floating-point extension \
| (0 << 4) // E - RV32E base ISA \
|`ISA_EXT_F // F - Single precision floating-point extension \
| (0 << 6) // G - Additional standard extensions present \
| (0 << 7) // H - Hypervisor mode implemented \
| (1 << 8) // I - RV32I/64I/128I base ISA \
| (0 << 9) // J - Reserved \
| (0 << 10) // K - Reserved \
| (0 << 11) // L - Tentatively reserved for Bit operations extension \
|`ISA_EXT_M // M - Integer Multiply/Divide extension \
| (0 << 13) // N - User level interrupts supported \
| (0 << 14) // O - Reserved \
| (0 << 15) // P - Tentatively reserved for Packed-SIMD extension \
| (0 << 16) // Q - Quad-precision floating-point extension \
| (0 << 17) // R - Reserved \
| (0 << 18) // S - Supervisor mode implemented \
| (0 << 19) // T - Tentatively reserved for Transactional Memory extension \
| (1 << 20) // U - User mode implemented \
| (0 << 21) // V - Tentatively reserved for Vector extension \
| (0 << 22) // W - Reserved \
| (1 << 23) // X - Non-standard extensions present \
| (0 << 24) // Y - Reserved \
| (0 << 25) // Z - Reserved
///////////////////////////////////////////////////////////////////////////////
// non-cacheable tag bits
`define NC_TAG_BIT 1
// texture tag bits
`define TEX_TAG_BIT 1
// cache address type bits
// (adds SM_ENABLE to the width, so SM_ENABLE is used here as a numeric 0/1 value)
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BIT + `SM_ENABLE)
////////////////////////// Icache Configurable Knobs //////////////////////////
// Cache ID
// (references CORE_ID, so it can only be expanded where a CORE_ID is in scope;
// each core takes three consecutive IDs: +0 icache, +1 dcache, +2 shared memory)
`define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0)
// Word size in bytes
`define ICACHE_WORD_SIZE 4
// Block size in bytes
`define ICACHE_LINE_SIZE `L1_BLOCK_SIZE
// TAG sharing enable
`define ICACHE_CORE_TAG_ID_BITS `NW_BITS
// Core request tag bits
`define ICACHE_CORE_TAG_WIDTH (`UUID_BITS + `ICACHE_CORE_TAG_ID_BITS)
// Memory request data bits
`define ICACHE_MEM_DATA_WIDTH (`ICACHE_LINE_SIZE * 8)
// Memory request address bits
`define ICACHE_MEM_ADDR_WIDTH (32 - `CLOG2(`ICACHE_LINE_SIZE))
// Memory request tag bits
`define ICACHE_MEM_TAG_WIDTH `CLOG2(`ICACHE_MSHR_SIZE)
////////////////////////// Dcache Configurable Knobs //////////////////////////
// Cache ID
`define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1)
// Word size in bytes
`define DCACHE_WORD_SIZE 4
// Block size in bytes
`define DCACHE_LINE_SIZE `L1_BLOCK_SIZE
// Core request tag bits
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
// With the texture extension the LSU tag ID is at least 2 bits wide and the
// dcache core tag carries one extra texture tag bit.
`ifdef EXT_TEX_ENABLE
`define LSU_TAG_ID_BITS `MAX(`LSUQ_ADDR_BITS, 2)
`define LSU_TEX_DCACHE_TAG_BITS (`UUID_BITS + `LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS)
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS + `TEX_TAG_BIT)
`else
`define LSU_TAG_ID_BITS `LSUQ_ADDR_BITS
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS)
`endif
`define DCACHE_CORE_TAG_WIDTH (`UUID_BITS + `DCACHE_CORE_TAG_ID_BITS)
// Memory request data bits
`define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8)
// Memory request address bits
`define DCACHE_MEM_ADDR_WIDTH (32 - `CLOG2(`DCACHE_LINE_SIZE))
// Memory byte enable bits
`define DCACHE_MEM_BYTEEN_WIDTH `DCACHE_LINE_SIZE
// Input request size
`define DCACHE_NUM_REQS `NUM_THREADS
// Memory request tag bits
// (the larger of the cached-path tag and the non-cacheable path tag)
`define _DMEM_ADDR_RATIO_W $clog2(`DCACHE_LINE_SIZE / `DCACHE_WORD_SIZE)
`define _DNC_MEM_TAG_WIDTH ($clog2(`DCACHE_NUM_REQS) + `_DMEM_ADDR_RATIO_W + `DCACHE_CORE_TAG_WIDTH)
`define DCACHE_MEM_TAG_WIDTH `MAX((`CLOG2(`DCACHE_NUM_BANKS) + `CLOG2(`DCACHE_MSHR_SIZE) + `NC_TAG_BIT), `_DNC_MEM_TAG_WIDTH)
// Merged D-cache/I-cache memory tag
`define L1_MEM_TAG_WIDTH (`MAX(`ICACHE_MEM_TAG_WIDTH, `DCACHE_MEM_TAG_WIDTH) + `CLOG2(2))
////////////////////////// SM Configurable Knobs //////////////////////////////
// Cache ID
// (references CORE_ID, so it can only be expanded where a CORE_ID is in scope)
`define SMEM_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2)
// Word size in bytes
`define SMEM_WORD_SIZE 4
// bank address offset
`define SMEM_BANK_ADDR_OFFSET `CLOG2(`STACK_SIZE / `SMEM_WORD_SIZE)
// Input request size
`define SMEM_NUM_REQS `NUM_THREADS
////////////////////////// L2cache Configurable Knobs /////////////////////////
// Cache ID
// (references CLUSTER_ID, so it can only be expanded where a CLUSTER_ID is in scope)
`define L2_CACHE_ID (32'(`L3_ENABLE) + CLUSTER_ID)
// Word size in bytes
`define L2_WORD_SIZE `DCACHE_LINE_SIZE
// Block size in bytes
// (falls back to the L2 word size when L2 is disabled)
`define L2_CACHE_LINE_SIZE ((`L2_ENABLE) ? `MEM_BLOCK_SIZE : `L2_WORD_SIZE)
// Input request tag bits
`define L2_CORE_TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH + `CLOG2(`NUM_CORES))
// Memory request data bits
`define L2_MEM_DATA_WIDTH (`L2_CACHE_LINE_SIZE * 8)
// Memory request address bits
`define L2_MEM_ADDR_WIDTH (32 - `CLOG2(`L2_CACHE_LINE_SIZE))
// Memory byte enable bits
`define L2_MEM_BYTEEN_WIDTH `L2_CACHE_LINE_SIZE
// Input request size
`define L2_NUM_REQS `NUM_CORES
// Memory request tag bits
// (when L2 is disabled the tag is the L1 tag plus the requester select bits)
`define _L2_MEM_ADDR_RATIO_W $clog2(`L2_CACHE_LINE_SIZE / `L2_WORD_SIZE)
`define _L2_NC_MEM_TAG_WIDTH ($clog2(`L2_NUM_REQS) + `_L2_MEM_ADDR_RATIO_W + `L1_MEM_TAG_WIDTH)
`define _L2_MEM_TAG_WIDTH `MAX((`CLOG2(`L2_NUM_BANKS) + `CLOG2(`L2_MSHR_SIZE) + `NC_TAG_BIT), `_L2_NC_MEM_TAG_WIDTH)
`define L2_MEM_TAG_WIDTH ((`L2_ENABLE) ? `_L2_MEM_TAG_WIDTH : (`L1_MEM_TAG_WIDTH + `CLOG2(`L2_NUM_REQS)))
////////////////////////// L3cache Configurable Knobs /////////////////////////
// Cache ID
`define L3_CACHE_ID 0
// Word size in bytes
`define L3_WORD_SIZE `L2_CACHE_LINE_SIZE
// Block size in bytes
// (falls back to the L3 word size when L3 is disabled)
`define L3_CACHE_LINE_SIZE ((`L3_ENABLE) ? `MEM_BLOCK_SIZE : `L3_WORD_SIZE)
// Input request tag bits
`define L3_CORE_TAG_WIDTH (`L2_CORE_TAG_WIDTH + `CLOG2(`NUM_CLUSTERS))
// Memory request data bits
`define L3_MEM_DATA_WIDTH (`L3_CACHE_LINE_SIZE * 8)
// Memory request address bits
`define L3_MEM_ADDR_WIDTH (32 - `CLOG2(`L3_CACHE_LINE_SIZE))
// Memory byte enable bits
`define L3_MEM_BYTEEN_WIDTH `L3_CACHE_LINE_SIZE
// Input request size
`define L3_NUM_REQS `NUM_CLUSTERS
// Memory request tag bits
// (when L3 is disabled the tag is the L2 tag plus the requester select bits)
`define _L3_MEM_ADDR_RATIO_W $clog2(`L3_CACHE_LINE_SIZE / `L3_WORD_SIZE)
`define _L3_NC_MEM_TAG_WIDTH ($clog2(`L3_NUM_REQS) + `_L3_MEM_ADDR_RATIO_W + `L2_MEM_TAG_WIDTH)
`define _L3_MEM_TAG_WIDTH `MAX((`CLOG2(`L3_NUM_BANKS) + `CLOG2(`L3_MSHR_SIZE) + `NC_TAG_BIT), `_L3_NC_MEM_TAG_WIDTH)
`define L3_MEM_TAG_WIDTH ((`L3_ENABLE) ? `_L3_MEM_TAG_WIDTH : (`L2_MEM_TAG_WIDTH + `CLOG2(`L3_NUM_REQS)))
///////////////////////////////////////////////////////////////////////////////
// Top-level memory interface widths: aliases of the L3-level values.
`define VX_MEM_BYTEEN_WIDTH `L3_MEM_BYTEEN_WIDTH
`define VX_MEM_ADDR_WIDTH `L3_MEM_ADDR_WIDTH
`define VX_MEM_DATA_WIDTH `L3_MEM_DATA_WIDTH
`define VX_MEM_TAG_WIDTH `L3_MEM_TAG_WIDTH
`define VX_CORE_TAG_WIDTH `L3_CORE_TAG_WIDTH
`define VX_CSR_ID_WIDTH `LOG2UP(`NUM_CLUSTERS * `NUM_CORES)
// Pads x with zeros on the right up to 32 bits.
`define TO_FULL_ADDR(x) {x, (32-$bits(x))'(0)}
///////////////////////////////////////////////////////////////////////////////
`include "VX_fpu_types.vh"
`include "VX_gpu_types.vh"
`endif
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Include guard for the shared core definitions header.
`ifndef VX_DEFINE_VH
`define VX_DEFINE_VH
`include "VX_platform.vh"
`include "VX_config.vh"
`include "VX_types.vh"
///////////////////////////////////////////////////////////////////////////////
// Index sizes: each *_BITS macro is CLOG2 of a count and each *_WIDTH macro
// wraps the matching *_BITS in UP(). Some *_WIDTH macros appear before their
// *_BITS macro; this is fine because macro bodies are expanded at the use site.
// (CLOG2 and UP are not defined in this file; presumably VX_platform.vh.)
`define NW_BITS `CLOG2(`NUM_WARPS)
`define NC_WIDTH `UP(`NC_BITS)
`define NT_BITS `CLOG2(`NUM_THREADS)
`define NW_WIDTH `UP(`NW_BITS)
`define NC_BITS `CLOG2(`NUM_CORES)
`define NT_WIDTH `UP(`NT_BITS)
`define NB_BITS `CLOG2(`NUM_BARRIERS)
`define NB_WIDTH `UP(`NB_BITS)
// Integer register file size and its index width.
`define NUM_IREGS 32
`define NRI_BITS `CLOG2(`NUM_IREGS)
// Total register count: doubled when the F extension is enabled.
`ifdef EXT_F_ENABLE
`define NUM_REGS (2 * `NUM_IREGS)
`else
`define NUM_REGS `NUM_IREGS
`endif
`define NR_BITS `CLOG2(`NUM_REGS)
`define PERF_CTR_BITS 44
// Instruction UUID width: 44 bits unless NDEBUG is defined, in which case 1 bit.
`ifndef NDEBUG
`define UUID_WIDTH 44
`else
`define UUID_WIDTH 1
`endif
///////////////////////////////////////////////////////////////////////////////
// Execution unit indices. The FPU is last, so NUM_EX_UNITS is 3 plus
// EXT_F_ENABLED (used here as a numeric value; defined outside this file).
`define EX_ALU 0
`define EX_LSU 1
`define EX_SFU 2
`define EX_FPU 3
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
///////////////////////////////////////////////////////////////////////////////
// 7-bit major opcodes (instruction bits [6:0]).
`define INST_LUI 7'b0110111
`define INST_AUIPC 7'b0010111
`define INST_JAL 7'b1101111
`define INST_JALR 7'b1100111
`define INST_B 7'b1100011 // branch instructions
`define INST_L 7'b0000011 // load instructions
`define INST_S 7'b0100011 // store instructions
`define INST_I 7'b0010011 // immediate instructions
`define INST_R 7'b0110011 // register instructions
`define INST_FENCE 7'b0001111 // Fence instructions
`define INST_SYS 7'b1110011 // system instructions
// RV64I instruction specific opcodes (for any W instruction)
`define INST_I_W 7'b0011011 // W type immediate instructions
`define INST_R_W 7'b0111011 // W type register instructions
`define INST_FL 7'b0000111 // float load instruction
`define INST_FS 7'b0100111 // float store instruction
`define INST_FMADD 7'b1000011
`define INST_FMSUB 7'b1000111
`define INST_FNMSUB 7'b1001011
`define INST_FNMADD 7'b1001111
`define INST_FCI 7'b1010011 // float common instructions
// Custom extension opcodes
`define INST_EXT1 7'b0001011 // 0x0B
`define INST_EXT2 7'b0101011 // 0x2B
`define INST_EXT3 7'b1011011 // 0x5B
`define INST_EXT4 7'b1111011 // 0x7B
///////////////////////////////////////////////////////////////////////////////
// Floating-point rounding mode encodings (INST_FRM_BITS wide).
`define INST_FRM_RNE 3'b000 // round to nearest even
`define INST_FRM_RTZ 3'b001 // round to zero
`define INST_FRM_RDN 3'b010 // round to -inf
`define INST_FRM_RUP 3'b011 // round to +inf
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
`define INST_FRM_DYN 3'b111 // dynamic mode
`define INST_FRM_BITS 3
///////////////////////////////////////////////////////////////////////////////
// Widths of the decoded operation, modifier and format fields.
`define INST_OP_BITS 4
`define INST_MOD_BITS 3
`define INST_FMT_BITS 2
///////////////////////////////////////////////////////////////////////////////
// ALU operation encodings.
`define INST_ALU_ADD 4'b0000
`define INST_ALU_LUI 4'b0010
`define INST_ALU_AUIPC 4'b0011
`define INST_ALU_SLTU 4'b0100
`define INST_ALU_SLT 4'b0101
`define INST_ALU_SUB 4'b0111
`define INST_ALU_SRL 4'b1000
`define INST_ALU_SRA 4'b1001
`define INST_ALU_AND 4'b1100
`define INST_ALU_OR 4'b1101
`define INST_ALU_XOR 4'b1110
`define INST_ALU_SLL 4'b1111
// NOTE(review): INST_ALU_OTHER has the same value as INST_ALU_SUB (4'b0111) - confirm this is intended.
`define INST_ALU_OTHER 4'b0111
`define INST_ALU_BITS 4
// Bit accessors on an ALU op code ...
`define INST_ALU_CLASS(op) op[3:2]
`define INST_ALU_SIGNED(op) op[0]
`define INST_ALU_IS_SUB(op) op[1]
// ... and on the op modifier: bit 0 = branch, bit 1 = M-extension op, bit 2 = W (32-bit) variant.
`define INST_ALU_IS_BR(mod) mod[0]
`define INST_ALU_IS_M(mod) mod[1]
`define INST_ALU_IS_W(mod) mod[2]
// Branch/jump/system operation encodings.
`define INST_BR_EQ 4'b0000
`define INST_BR_NE 4'b0010
`define INST_BR_LTU 4'b0100
`define INST_BR_GEU 4'b0110
`define INST_BR_LT 4'b0101
`define INST_BR_GE 4'b0111
`define INST_BR_JAL 4'b1000
`define INST_BR_JALR 4'b1001
`define INST_BR_ECALL 4'b1010
`define INST_BR_EBREAK 4'b1011
`define INST_BR_URET 4'b1100
`define INST_BR_SRET 4'b1101
`define INST_BR_MRET 4'b1110
`define INST_BR_OTHER 4'b1111
`define INST_BR_BITS 4
// Bit accessors on a branch op code.
`define INST_BR_CLASS(op) {1'b0, ~op[3]}
`define INST_BR_IS_NEG(op) op[1]
`define INST_BR_IS_LESS(op) op[2]
`define INST_BR_IS_STATIC(op) op[3]
// M-extension (multiply/divide) operation encodings.
`define INST_M_MUL 3'b000
`define INST_M_MULHU 3'b001
`define INST_M_MULH 3'b010
`define INST_M_MULHSU 3'b011
`define INST_M_DIV 3'b100
`define INST_M_DIVU 3'b101
`define INST_M_REM 3'b110
`define INST_M_REMU 3'b111
`define INST_M_BITS 3
// Predicates over the M op encodings listed above.
`define INST_M_SIGNED(op) (~op[0])
`define INST_M_IS_MULX(op) (~op[2])
`define INST_M_IS_MULH(op) (op[1:0] != 0)
`define INST_M_SIGNED_A(op) (op[1:0] != 1)
`define INST_M_IS_REM(op) op[1]
// Memory access format encodings (low 3 bits of the LSU op codes below).
`define INST_FMT_B 3'b000
`define INST_FMT_H 3'b001
`define INST_FMT_W 3'b010
`define INST_FMT_D 3'b011
`define INST_FMT_BU 3'b100
`define INST_FMT_HU 3'b101
`define INST_FMT_WU 3'b110
// LSU operation encodings: bit 3 is clear for loads and set for stores.
`define INST_LSU_LB 4'b0000
`define INST_LSU_LH 4'b0001
`define INST_LSU_LW 4'b0010
`define INST_LSU_LD 4'b0011 // new for RV64I LD
`define INST_LSU_LBU 4'b0100
`define INST_LSU_LHU 4'b0101
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU
`define INST_LSU_SB 4'b1000
`define INST_LSU_SH 4'b1001
`define INST_LSU_SW 4'b1010
`define INST_LSU_SD 4'b1011 // new for RV64I SD
`define INST_LSU_FENCE 4'b1111
`define INST_LSU_BITS 4
// Field accessors on an LSU op code; a fence has both top bits set.
`define INST_LSU_FMT(op) op[2:0]
`define INST_LSU_WSIZE(op) op[1:0]
`define INST_LSU_IS_FENCE(op) (op[3:2] == 3)
// Fence type encodings.
`define INST_FENCE_BITS 1
`define INST_FENCE_D 1'h0
`define INST_FENCE_I 1'h1
// FPU operation encodings.
`define INST_FPU_ADD 4'b0000
`define INST_FPU_SUB 4'b0001
`define INST_FPU_MUL 4'b0010
`define INST_FPU_DIV 4'b0011
`define INST_FPU_SQRT 4'b0100
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
`define INST_FPU_F2F 4'b0110
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_F2I 4'b1000
`define INST_FPU_F2U 4'b1001
`define INST_FPU_I2F 4'b1010
`define INST_FPU_U2F 4'b1011
`define INST_FPU_MADD 4'b1100
`define INST_FPU_MSUB 4'b1101
`define INST_FPU_NMSUB 4'b1110
`define INST_FPU_NMADD 4'b1111
`define INST_FPU_BITS 4
// NOTE(review): INST_FPU_IS_W reads mod[4] while INST_MOD_BITS above is 3 - confirm the width of the operand passed here.
`define INST_FPU_IS_W(mod) (mod[4])
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3)
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
// SFU operation encodings.
`define INST_SFU_TMC 4'h0
`define INST_SFU_WSPAWN 4'h1
`define INST_SFU_SPLIT 4'h2
`define INST_SFU_JOIN 4'h3
`define INST_SFU_BAR 4'h4
`define INST_SFU_PRED 4'h5
`define INST_SFU_CSRRW 4'h6
`define INST_SFU_CSRRS 4'h7
`define INST_SFU_CSRRC 4'h8
`define INST_SFU_TEX 4'h9
`define INST_SFU_RASTER 4'hA
`define INST_SFU_ROP 4'hB
`define INST_SFU_CMOV 4'hC
`define INST_SFU_BITS 4
// INST_SFU_CSR maps f3 values 1..3 onto CSRRW/CSRRS/CSRRC (6..8).
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
// Op ranges: 0..5 are the warp-control ops above, 6..8 are the CSR ops.
`define INST_SFU_IS_WCTL(op) (op <= 5)
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
///////////////////////////////////////////////////////////////////////////////
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
///////////////////////////////////////////////////////////////////////////////
// non-cacheable tag bits
`define NC_TAG_BITS 1
// cache address type bits
`ifdef SM_ENABLE
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
`else
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
`endif
// Extra tag bits needed when I inputs are arbitrated onto O outputs:
// clog2(ceil(I / O)) when I > O, otherwise 0.
// All macro arguments that take part in arithmetic are parenthesized so that
// expression arguments (e.g. "A + B") are evaluated as a unit.
`define ARB_SEL_BITS(I, O) (((I) > (O)) ? `CLOG2(((I) + (O) - 1) / (O)) : 0)
///////////////////////////////////////////////////////////////////////////////
// Memory-side tag width of a cache: clog2(mshr_size) + clog2(num_banks) + NC_TAG_BITS.
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
    (`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
// Tag width on the bypass path, without the non-cacheable bit(s):
// request select + word-in-line select + the core tag itself.
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
    (`CLOG2(num_reqs) + `CLOG2((line_size) / (word_size)) + (tag_width))
// Bypass tag width including the non-cacheable bit(s).
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
    (`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
// Memory tag width of a cache that also has a bypass path: the wider of the two tags.
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
    `MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
///////////////////////////////////////////////////////////////////////////////
// Core-side tag width after arbitrating num_inputs requesters onto `UP(num_caches) caches.
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \
    ((tag_width) + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
// Memory-side tag width after arbitrating `UP(num_caches) caches onto a single port.
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
    ((tag_width) + `ARB_SEL_BITS(`UP(num_caches), 1))
// Cluster variants of the single-cache widths above: the same formulas with the
// core-side and memory-side arbitration bits folded in.
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
    `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
    `CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2((line_size) / (word_size)) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
    `CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
    `CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
///////////////////////////////////////////////////////////////////////////////
// L2 line size: the memory block size when L2_ENABLE is defined, otherwise the L1 line size.
`ifdef L2_ENABLE
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L2_LINE_SIZE `L1_LINE_SIZE
`endif
// L3 line size: the memory block size when L3_ENABLE is defined, otherwise the L2 line size.
`ifdef L3_ENABLE
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L3_LINE_SIZE `L2_LINE_SIZE
`endif
// Dimensions of the VX_MEM_* interface, all derived from the L3 line size (in bytes).
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
// Address width after removing the clog2(L3_LINE_SIZE) in-line offset bits.
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
// NOTE: L3_MEM_TAG_WIDTH is written without a backtick, i.e. it expands to a plain identifier
// that must be visible where the macro is used (VX_gpu_pkg declares a localparam of that name).
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH
`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS
`define VX_DCR_DATA_WIDTH 32
// Extends x to MEM_ADDR_WIDTH bits by appending (MEM_ADDR_WIDTH - $bits(x)) zero low-order bits.
`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)}
///////////////////////////////////////////////////////////////////////////////
// Drives dst from src, optionally through one register stage.
//   enable != 0 : dst follows src one clock later; the register is cleared while `reset` is high.
//   enable == 0 : dst is a plain combinational copy of src.
// The expansion declares a signal named `__busy` and uses `clk` / `reset` from the enclosing
// scope, so it can be used at most once per scope. The last statement has no trailing ';' --
// the caller supplies it.
`define BUFFER_BUSY(dst, src, enable) \
logic __busy; \
if (enable) begin \
always @(posedge clk) begin \
if (reset) begin \
__busy <= 1'b0; \
end else begin \
__busy <= src; \
end \
end \
end else begin \
assign __busy = src; \
end \
assign dst = __busy
// Instantiates VX_popcount with N = $bits(in) and the given MODEL, connecting `in` to data_in
// and `out` to data_out. The instance is named __<out>, so `out` must be a simple identifier
// and each output can be driven by only one expansion per scope.
`define POP_COUNT_EX(out, in, model) \
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __``out ( \
.data_in (in), \
.data_out (out) \
)
// POP_COUNT_EX with MODEL fixed to 1.
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
// Connects two memory bus interfaces whose payloads have identical layout:
// request valid/data flow src -> dst, request ready flows dst -> src,
// response valid/data flow dst -> src, response ready flows src -> dst.
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data = dst.rsp_data; \
assign dst.rsp_ready = src.rsp_ready
// Same connection for interfaces whose tag widths differ (TD = dst tag width, TS = src tag width).
// Request: the source tag is placed in the upper bits of the destination tag and the
// remaining (TD-TS) low bits are zero. Response: the upper TS bits of the destination tag
// are returned to the source.
// NOTE(review): the padding replication and the [TD-1 -: TS] slice only make sense for
// TD >= TS; nothing here checks that -- confirm at the call sites.
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data.rw = src.req_data.rw; \
assign dst.req_data.byteen = src.req_data.byteen; \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.data = src.req_data.data; \
if (TD != TS) \
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
else \
assign dst.req_data.tag = src.req_data.tag; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data.data = dst.rsp_data.data; \
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
assign dst.rsp_ready = src.rsp_ready
// Declares a new VX_dcr_bus_if instance named `dst` and drives its write_valid / write_addr /
// write_data from `src`, either through one register stage (enable != 0) or directly.
// The staging register is clocked by `clk` from the enclosing scope and has no reset term,
// so write_valid is not forced low by reset.
// NOTE(review): confirm that an unreset write_valid is acceptable to the consumers.
// The helper signal is named __<dst>, so `dst` must be a simple identifier.
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
if (enable) begin \
always @(posedge clk) begin \
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
end \
end else begin \
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
end \
VX_dcr_bus_if dst(); \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
// Adds up one counter field across an array of `count` sources and registers the sum.
//   dst   : destination whose member `field` receives the registered total
//   src   : indexable collection of `count` sources, each exposing `field`
//   width : bit width of each counter and of the total (VX_reduce DATAW_IN)
// The sum is produced by VX_reduce with OP "+", captured on the rising edge of `clk`, and
// cleared while `reset` is high; dst.field therefore lags the inputs by one cycle.
// Helper names are built from dst/src/field by token pasting, so those arguments must be
// simple identifiers.
`define PERF_REDUCE(dst, src, field, width, count) \
wire [count-1:0][width-1:0] __reduce_add_i_``src``field; \
wire [width-1:0] __reduce_add_o_``dst``field; \
reg [width-1:0] __reduce_add_r_``dst``field; \
for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_``src``field[__i] = ``src[__i].``field; \
end \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``dst``field ( \
__reduce_add_i_``src``field, \
__reduce_add_o_``dst``field \
); \
always @(posedge clk) begin \
if (reset) begin \
__reduce_add_r_``dst``field <= '0; \
end else begin \
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
end \
end \
assign ``dst.``field = __reduce_add_r_``dst``field
// Applies PERF_REDUCE to each of the eight cache counters below, all PERF_CTR_BITS wide.
`define PERF_CACHE_ADD(dst, src, count) \
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, write_misses, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, bank_stalls, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, mshr_stalls, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, mem_stalls, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, crsp_stalls, `PERF_CTR_BITS, count)
// Builds a warp id in dst from src and a block index, depending on block_size:
//   block_size == 1          : dst = src
//   block_size == NUM_WARPS  : dst = block_idx, cast to NW_WIDTH bits
//   otherwise                : dst = {upper bits of src, block_idx in the low CLOG2(block_size) bits}
// Expands to a generate-if, so it must be used at module/generate scope.
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
if (block_size != 1) begin \
if (block_size != `NUM_WARPS) begin \
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
end else begin \
assign dst = `NW_WIDTH'(block_idx); \
end \
end else begin \
assign dst = src; \
end
// Concatenates the fields of `data` into one vector, inserting `tid` between rd and rs1_data.
// The field order defines the bit layout, so any unpacking side must use the same order.
`define TO_DISPATCH_DATA(data, tid) \
{data.uuid, data.wis, data.tmask, data.op_type, data.op_mod, data.wb, data.use_PC, data.use_imm, data.PC, data.imm, data.rd, tid, data.rs1_data, data.rs2_data, data.rs3_data}
///////////////////////////////////////////////////////////////////////////////
`endif // VX_DEFINE_VH

View File

@@ -1,159 +0,0 @@
`include "VX_define.vh"
// Dispatch stage.
//
// Takes the instruction presented on ibuffer_if together with the operands on
// gpr_rsp_if and forwards it to the request interface of the execution unit
// selected by ibuffer_if.ex_type. Every unit is fed through its own
// VX_skid_buffer (OUT_REG = 1); ibuffer_if.ready reflects the ready_in of the
// buffer belonging to the selected unit.
module VX_dispatch (
    input wire              clk,
    input wire              reset,

    // inputs
    VX_ibuffer_if.slave     ibuffer_if,
    VX_gpr_rsp_if.slave     gpr_rsp_if,

    // outputs
    VX_alu_req_if.master    alu_req_if,
    VX_lsu_req_if.master    lsu_req_if,
    VX_csr_req_if.master    csr_req_if,
`ifdef EXT_F_ENABLE
    VX_fpu_req_if.master    fpu_req_if,
`endif
    VX_gpu_req_if.master    gpu_req_if
);
    // Payload widths of the per-unit buffers; each sum lists the fields in the
    // same order as the matching data_in / data_out concatenation.
    localparam ALU_DATAW = `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32);
    localparam LSU_DATAW = `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1;
    localparam CSR_DATAW = `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32;
`ifdef EXT_F_ENABLE
    localparam FPU_DATAW = `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32);
`endif
    localparam GPU_DATAW = `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32);

    // Thread index derived from the thread mask by VX_lzc.
    wire [`NT_BITS-1:0] tid;

    VX_lzc #(
        .N (`NUM_THREADS)
    ) tid_select (
        .in_i  (ibuffer_if.tmask),
        .cnt_o (tid),
        `UNUSED_PIN (valid_o)
    );

    // Address of the instruction following the current one.
    wire [31:0] next_PC;
    assign next_PC = ibuffer_if.PC + 4;

    ///////////////////////////////////////////////////////////////////////////
    // ALU unit

    wire alu_valid_in;
    wire alu_ready_in;
    wire [`INST_ALU_BITS-1:0] alu_op_type;

    assign alu_valid_in = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_ALU);
    assign alu_op_type  = `INST_ALU_BITS'(ibuffer_if.op_type);

    VX_skid_buffer #(
        .DATAW   (ALU_DATAW),
        .OUT_REG (1)
    ) alu_buffer (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (alu_valid_in),
        .ready_in  (alu_ready_in),
        .data_in   ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
        .data_out  ({alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}),
        .valid_out (alu_req_if.valid),
        .ready_out (alu_req_if.ready)
    );

    ///////////////////////////////////////////////////////////////////////////
    // LSU unit

    wire lsu_valid_in;
    wire lsu_ready_in;
    wire [`INST_LSU_BITS-1:0] lsu_op_type;
    wire lsu_is_fence;
    wire lsu_is_prefetch;

    assign lsu_valid_in    = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU);
    assign lsu_op_type     = `INST_LSU_BITS'(ibuffer_if.op_type);
    assign lsu_is_fence    = `INST_LSU_IS_FENCE(ibuffer_if.op_mod);
    assign lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod);

    // imm becomes the address offset, rs1 the base address, rs2 the store data
    VX_skid_buffer #(
        .DATAW   (LSU_DATAW),
        .OUT_REG (1)
    ) lsu_buffer (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (lsu_valid_in),
        .ready_in  (lsu_ready_in),
        .data_in   ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}),
        .data_out  ({lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}),
        .valid_out (lsu_req_if.valid),
        .ready_out (lsu_req_if.ready)
    );

    ///////////////////////////////////////////////////////////////////////////
    // CSR unit

    wire csr_valid_in;
    wire csr_ready_in;
    wire [`INST_CSR_BITS-1:0] csr_op_type;
    wire [`CSR_ADDR_BITS-1:0] csr_addr;
    wire [`NRI_BITS-1:0]      csr_imm;
    wire [31:0]               csr_rs1_data;

    assign csr_valid_in = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_CSR);
    assign csr_op_type  = `INST_CSR_BITS'(ibuffer_if.op_type);
    // the immediate carries the CSR address in its low bits, the register immediate above it
    assign csr_addr     = ibuffer_if.imm[`CSR_ADDR_BITS-1:0];
    assign csr_imm      = ibuffer_if.imm[`CSR_ADDR_BITS +: `NRI_BITS];
    // only the rs1 value of the selected thread is forwarded
    assign csr_rs1_data = gpr_rsp_if.rs1_data[tid];

    VX_skid_buffer #(
        .DATAW   (CSR_DATAW),
        .OUT_REG (1)
    ) csr_buffer (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (csr_valid_in),
        .ready_in  (csr_ready_in),
        .data_in   ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}),
        .data_out  ({csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}),
        .valid_out (csr_req_if.valid),
        .ready_out (csr_req_if.ready)
    );

    ///////////////////////////////////////////////////////////////////////////
    // FPU unit

`ifdef EXT_F_ENABLE
    wire fpu_valid_in;
    wire fpu_ready_in;
    wire [`INST_FPU_BITS-1:0] fpu_op_type;

    assign fpu_valid_in = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_FPU);
    assign fpu_op_type  = `INST_FPU_BITS'(ibuffer_if.op_type);

    VX_skid_buffer #(
        .DATAW   (FPU_DATAW),
        .OUT_REG (1)
    ) fpu_buffer (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (fpu_valid_in),
        .ready_in  (fpu_ready_in),
        .data_in   ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
        .data_out  ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
        .valid_out (fpu_req_if.valid),
        .ready_out (fpu_req_if.ready)
    );
`else
    `UNUSED_VAR (gpr_rsp_if.rs3_data)
`endif

    ///////////////////////////////////////////////////////////////////////////
    // GPU unit

    wire gpu_valid_in;
    wire gpu_ready_in;
    wire [`INST_GPU_BITS-1:0] gpu_op_type;

    assign gpu_valid_in = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
    assign gpu_op_type  = `INST_GPU_BITS'(ibuffer_if.op_type);

    VX_skid_buffer #(
        .DATAW   (GPU_DATAW),
        .OUT_REG (1)
    ) gpu_buffer (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (gpu_valid_in),
        .ready_in  (gpu_ready_in),
        .data_in   ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
        .data_out  ({gpu_req_if.uuid, gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}),
        .valid_out (gpu_req_if.valid),
        .ready_out (gpu_req_if.ready)
    );

    ///////////////////////////////////////////////////////////////////////////
    // Back-pressure towards the instruction buffer: report the readiness of
    // the buffer the current instruction is headed for.

    reg sel_ready;

    always @(*) begin
        case (ibuffer_if.ex_type)
        `EX_ALU: sel_ready = alu_ready_in;
        `EX_LSU: sel_ready = lsu_ready_in;
        `EX_CSR: sel_ready = csr_ready_in;
    `ifdef EXT_F_ENABLE
        `EX_FPU: sel_ready = fpu_ready_in;
    `endif
        `EX_GPU: sel_ready = gpu_ready_in;
        default: sel_ready = 1'b1; // anything else (NOPs) is consumed immediately
        endcase
    end

    assign ibuffer_if.ready = sel_ready;

endmodule

View File

@@ -1,237 +0,0 @@
`include "VX_define.vh"
// Execute stage.
//
// Instantiates the ALU, LSU, CSR, GPU and (with EXT_F_ENABLE) FPU units and
// wires each one between its request interface and its commit interface.
// With EXT_TEX_ENABLE, the LSU and the texture path of the GPU unit share the
// data cache port through a two-input VX_cache_arb.
module VX_execute #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_VX_execute

    input wire clk,
    input wire reset,

    // Dcache interface
    VX_dcache_req_if.master dcache_req_if,
    VX_dcache_rsp_if.slave  dcache_rsp_if,

    // commit interface
    VX_cmt_to_csr_if.slave  cmt_to_csr_if,

    // fetch interface
    VX_fetch_to_csr_if.slave fetch_to_csr_if,

`ifdef PERF_ENABLE
    VX_perf_memsys_if.slave perf_memsys_if,
    VX_perf_pipeline_if.slave perf_pipeline_if,
`endif

    // inputs
    VX_alu_req_if.slave     alu_req_if,
    VX_lsu_req_if.slave     lsu_req_if,
    VX_csr_req_if.slave     csr_req_if,
`ifdef EXT_F_ENABLE
    VX_fpu_req_if.slave     fpu_req_if,
`endif
    VX_gpu_req_if.slave     gpu_req_if,

    // outputs
    VX_branch_ctl_if.master branch_ctl_if,
    VX_warp_ctl_if.master   warp_ctl_if,
    VX_commit_if.master     alu_commit_if,
    VX_commit_if.master     ld_commit_if,
    VX_commit_if.master     st_commit_if,
    VX_commit_if.master     csr_commit_if,
`ifdef EXT_F_ENABLE
    VX_commit_if.master     fpu_commit_if,
`endif
    VX_commit_if.master     gpu_commit_if,

    input wire              busy
);

`ifdef EXT_TEX_ENABLE

    // Private dcache request/response pairs for the LSU and the texture unit;
    // all four share the same geometry.
    VX_dcache_req_if #(
        .NUM_REQS  (`NUM_THREADS),
        .WORD_SIZE (4),
        .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
    ) lsu_dcache_req_if();

    VX_dcache_rsp_if #(
        .NUM_REQS  (`NUM_THREADS),
        .WORD_SIZE (4),
        .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
    ) lsu_dcache_rsp_if();

    VX_dcache_req_if #(
        .NUM_REQS  (`NUM_THREADS),
        .WORD_SIZE (4),
        .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
    ) tex_dcache_req_if();

    VX_dcache_rsp_if #(
        .NUM_REQS  (`NUM_THREADS),
        .WORD_SIZE (4),
        .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
    ) tex_dcache_rsp_if();

    VX_tex_csr_if tex_csr_if();

`ifdef PERF_ENABLE
    VX_perf_tex_if perf_tex_if();
`endif

    // Arbiter in front of the dcache port: index 0 = LSU, index 1 = texture unit.
    VX_cache_arb #(
        .NUM_REQS     (2),
        .LANES        (`NUM_THREADS),
        .DATA_SIZE    (4),
        .TAG_IN_WIDTH (`LSU_TEX_DCACHE_TAG_BITS),
        .TAG_SEL_IDX  (`NC_TAG_BIT + `SM_ENABLE)
    ) tex_lsu_arb (
        .clk            (clk),
        .reset          (reset),

        // Tex/LSU request
        .req_valid_in   ({tex_dcache_req_if.valid,  lsu_dcache_req_if.valid}),
        .req_rw_in      ({tex_dcache_req_if.rw,     lsu_dcache_req_if.rw}),
        .req_byteen_in  ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}),
        .req_addr_in    ({tex_dcache_req_if.addr,   lsu_dcache_req_if.addr}),
        .req_data_in    ({tex_dcache_req_if.data,   lsu_dcache_req_if.data}),
        .req_tag_in     ({tex_dcache_req_if.tag,    lsu_dcache_req_if.tag}),
        .req_ready_in   ({tex_dcache_req_if.ready,  lsu_dcache_req_if.ready}),

        // Dcache request
        .req_valid_out  (dcache_req_if.valid),
        .req_rw_out     (dcache_req_if.rw),
        .req_byteen_out (dcache_req_if.byteen),
        .req_addr_out   (dcache_req_if.addr),
        .req_data_out   (dcache_req_if.data),
        .req_tag_out    (dcache_req_if.tag),
        .req_ready_out  (dcache_req_if.ready),

        // Dcache response
        .rsp_valid_in   (dcache_rsp_if.valid),
        .rsp_tmask_in   (dcache_rsp_if.tmask),
        .rsp_tag_in     (dcache_rsp_if.tag),
        .rsp_data_in    (dcache_rsp_if.data),
        .rsp_ready_in   (dcache_rsp_if.ready),

        // Tex/LSU response
        .rsp_valid_out  ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}),
        .rsp_tmask_out  ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}),
        .rsp_data_out   ({tex_dcache_rsp_if.data,  lsu_dcache_rsp_if.data}),
        .rsp_tag_out    ({tex_dcache_rsp_if.tag,   lsu_dcache_rsp_if.tag}),
        .rsp_ready_out  ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready})
    );

`endif

`ifdef EXT_F_ENABLE
    // Per-warp "operation in flight" flags exchanged between the CSR and FPU units.
    wire [`NUM_WARPS-1:0] csr_pending;
    wire [`NUM_WARPS-1:0] fpu_pending;
    VX_fpu_to_csr_if fpu_to_csr_if();
`endif

    // one relayed reset per unit
    `RESET_RELAY (alu_reset);
    `RESET_RELAY (lsu_reset);
    `RESET_RELAY (csr_reset);
    `RESET_RELAY (gpu_reset);

    VX_alu_unit #(
        .CORE_ID (CORE_ID)
    ) alu_unit (
        .clk            (clk),
        .reset          (alu_reset),
        .alu_req_if     (alu_req_if),
        .branch_ctl_if  (branch_ctl_if),
        .alu_commit_if  (alu_commit_if)
    );

    VX_lsu_unit #(
        .CORE_ID (CORE_ID)
    ) lsu_unit (
        `SCOPE_BIND_VX_execute_lsu_unit
        .clk            (clk),
        .reset          (lsu_reset),
    `ifdef EXT_TEX_ENABLE
        // goes through the Tex/LSU arbiter
        .dcache_req_if  (lsu_dcache_req_if),
        .dcache_rsp_if  (lsu_dcache_rsp_if),
    `else
        // owns the dcache port directly
        .dcache_req_if  (dcache_req_if),
        .dcache_rsp_if  (dcache_rsp_if),
    `endif
        .lsu_req_if     (lsu_req_if),
        .ld_commit_if   (ld_commit_if),
        .st_commit_if   (st_commit_if)
    );

    VX_csr_unit #(
        .CORE_ID (CORE_ID)
    ) csr_unit (
        .clk            (clk),
        .reset          (csr_reset),
    `ifdef PERF_ENABLE
    `ifdef EXT_TEX_ENABLE
        .perf_tex_if    (perf_tex_if),
    `endif
        .perf_memsys_if (perf_memsys_if),
        .perf_pipeline_if(perf_pipeline_if),
    `endif
        .cmt_to_csr_if  (cmt_to_csr_if),
        .fetch_to_csr_if(fetch_to_csr_if),
        .csr_req_if     (csr_req_if),
        .csr_commit_if  (csr_commit_if),
    `ifdef EXT_F_ENABLE
        .fpu_to_csr_if  (fpu_to_csr_if),
        .fpu_pending    (fpu_pending),
        .pending        (csr_pending),
    `else
        `UNUSED_PIN (pending),
    `endif
    `ifdef EXT_TEX_ENABLE
        .tex_csr_if     (tex_csr_if),
    `endif
        .busy           (busy)
    );

`ifdef EXT_F_ENABLE
    `RESET_RELAY (fpu_reset);

    VX_fpu_unit #(
        .CORE_ID (CORE_ID)
    ) fpu_unit (
        .clk            (clk),
        .reset          (fpu_reset),
        .fpu_req_if     (fpu_req_if),
        .fpu_to_csr_if  (fpu_to_csr_if),
        .fpu_commit_if  (fpu_commit_if),
        .csr_pending    (csr_pending),
        .pending        (fpu_pending)
    );
`endif

    VX_gpu_unit #(
        .CORE_ID (CORE_ID)
    ) gpu_unit (
        `SCOPE_BIND_VX_execute_gpu_unit
        .clk            (clk),
        .reset          (gpu_reset),
        .gpu_req_if     (gpu_req_if),
    `ifdef EXT_TEX_ENABLE
    `ifdef PERF_ENABLE
        .perf_tex_if    (perf_tex_if),
    `endif
        .tex_csr_if     (tex_csr_if),
        .dcache_req_if  (tex_dcache_req_if),
        .dcache_rsp_if  (tex_dcache_rsp_if),
    `endif
        .warp_ctl_if    (warp_ctl_if),
        .gpu_commit_if  (gpu_commit_if)
    );

    // Workaround used to obtain the RISC-V tests' pass/fail status: `ebreak`
    // is exported to the simulator and pulses when the ALU accepts a branch-class
    // request whose opcode is EBREAK or ECALL.
    wire ebreak /* verilator public */;

    wire [`INST_BR_BITS-1:0] alu_br_op;
    assign alu_br_op = `INST_BR_BITS'(alu_req_if.op_type);

    assign ebreak = alu_req_if.valid && alu_req_if.ready
                 && `INST_ALU_IS_BR(alu_req_if.op_mod)
                 && (alu_br_op == `INST_BR_EBREAK
                  || alu_br_op == `INST_BR_ECALL);

endmodule

View File

@@ -1,68 +0,0 @@
`include "VX_define.vh"
// Fetch stage.
//
// Chains the warp scheduler and the instruction-cache stage: VX_warp_sched
// produces requests on the internal ifetch_req_if, and VX_icache_stage turns
// them into icache requests and returns the responses on ifetch_rsp_if.
module VX_fetch #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_VX_fetch

    input wire clk,
    input wire reset,

    // Icache interface
    VX_icache_req_if.master icache_req_if,
    VX_icache_rsp_if.slave  icache_rsp_if,

    // inputs
    VX_wstall_if.slave      wstall_if,
    VX_join_if.slave        join_if,
    VX_branch_ctl_if.slave  branch_ctl_if,
    VX_warp_ctl_if.slave    warp_ctl_if,

    // outputs
    VX_ifetch_rsp_if.master ifetch_rsp_if,

    // csr interface
    VX_fetch_to_csr_if.master fetch_to_csr_if,

    // busy status
    output wire             busy
);
    // link between the scheduler and the icache stage
    VX_ifetch_req_if ifetch_req_if();

    VX_warp_sched #(
        .CORE_ID (CORE_ID)
    ) warp_sched (
        `SCOPE_BIND_VX_fetch_warp_sched

        .clk             (clk),
        .reset           (reset),

        // scheduling inputs
        .wstall_if       (wstall_if),
        .join_if         (join_if),
        .branch_ctl_if   (branch_ctl_if),
        .warp_ctl_if     (warp_ctl_if),

        // scheduling outputs
        .ifetch_req_if   (ifetch_req_if),
        .fetch_to_csr_if (fetch_to_csr_if),
        .busy            (busy)
    );

    VX_icache_stage #(
        .CORE_ID (CORE_ID)
    ) icache_stage (
        `SCOPE_BIND_VX_fetch_icache_stage

        .clk            (clk),
        .reset          (reset),

        // pipeline side
        .ifetch_req_if  (ifetch_req_if),
        .ifetch_rsp_if  (ifetch_rsp_if),

        // cache side
        .icache_req_if  (icache_req_if),
        .icache_rsp_if  (icache_rsp_if)
    );

endmodule

View File

@@ -1,219 +0,0 @@
`include "VX_define.vh"
// Floating-point unit wrapper.
//
// Accepts requests on fpu_req_if, parks their metadata in a VX_index_buffer
// slot whose index travels through the FPU core as the tag, and rebuilds the
// commit packet from that slot when the core returns a result. The FPU core
// is chosen at compile time (FPU_DPI, FPU_FPNEW, otherwise the FPGA core).
// Exception flags of the active threads are OR-reduced and written back to
// the CSR unit when the result is committed.
module VX_fpu_unit #(
    parameter CORE_ID = 0
) (
    input wire clk,
    input wire reset,

    VX_fpu_req_if.slave     fpu_req_if,
    VX_fpu_to_csr_if.master fpu_to_csr_if,
    VX_commit_if.master     fpu_commit_if,

    input wire[`NUM_WARPS-1:0]  csr_pending,
    output wire[`NUM_WARPS-1:0] pending
);
    import fpu_types::*;
    `UNUSED_PARAM (CORE_ID)

    localparam FPUQ_BITS  = `LOG2UP(`FPUQ_SIZE);
    // metadata kept per in-flight request: uuid, wid, tmask, PC, rd, wb
    localparam META_DATAW = `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1;
    // output register: valid + metadata + result + has_fflags + fflags
    localparam RSP_DATAW  = 1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS;

    // handshake with the FPU core
    wire ready_in;
    wire valid_in;
    wire valid_out;
    wire ready_out;

    // metadata read back for the request that is completing
    wire [`UUID_BITS-1:0]   rsp_uuid;
    wire [`NW_BITS-1:0]     rsp_wid;
    wire [`NUM_THREADS-1:0] rsp_tmask;
    wire [31:0]             rsp_PC;
    wire [`NR_BITS-1:0]     rsp_rd;
    wire                    rsp_wb;

    // core results
    wire has_fflags;
    fflags_t [`NUM_THREADS-1:0] fflags;
    wire [`NUM_THREADS-1:0][31:0] result;

    wire [FPUQ_BITS-1:0] tag_in, tag_out;

    wire fpuq_full;

    // a slot is taken when a request is accepted and freed when its result leaves the core
    wire slot_acquire;
    wire slot_release;
    assign slot_acquire = fpu_req_if.valid && fpu_req_if.ready;
    assign slot_release = valid_out && ready_out;

    VX_index_buffer #(
        .DATAW (META_DATAW),
        .SIZE  (`FPUQ_SIZE)
    ) req_metadata (
        .clk          (clk),
        .reset        (reset),
        .acquire_slot (slot_acquire),
        .write_addr   (tag_in),
        .read_addr    (tag_out),
        .release_addr (tag_out),
        .write_data   ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}),
        .read_data    ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}),
        .release_slot (slot_release),
        .full         (fpuq_full),
        `UNUSED_PIN (empty)
    );

    // A request may enter only while a metadata slot is free and the CSR unit
    // has nothing pending for the requesting warp.
    wire req_allowed;
    assign req_allowed = ~fpuq_full && !csr_pending[fpu_req_if.wid];

    assign fpu_req_if.ready = ready_in && req_allowed;
    assign valid_in         = fpu_req_if.valid && req_allowed;

    // Rounding mode: taken from the instruction unless it asks for the dynamic
    // mode, in which case the CSR value of the requesting warp is used.
    assign fpu_to_csr_if.read_wid = fpu_req_if.wid;

    wire [`INST_FRM_BITS-1:0] fpu_frm;
    assign fpu_frm = (fpu_req_if.op_mod == `INST_FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod;

`ifdef FPU_DPI

    VX_fpu_dpi #(
        .TAGW (FPUQ_BITS)
    ) fpu_dpi (
        .clk        (clk),
        .reset      (reset),

        .valid_in   (valid_in),
        .ready_in   (ready_in),

        .tag_in     (tag_in),

        .op_type    (fpu_req_if.op_type),
        .frm        (fpu_frm),

        .dataa      (fpu_req_if.rs1_data),
        .datab      (fpu_req_if.rs2_data),
        .datac      (fpu_req_if.rs3_data),
        .result     (result),

        .has_fflags (has_fflags),
        .fflags     (fflags),

        .tag_out    (tag_out),

        .ready_out  (ready_out),
        .valid_out  (valid_out)
    );

`elsif FPU_FPNEW

    VX_fpu_fpnew #(
        .FMULADD  (1),
        .FDIVSQRT (1),
        .FNONCOMP (1),
        .FCONV    (1),
        .TAGW     (FPUQ_BITS)
    ) fpu_fpnew (
        .clk        (clk),
        .reset      (reset),

        .valid_in   (valid_in),
        .ready_in   (ready_in),

        .tag_in     (tag_in),

        .op_type    (fpu_req_if.op_type),
        .frm        (fpu_frm),

        .dataa      (fpu_req_if.rs1_data),
        .datab      (fpu_req_if.rs2_data),
        .datac      (fpu_req_if.rs3_data),
        .result     (result),

        .has_fflags (has_fflags),
        .fflags     (fflags),

        .tag_out    (tag_out),

        .ready_out  (ready_out),
        .valid_out  (valid_out)
    );

`else

    VX_fpu_fpga #(
        .TAGW (FPUQ_BITS)
    ) fpu_fpga (
        .clk        (clk),
        .reset      (reset),

        .valid_in   (valid_in),
        .ready_in   (ready_in),

        .tag_in     (tag_in),

        .op_type    (fpu_req_if.op_type),
        .frm        (fpu_frm),

        .dataa      (fpu_req_if.rs1_data),
        .datab      (fpu_req_if.rs2_data),
        .datac      (fpu_req_if.rs3_data),
        .result     (result),

        .has_fflags (has_fflags),
        .fflags     (fflags),

        .tag_out    (tag_out),

        .ready_out  (ready_out),
        .valid_out  (valid_out)
    );

`endif

    // OR together the exception flags of every thread enabled in the response mask.
    fflags_t rsp_fflags;

    always @(*) begin
        rsp_fflags = '0;
        for (integer t = 0; t < `NUM_THREADS; t++) begin
            if (rsp_tmask[t]) begin
                rsp_fflags.NX = rsp_fflags.NX | fflags[t].NX;
                rsp_fflags.UF = rsp_fflags.UF | fflags[t].UF;
                rsp_fflags.OF = rsp_fflags.OF | fflags[t].OF;
                rsp_fflags.DZ = rsp_fflags.DZ | fflags[t].DZ;
                rsp_fflags.NV = rsp_fflags.NV | fflags[t].NV;
            end
        end
    end

    // Output register; it holds its contents while the commit side is not
    // accepting a valid result.
    reg      commit_has_fflags;
    fflags_t commit_fflags;

    wire commit_stall;
    assign commit_stall = ~fpu_commit_if.ready && fpu_commit_if.valid;

    VX_pipe_register #(
        .DATAW  (RSP_DATAW),
        .RESETW (1)
    ) pipe_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (!commit_stall),
        .data_in  ({valid_out, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}),
        .data_out ({fpu_commit_if.valid, fpu_commit_if.uuid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, commit_has_fflags, commit_fflags})
    );

    assign fpu_commit_if.eop = 1'b1;

    assign ready_out = ~commit_stall;

    // fflags are written to the CSR unit in the cycle the result is committed
    assign fpu_to_csr_if.write_enable = fpu_commit_if.valid && fpu_commit_if.ready && commit_has_fflags;
    assign fpu_to_csr_if.write_wid    = fpu_commit_if.wid;
    assign fpu_to_csr_if.write_fflags = commit_fflags;

    // Per-warp pending flag: set when a request is accepted, cleared when a
    // result commits. The set is written last, so it wins when both happen for
    // the same warp in one cycle.
    reg [`NUM_WARPS-1:0] pending_q;

    always @(posedge clk) begin
        if (reset) begin
            pending_q <= 0;
        end else begin
            if (fpu_commit_if.valid && fpu_commit_if.ready) begin
                pending_q[fpu_commit_if.wid] <= 0;
            end
            if (fpu_req_if.valid && fpu_req_if.ready) begin
                pending_q[fpu_req_if.wid] <= 1;
            end
        end
    end

    assign pending = pending_q;

endmodule

View File

@@ -1,91 +0,0 @@
`include "VX_define.vh"
// General-purpose register file stage.
//
// Holds NUM_WARPS * NUM_REGS 32-bit entries per thread. Each read port (rs1,
// rs2 and, with EXT_F_ENABLE, rs3) is backed by its own VX_dp_ram copy per
// thread; all copies receive the same write so they stay identical.
// Writeback is always accepted.
module VX_gpr_stage #(
    parameter CORE_ID = 0
) (
    input wire              clk,
    input wire              reset,

    // inputs
    VX_writeback_if.slave   writeback_if,
    VX_gpr_req_if.slave     gpr_req_if,

    // outputs
    VX_gpr_rsp_if.master    gpr_rsp_if
);
    `UNUSED_PARAM (CORE_ID)
    `UNUSED_VAR (reset)

    localparam RAM_SIZE  = `NUM_WARPS * `NUM_REGS;
    localparam RAM_ADDRW = $clog2(RAM_SIZE);

    // Register 0 must never be written (a write could otherwise slip in before reset).
    wire write_enable;
    assign write_enable = writeback_if.valid && (writeback_if.rd != 0);

    // per-thread write strobes, gated by the writeback thread mask
    wire [`NUM_THREADS-1:0] wren;
    for (genvar i = 0; i < `NUM_THREADS; ++i) begin
        assign wren[i] = write_enable && writeback_if.tmask[i];
    end

    // entries are addressed by {warp id, register index}
    wire [RAM_ADDRW-1:0] waddr, raddr1, raddr2;
    assign waddr  = {writeback_if.wid, writeback_if.rd};
    assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1};
    assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2};

    // rs1 / rs2 read ports
    for (genvar i = 0; i < `NUM_THREADS; ++i) begin
        VX_dp_ram #(
            .DATAW       (32),
            .SIZE        (RAM_SIZE),
            .INIT_ENABLE (1),
            .INIT_VALUE  (0)
        ) dp_ram1 (
            .clk   (clk),
            .wren  (wren[i]),
            .waddr (waddr),
            .wdata (writeback_if.data[i]),
            .raddr (raddr1),
            .rdata (gpr_rsp_if.rs1_data[i])
        );

        VX_dp_ram #(
            .DATAW       (32),
            .SIZE        (RAM_SIZE),
            .INIT_ENABLE (1),
            .INIT_VALUE  (0)
        ) dp_ram2 (
            .clk   (clk),
            .wren  (wren[i]),
            .waddr (waddr),
            .wdata (writeback_if.data[i]),
            .raddr (raddr2),
            .rdata (gpr_rsp_if.rs2_data[i])
        );
    end

`ifdef EXT_F_ENABLE
    // rs3 read port, present only with the F extension
    wire [RAM_ADDRW-1:0] raddr3;
    assign raddr3 = {gpr_req_if.wid, gpr_req_if.rs3};

    for (genvar i = 0; i < `NUM_THREADS; ++i) begin
        VX_dp_ram #(
            .DATAW       (32),
            .SIZE        (RAM_SIZE),
            .INIT_ENABLE (1),
            .INIT_VALUE  (0)
        ) dp_ram3 (
            .clk   (clk),
            .wren  (wren[i]),
            .waddr (waddr),
            .wdata (writeback_if.data[i]),
            .raddr (raddr3),
            .rdata (gpr_rsp_if.rs3_data[i])
        );
    end
`else
    // no third operand without the F extension
    `UNUSED_VAR (gpr_req_if.rs3)
    assign gpr_rsp_if.rs3_data = 'x;
`endif

    assign writeback_if.ready = 1'b1;

endmodule

218
hw/rtl/VX_gpu_pkg.sv Normal file
View File

@@ -0,0 +1,218 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_GPU_PKG_VH
`define VX_GPU_PKG_VH
`include "VX_define.vh"
// Shared types, derived cache/memory parameters and warp-index helpers.
package VX_gpu_pkg;

    ///////////////////////////// Warp-control records ////////////////////////

    // thread-mask update
    typedef struct packed {
        logic                    valid;
        logic [`NUM_THREADS-1:0] tmask;
    } tmc_t;

    // warp spawn: mask of warps and the pc field they receive
    typedef struct packed {
        logic                   valid;
        logic [`NUM_WARPS-1:0]  wmask;
        logic [`XLEN-1:0]       pc;
    } wspawn_t;

    // split: the two thread masks and the next pc
    typedef struct packed {
        logic                    valid;
        logic                    is_dvg;
        logic [`NUM_THREADS-1:0] then_tmask;
        logic [`NUM_THREADS-1:0] else_tmask;
        logic [`XLEN-1:0]        next_pc;
    } split_t;

    // join
    typedef struct packed {
        logic valid;
        logic is_dvg;
    } join_t;

    // barrier: size_m1 is wide enough for a core count only when global barriers are enabled
    typedef struct packed {
        logic                   valid;
        logic [`NB_WIDTH-1:0]   id;
        logic                   is_global;
    `ifdef GBAR_ENABLE
        logic [`MAX(`NW_WIDTH, `NC_WIDTH)-1:0] size_m1;
    `else
        logic [`NW_WIDTH-1:0]   size_m1;
    `endif
    } barrier_t;

    // base device configuration registers
    typedef struct packed {
        logic [`XLEN-1:0] startup_addr;
        logic [7:0]       mpm_class;
    } base_dcrs_t;

    /* verilator lint_off UNUSED */

    ////////////////////////// Icache Parameters //////////////////////////////

    // word size (bytes)
    localparam ICACHE_WORD_SIZE      = 4;
    localparam ICACHE_ADDR_WIDTH     = (`MEM_ADDR_WIDTH - `CLOG2(ICACHE_WORD_SIZE));

    // line size (bytes)
    localparam ICACHE_LINE_SIZE      = `L1_LINE_SIZE;

    // id portion of the core request tag
    localparam ICACHE_TAG_ID_BITS    = `NW_WIDTH;

    // core request tag, before and after socket arbitration
    localparam ICACHE_TAG_WIDTH      = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
    localparam ICACHE_ARB_TAG_WIDTH  = (ICACHE_TAG_WIDTH + `CLOG2(`SOCKET_SIZE));

    // memory request data width (bits)
    localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);

    // memory request tag width: cache tag when the icache exists, bypass tag otherwise
`ifdef ICACHE_ENABLE
    localparam ICACHE_MEM_TAG_WIDTH  = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else
    localparam ICACHE_MEM_TAG_WIDTH  = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES);
`endif

    ////////////////////////// Dcache Parameters //////////////////////////////

    // word size (bytes)
    localparam DCACHE_WORD_SIZE      = (`XLEN / 8);
    localparam DCACHE_ADDR_WIDTH     = (`MEM_ADDR_WIDTH - `CLOG2(DCACHE_WORD_SIZE));

    // line size (bytes)
    localparam DCACHE_LINE_SIZE      = `L1_LINE_SIZE;

    // number of request inputs
    localparam DCACHE_NUM_REQS       = `MAX(`DCACHE_NUM_BANKS, `SMEM_NUM_BANKS);

    // number of LSU memory requests
    localparam LSU_MEM_REQS          = `NUM_LSU_LANES;

    // batches needed to push all LSU requests through the request inputs, and the select bits for them
    localparam DCACHE_NUM_BATCHES    = ((LSU_MEM_REQS + DCACHE_NUM_REQS - 1) / DCACHE_NUM_REQS);
    localparam DCACHE_BATCH_SEL_BITS = `CLOG2(DCACHE_NUM_BATCHES);

    // id portion of the core request tag
    localparam LSUQ_TAG_BITS         = (`CLOG2(`LSUQ_SIZE) + DCACHE_BATCH_SEL_BITS);
    localparam DCACHE_TAG_ID_BITS    = (LSUQ_TAG_BITS + `CACHE_ADDR_TYPE_BITS);

    // core request tag: full, without the shared-memory bit, and after socket arbitration
    localparam DCACHE_TAG_WIDTH      = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
    localparam DCACHE_NOSM_TAG_WIDTH = (DCACHE_TAG_WIDTH - `SM_ENABLED);
    localparam DCACHE_ARB_TAG_WIDTH  = (DCACHE_NOSM_TAG_WIDTH + `CLOG2(`SOCKET_SIZE));

    // memory request data width (bits)
    localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);

    // memory request tag width
`ifdef DCACHE_ENABLE
    localparam DCACHE_MEM_TAG_WIDTH  = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_DCACHES);
`else
    localparam DCACHE_MEM_TAG_WIDTH  = `CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_DCACHES);
`endif

    /////////////////////////////// L1 Parameters /////////////////////////////

    // the L1 memory tag must fit both the icache and the dcache tag
    localparam L1_MEM_TAG_WIDTH      = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
    localparam NUM_L1_OUTPUTS        = 2;

    /////////////////////////////// L2 Parameters /////////////////////////////

    // word size (bytes): one L1 line
    localparam L2_WORD_SIZE          = `L1_LINE_SIZE;

    // number of request inputs
    localparam L2_NUM_REQS           = NUM_L1_OUTPUTS;

    // core request tag width
    localparam L2_TAG_WIDTH          = L1_MEM_TAG_WIDTH;

    // memory request data width (bits)
    localparam L2_MEM_DATA_WIDTH     = (`L2_LINE_SIZE * 8);

    // memory request tag width
`ifdef L2_ENABLE
    localparam L2_MEM_TAG_WIDTH      = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`else
    localparam L2_MEM_TAG_WIDTH      = `CACHE_NC_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`endif

    /////////////////////////////// L3 Parameters /////////////////////////////

    // word size (bytes): one L2 line
    localparam L3_WORD_SIZE          = `L2_LINE_SIZE;

    // number of request inputs
    localparam L3_NUM_REQS           = `NUM_CLUSTERS;

    // core request tag width
    localparam L3_TAG_WIDTH          = L2_MEM_TAG_WIDTH;

    // memory request data width (bits)
    localparam L3_MEM_DATA_WIDTH     = (`L3_LINE_SIZE * 8);

    // memory request tag width
`ifdef L3_ENABLE
    localparam L3_MEM_TAG_WIDTH      = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`else
    localparam L3_MEM_TAG_WIDTH      = `CACHE_NC_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`endif

    /* verilator lint_on UNUSED */

    /////////////////////////////// Issue parameters //////////////////////////

    localparam ISSUE_IDX_W = `LOG2UP(`ISSUE_WIDTH);
    localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH;
    localparam ISSUE_WIS_W = `LOG2UP(ISSUE_RATIO);
    localparam ISSUE_ADDRW = `LOG2UP(`NUM_REGS * (ISSUE_RATIO));

    // Warp id -> issue slot index: the low ISSUE_IDX_W bits of the warp id,
    // or 0 when there is a single issue slot.
    `IGNORE_UNUSED_BEGIN
    function logic [ISSUE_IDX_W-1:0] wid_to_isw(
        input logic [`NW_WIDTH-1:0] wid
    );
        if (`ISSUE_WIDTH > 1) begin
            return ISSUE_IDX_W'(wid);
        end else begin
            return 0;
        end
    endfunction
    `IGNORE_UNUSED_END

    // {warp-in-slot, issue slot} -> warp id. The shift drops the padding bit
    // that LOG2UP adds to isw when ISSUE_WIDTH is 1.
    function logic [`NW_WIDTH-1:0] wis_to_wid(
        input logic [ISSUE_WIS_W-1:0] wis,
        input logic [ISSUE_IDX_W-1:0] isw
    );
        return `NW_WIDTH'({wis, isw} >> (ISSUE_IDX_W-`CLOG2(`ISSUE_WIDTH)));
    endfunction

    // Warp id -> warp-in-slot index: the bits above the issue slot index.
    function logic [ISSUE_WIS_W-1:0] wid_to_wis(
        input logic [`NW_WIDTH-1:0] wid
    );
        return ISSUE_WIS_W'(wid >> `CLOG2(`ISSUE_WIDTH));
    endfunction

    // {register id, warp-in-slot} -> register file address. The shift drops
    // the padding bit that LOG2UP adds to wis when ISSUE_RATIO is 1.
    function logic [ISSUE_ADDRW-1:0] wis_to_addr(
        input logic [`NR_BITS-1:0]    rid,
        input logic [ISSUE_WIS_W-1:0] wis
    );
        return ISSUE_ADDRW'({rid, wis} >> (ISSUE_WIS_W-`CLOG2(ISSUE_RATIO)));
    endfunction

endpackage
`endif // VX_GPU_PKG_VH

View File

@@ -1,43 +0,0 @@
`ifndef VX_GPU_TYPES
`define VX_GPU_TYPES
`include "VX_define.vh"
// Packed records carried by the GPU unit's warp-control path. Each type is followed by a
// macro giving its width in bits via $bits, for use where the package is not imported.
// Field order defines the packed bit layout and must not be changed.
package gpu_types;
// Thread-mask update: `tmask` holds one bit per thread.
typedef struct packed {
logic valid;
logic [`NUM_THREADS-1:0] tmask;
} gpu_tmc_t;
`define GPU_TMC_BITS $bits(gpu_types::gpu_tmc_t)
// Warp spawn: `wmask` holds one bit per warp, `pc` is a 32-bit program counter value.
typedef struct packed {
logic valid;
logic [`NUM_WARPS-1:0] wmask;
logic [31:0] pc;
} gpu_wspawn_t;
`define GPU_WSPAWN_BITS $bits(gpu_types::gpu_wspawn_t)
// Split: the two per-thread masks of the branch plus a 32-bit pc.
// NOTE(review): whether `pc` is the else-path or the join address is not visible here -- confirm in the warp scheduler.
typedef struct packed {
logic valid;
logic diverged;
logic [`NUM_THREADS-1:0] then_tmask;
logic [`NUM_THREADS-1:0] else_tmask;
logic [31:0] pc;
} gpu_split_t;
`define GPU_SPLIT_BITS $bits(gpu_types::gpu_split_t)
// Barrier: barrier `id` and `size_m1` (NW_BITS wide).
// NOTE(review): the name suggests size_m1 is the participant count minus one -- confirm.
typedef struct packed {
logic valid;
logic [`NB_BITS-1:0] id;
logic [`NW_BITS-1:0] size_m1;
} gpu_barrier_t;
`define GPU_BARRIER_BITS $bits(gpu_types::gpu_barrier_t)
endpackage
`endif

View File

@@ -1,220 +0,0 @@
// VX_gpu_unit: handles the GPU-extension operations (WSPAWN, TMC, SPLIT, BAR, PRED
// and, when EXT_TEX_ENABLE is defined, TEX). The warp-control records are computed
// combinationally from the request, registered through a single pipeline register,
// and then presented on warp_ctl_if together with a commit on gpu_commit_if.
`include "VX_define.vh"
module VX_gpu_unit #(
parameter CORE_ID = 0
) (
`SCOPE_IO_VX_gpu_unit
input wire clk,
input wire reset,
// Inputs
VX_gpu_req_if.slave gpu_req_if,
`ifdef EXT_TEX_ENABLE
// PERF
`ifdef PERF_ENABLE
VX_perf_tex_if.master perf_tex_if,
`endif
VX_dcache_req_if.master dcache_req_if,
VX_dcache_rsp_if.slave dcache_rsp_if,
VX_tex_csr_if.slave tex_csr_if,
`endif
// Outputs
VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master gpu_commit_if
);
import gpu_types::*;
`UNUSED_PARAM (CORE_ID)
// Width of the four packed warp-control records concatenated together.
localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS;
// The response data field is shared between per-thread 32-bit data and the
// warp-control records, so it is sized for the larger of the two.
localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW);
wire rsp_valid;
wire [`UUID_BITS-1:0] rsp_uuid;
wire [`NW_BITS-1:0] rsp_wid;
wire [`NUM_THREADS-1:0] rsp_tmask;
wire [31:0] rsp_PC;
wire [`NR_BITS-1:0] rsp_rd;
wire rsp_wb;
wire [RSP_DATAW-1:0] rsp_data, rsp_data_r;
gpu_tmc_t tmc;
gpu_wspawn_t wspawn;
gpu_barrier_t barrier;
gpu_split_t split;
wire [WCTL_DATAW-1:0] warp_ctl_data;
wire is_warp_ctl;
wire stall_in, stall_out;
// operation decode
wire is_wspawn = (gpu_req_if.op_type == `INST_GPU_WSPAWN);
wire is_tmc = (gpu_req_if.op_type == `INST_GPU_TMC);
wire is_split = (gpu_req_if.op_type == `INST_GPU_SPLIT);
wire is_bar = (gpu_req_if.op_type == `INST_GPU_BAR);
wire is_pred = (gpu_req_if.op_type == `INST_GPU_PRED);
// scalar operands are taken from the thread selected by gpu_req_if.tid
wire [31:0] rs1_data = gpu_req_if.rs1_data[gpu_req_if.tid];
wire [31:0] rs2_data = gpu_req_if.rs2_data[gpu_req_if.tid];
// per-thread predicate: an active thread is "taken" when its rs1 value is non-zero
wire [`NUM_THREADS-1:0] taken_tmask;
wire [`NUM_THREADS-1:0] not_taken_tmask;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire taken = (gpu_req_if.rs1_data[i] != 0);
assign taken_tmask[i] = gpu_req_if.tmask[i] & taken;
assign not_taken_tmask[i] = gpu_req_if.tmask[i] & ~taken;
end
// tmc
// PRED keeps the current thread mask when no thread is taken; TMC takes the new
// mask directly from the low bits of rs1.
wire [`NUM_THREADS-1:0] pred_mask = (taken_tmask != 0) ? taken_tmask : gpu_req_if.tmask;
assign tmc.valid = is_tmc || is_pred;
assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];
// wspawn
// rs1 gives the number of warps (warps 0 .. rs1-1 are selected), rs2 the start PC.
wire [31:0] wspawn_pc = rs2_data;
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; i++) begin
assign wspawn_wmask[i] = (i < rs1_data);
end
assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask;
assign wspawn.pc = wspawn_pc;
// split
// "diverged" is set only when both the taken and the not-taken sets are non-empty.
assign split.valid = is_split;
assign split.diverged = (| taken_tmask) && (| not_taken_tmask);
assign split.then_tmask = taken_tmask;
assign split.else_tmask = not_taken_tmask;
assign split.pc = gpu_req_if.next_PC;
// barrier
// rs1 selects the barrier id; rs2 is stored minus one, truncated to `NW_BITS.
assign barrier.valid = is_bar;
assign barrier.id = rs1_data[`NB_BITS-1:0];
assign barrier.size_m1 = (`NW_BITS)'(rs2_data - 1);
// pack warp ctl result
assign warp_ctl_data = {tmc, wspawn, split, barrier};
// texture
`ifdef EXT_TEX_ENABLE
`UNUSED_VAR (gpu_req_if.op_mod)
VX_tex_req_if tex_req_if();
VX_tex_rsp_if tex_rsp_if();
wire is_tex = (gpu_req_if.op_type == `INST_GPU_TEX);
// forward TEX requests to the texture unit
assign tex_req_if.valid = gpu_req_if.valid && is_tex;
assign tex_req_if.uuid = gpu_req_if.uuid;
assign tex_req_if.wid = gpu_req_if.wid;
assign tex_req_if.tmask = gpu_req_if.tmask;
assign tex_req_if.PC = gpu_req_if.PC;
assign tex_req_if.rd = gpu_req_if.rd;
assign tex_req_if.wb = gpu_req_if.wb;
assign tex_req_if.unit = gpu_req_if.op_mod[`NTEX_BITS-1:0];
assign tex_req_if.coords[0] = gpu_req_if.rs1_data;
assign tex_req_if.coords[1] = gpu_req_if.rs2_data;
assign tex_req_if.lod = gpu_req_if.rs3_data;
VX_tex_unit #(
.CORE_ID(CORE_ID)
) tex_unit (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.perf_tex_if (perf_tex_if),
`endif
.tex_req_if (tex_req_if),
.tex_csr_if (tex_csr_if),
.tex_rsp_if (tex_rsp_if),
.dcache_req_if (dcache_req_if),
.dcache_rsp_if (dcache_rsp_if)
);
assign tex_rsp_if.ready = !stall_out;
// A TEX request waits for the texture unit to accept it; any other request waits
// while a texture response is using the output register or the output is stalled.
assign stall_in = (is_tex && ~tex_req_if.ready)
|| (~is_tex && (tex_rsp_if.valid || stall_out));
assign is_warp_ctl = !(is_tex || tex_rsp_if.valid);
// A pending texture response takes priority over the incoming request when
// selecting what goes into the output register.
assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex);
assign rsp_uuid = tex_rsp_if.valid ? tex_rsp_if.uuid : gpu_req_if.uuid;
assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid;
assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask;
assign rsp_PC = tex_rsp_if.valid ? tex_rsp_if.PC : gpu_req_if.PC;
// rd is only meaningful when rsp_wb is set, which requires a valid texture response
assign rsp_rd = tex_rsp_if.rd;
assign rsp_wb = tex_rsp_if.valid && tex_rsp_if.wb;
assign rsp_data = tex_rsp_if.valid ? RSP_DATAW'(tex_rsp_if.data) : RSP_DATAW'(warp_ctl_data);
`else
// without the texture extension every request is a warp-control operation
// and nothing is written back to the register file
`UNUSED_VAR (gpu_req_if.op_mod)
`UNUSED_VAR (gpu_req_if.rs3_data)
`UNUSED_VAR (gpu_req_if.wb)
`UNUSED_VAR (gpu_req_if.rd)
assign stall_in = stall_out;
assign is_warp_ctl = 1;
assign rsp_valid = gpu_req_if.valid;
assign rsp_uuid = gpu_req_if.uuid;
assign rsp_wid = gpu_req_if.wid;
assign rsp_tmask = gpu_req_if.tmask;
assign rsp_PC = gpu_req_if.PC;
assign rsp_rd = 0;
assign rsp_wb = 0;
assign rsp_data = RSP_DATAW'(warp_ctl_data);
`endif
wire is_warp_ctl_r;
// output
// hold the output register while a valid commit has not been accepted
assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid;
VX_pipe_register #(
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (!stall_out),
.data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}),
.data_out ({gpu_commit_if.valid, gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r})
);
assign gpu_commit_if.data = rsp_data_r[(`NUM_THREADS * 32)-1:0];
assign gpu_commit_if.eop = 1'b1;
// warp control response
// the same registered data field is reinterpreted as the four packed records;
// it is only flagged valid when the commit fires and the entry was a warp-control op
assign {warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier} = rsp_data_r[WCTL_DATAW-1:0];
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready && is_warp_ctl_r;
assign warp_ctl_if.wid = gpu_commit_if.wid;
// can accept new request?
assign gpu_req_if.ready = ~stall_in;
`SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid);
`SCOPE_ASSIGN (gpu_rsp_uuid, gpu_commit_if.uuid);
`SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc.valid);
`SCOPE_ASSIGN (gpu_rsp_wspawn, warp_ctl_if.wspawn.valid);
`SCOPE_ASSIGN (gpu_rsp_split, warp_ctl_if.split.valid);
`SCOPE_ASSIGN (gpu_rsp_barrier, warp_ctl_if.barrier.valid);
endmodule

View File

@@ -1,210 +0,0 @@
// VX_ibuffer: per-warp instruction buffer between decode and issue.
// Each warp has its own queue plus an output register (q_data_out) holding the
// warp's next instruction. A scheduler picks which warp's instruction is presented
// on ibuffer_if in the following cycle and also exposes that "next" choice
// (wid_n / rd_n / rs*_n) combinationally.
`include "VX_define.vh"
module VX_ibuffer #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_decode_if.slave decode_if,
// outputs
VX_ibuffer_if.master ibuffer_if
);
`UNUSED_PARAM (CORE_ID)
// width of one buffered instruction record (see q_data_in below for the field order)
localparam DATAW = `UUID_BITS + `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1;
localparam ADDRW = $clog2(`IBUF_SIZE+1);
localparam NWARPSW = $clog2(`NUM_WARPS+1);
// per-warp occupancy counter and status flags
reg [`NUM_WARPS-1:0][ADDRW-1:0] used_r;
reg [`NUM_WARPS-1:0] full_r, empty_r, alm_empty_r;
wire [`NUM_WARPS-1:0] q_full, q_empty, q_alm_empty;
wire [DATAW-1:0] q_data_in;
// q_data_prev: output of each warp's queue; q_data_out: each warp's output register
wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_prev;
reg [`NUM_WARPS-1:0][DATAW-1:0] q_data_out;
wire enq_fire = decode_if.valid && decode_if.ready;
wire deq_fire = ibuffer_if.valid && ibuffer_if.ready;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
wire writing = enq_fire && (i == decode_if.wid);
wire reading = deq_fire && (i == ibuffer_if.wid);
// true when this warp has no instruction left behind its output register after
// this cycle: an incoming instruction then bypasses the queue and is written
// straight into q_data_out
wire going_empty = empty_r[i] || (alm_empty_r[i] && reading);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
.OUT_REG (1)
) queue (
.clk (clk),
.reset (reset),
.valid_in (writing && !going_empty),
.data_in (q_data_in),
.ready_out(reading),
.data_out (q_data_prev[i]),
`UNUSED_PIN (ready_in),
`UNUSED_PIN (valid_out)
);
always @(posedge clk) begin
if (reset) begin
used_r[i] <= 0;
full_r[i] <= 0;
empty_r[i] <= 1;
alm_empty_r[i] <= 1;
end else begin
// the flags are updated from the pre-update value of used_r;
// a simultaneous write and read leaves them unchanged
if (writing) begin
if (!reading) begin
empty_r[i] <= 0;
if (used_r[i] == 1)
alm_empty_r[i] <= 0;
if (used_r[i] == ADDRW'(`IBUF_SIZE))
full_r[i] <= 1;
end
end else if (reading) begin
full_r[i] <= 0;
if (used_r[i] == ADDRW'(1))
empty_r[i] <= 1;
if (used_r[i] == ADDRW'(2))
alm_empty_r[i] <= 1;
end
// +1 on write only, -1 on read only, unchanged otherwise
used_r[i] <= used_r[i] + ADDRW'($signed(2'(writing) - 2'(reading)));
end
// output register update (not reset): take the incoming instruction directly
// when bypassing the queue, otherwise advance to the queue's output on a read
if (writing && going_empty) begin
q_data_out[i] <= q_data_in;
end else if (reading) begin
q_data_out[i] <= q_data_prev[i];
end
end
assign q_full[i] = full_r[i];
assign q_empty[i] = empty_r[i];
assign q_alm_empty[i] = alm_empty_r[i];
end
///////////////////////////////////////////////////////////////////////////
reg [`NUM_WARPS-1:0] valid_table, valid_table_n;
reg [`NW_BITS-1:0] deq_wid, deq_wid_n;
reg [`NW_BITS-1:0] deq_wid_rr, deq_wid_rr_n;
reg deq_valid, deq_valid_n;
reg [DATAW-1:0] deq_instr, deq_instr_n;
// number of warps that currently have at least one buffered instruction
reg [NWARPSW-1:0] num_warps;
`UNUSED_VAR (deq_instr)
// calculate valid table
// (one bit per warp; the enqueue update is applied last, so it wins when the
// same warp is dequeued and enqueued in one cycle)
always @(*) begin
valid_table_n = valid_table;
if (deq_fire) begin
valid_table_n[deq_wid] = !q_alm_empty[deq_wid];
end
if (enq_fire) begin
valid_table_n[decode_if.wid] = 1;
end
end
// round-robin warp scheduling
VX_rr_arbiter #(
.NUM_REQS (`NUM_WARPS)
) rr_arbiter (
.clk (clk),
.reset (reset),
.requests (valid_table_n),
.grant_index (deq_wid_rr_n),
`UNUSED_PIN (grant_valid),
`UNUSED_PIN (grant_onehot),
`UNUSED_PIN (enable)
);
// schedule the next instruction to issue
// - more than one warp buffered: use the registered round-robin pick
// - exactly one warp buffered and it is not being drained this cycle: stay on it
// - otherwise: forward the instruction arriving from decode, if any
always @(*) begin
if (num_warps > 1) begin
deq_valid_n = 1;
deq_wid_n = deq_wid_rr;
deq_instr_n = q_data_out[deq_wid_rr];
end else if (1 == num_warps && !(deq_fire && q_alm_empty[deq_wid])) begin
deq_valid_n = 1;
deq_wid_n = deq_wid;
deq_instr_n = deq_fire ? q_data_prev[deq_wid] : q_data_out[deq_wid];
end else begin
deq_valid_n = enq_fire;
deq_wid_n = decode_if.wid;
deq_instr_n = q_data_in;
end
end
// a warp joins the buffered set when it enqueues into an empty buffer, and leaves
// it when its last instruction is dequeued without a same-cycle enqueue for it
wire warp_added = enq_fire && q_empty[decode_if.wid];
wire warp_removed = deq_fire && ~(enq_fire && decode_if.wid == deq_wid) && q_alm_empty[deq_wid];
always @(posedge clk) begin
if (reset) begin
valid_table <= 0;
deq_valid <= 0;
num_warps <= 0;
end else begin
valid_table <= valid_table_n;
deq_valid <= deq_valid_n;
if (warp_added && !warp_removed) begin
num_warps <= num_warps + NWARPSW'(1);
end else if (warp_removed && !warp_added) begin
num_warps <= num_warps - NWARPSW'(1);
end
end
// data-path registers are updated every cycle and are not reset
deq_wid <= deq_wid_n;
deq_wid_rr <= deq_wid_rr_n;
deq_instr <= deq_instr_n;
end
// decode is back-pressured only by the buffer of the warp it is targeting
assign decode_if.ready = ~q_full[decode_if.wid];
// instruction record layout, MSB first; the four register ids occupy the LSBs
assign q_data_in = {decode_if.uuid,
decode_if.tmask,
decode_if.PC,
decode_if.ex_type,
decode_if.op_type,
decode_if.op_mod,
decode_if.wb,
decode_if.use_PC,
decode_if.use_imm,
decode_if.imm,
decode_if.rd,
decode_if.rs1,
decode_if.rs2,
decode_if.rs3};
assign ibuffer_if.valid = deq_valid;
assign ibuffer_if.wid = deq_wid;
assign {ibuffer_if.uuid,
ibuffer_if.tmask,
ibuffer_if.PC,
ibuffer_if.ex_type,
ibuffer_if.op_type,
ibuffer_if.op_mod,
ibuffer_if.wb,
ibuffer_if.use_PC,
ibuffer_if.use_imm,
ibuffer_if.imm,
ibuffer_if.rd,
ibuffer_if.rs1,
ibuffer_if.rs2,
ibuffer_if.rs3} = deq_instr;
// scoreboard forwarding
// next-cycle warp id and register ids, sliced from the LSBs of the next record
// ({rd, rs1, rs2, rs3}, rs3 lowest)
assign ibuffer_if.wid_n = deq_wid_n;
assign ibuffer_if.rd_n = deq_instr_n[3*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs1_n = deq_instr_n[2*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs2_n = deq_instr_n[1*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs3_n = deq_instr_n[0*`NR_BITS +: `NR_BITS];
endmodule

View File

@@ -1,102 +0,0 @@
`include "VX_define.vh"

// VX_icache_stage: turns fetch requests into instruction-cache requests and
// cache responses into fetch responses.
// The request's PC, thread mask and uuid are stored in a small RAM indexed by
// warp id when the cache request fires; the warp id travels in the low bits of
// the cache tag and is used to look that metadata up again on the response.
module VX_icache_stage #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_VX_icache_stage

    input wire clk,
    input wire reset,

    // Icache interface
    VX_icache_req_if.master icache_req_if,
    VX_icache_rsp_if.slave  icache_rsp_if,

    // request
    VX_ifetch_req_if.slave  ifetch_req_if,

    // response
    VX_ifetch_rsp_if.master ifetch_rsp_if
);
    `UNUSED_PARAM (CORE_ID)
    `UNUSED_VAR (reset)

    // 0: the response path is passed to VX_pipe_register with DEPTH = 0
    localparam OUT_REG = 0;

    // warp id used as metadata index on the request and response sides
    wire [`NW_BITS-1:0] req_tag;
    wire [`NW_BITS-1:0] rsp_tag;
    assign req_tag = ifetch_req_if.wid;
    assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0];

    wire icache_req_fire;
    assign icache_req_fire = icache_req_if.valid && icache_req_if.ready;

    // metadata read back for the responding warp
    wire [31:0]             meta_PC;
    wire [`NUM_THREADS-1:0] meta_tmask;
    wire [`UUID_BITS-1:0]   rsp_uuid;

    VX_dp_ram #(
        .DATAW  (32 + `NUM_THREADS + `UUID_BITS),
        .SIZE   (`NUM_WARPS),
        .LUTRAM (1)
    ) req_metadata (
        .clk   (clk),
        .wren  (icache_req_fire),
        .waddr (req_tag),
        .wdata ({ifetch_req_if.PC, ifetch_req_if.tmask, ifetch_req_if.uuid}),
        .raddr (rsp_tag),
        .rdata ({meta_PC, meta_tmask, rsp_uuid})
    );

    `RUNTIME_ASSERT((!ifetch_req_if.valid || ifetch_req_if.PC >= `STARTUP_ADDR),
    ("%t: *** invalid PC=%0h, wid=%0d, tmask=%b (#%0d)", $time, ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask, ifetch_req_if.uuid))

    // Icache request: word address (PC without its two low bits), tagged with {uuid, wid}
    assign icache_req_if.tag   = {ifetch_req_if.uuid, req_tag};
    assign icache_req_if.addr  = ifetch_req_if.PC[31:2];
    assign icache_req_if.valid = ifetch_req_if.valid;

    // the fetch request is accepted exactly when the cache accepts it
    assign ifetch_req_if.ready = icache_req_if.ready;

    // hold the response path while a valid response is not being accepted
    // (this term is only active for OUT_REG == 0)
    wire rsp_stall;
    assign rsp_stall = (0 == OUT_REG) && ifetch_rsp_if.valid && ~ifetch_rsp_if.ready;

    VX_pipe_register #(
        .DATAW  (1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `UUID_BITS),
        .RESETW (1),
        .DEPTH  (OUT_REG)
    ) pipe_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (!rsp_stall),
        .data_in  ({icache_rsp_if.valid, rsp_tag, meta_tmask, meta_PC, icache_rsp_if.data, rsp_uuid}),
        .data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid})
    );

    // the cache response is accepted whenever the response path is not held
    assign icache_rsp_if.ready = ~rsp_stall;

    `SCOPE_ASSIGN (icache_req_fire, icache_req_fire);
    `SCOPE_ASSIGN (icache_req_uuid, ifetch_req_if.uuid);
    `SCOPE_ASSIGN (icache_req_addr, {icache_req_if.addr, 2'b0});
    `SCOPE_ASSIGN (icache_req_tag, req_tag);
    `SCOPE_ASSIGN (icache_rsp_fire, icache_rsp_if.valid && icache_rsp_if.ready);
    `SCOPE_ASSIGN (icache_rsp_uuid, rsp_uuid);
    `SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data);
    `SCOPE_ASSIGN (icache_rsp_tag, rsp_tag);

`ifdef DBG_TRACE_CORE_ICACHE
    // debug trace of accepted cache requests and delivered fetch responses
    always @(posedge clk) begin
        if (icache_req_fire) begin
            dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h (#%0d)\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, ifetch_req_if.uuid);
        end
        if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin
            dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, data=%0h (#%0d)\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid);
        end
    end
`endif

endmodule

View File

@@ -1,68 +0,0 @@
`include "VX_platform.vh"

// VX_ipdom_stack: a stack whose entries each hold two WIDTH-bit values {q2, q1}.
//
// Parameters:
//   WIDTH - width of each of the two values stored per entry
//   DEPTH - number of entries in the backing RAM
//
// Push (push = 1) stores {q2, q1} at wr_ptr:
//   - pair = 1: the entry is popped twice. The first pop yields q2 (index = 0)
//     and keeps the entry; the second pop yields q1 (index = 1) and frees it.
//   - pair = 0: the entry is popped once, yielding q1 (index = 1).
// Pop (pop = 1, ignored when push is also set) advances the top entry as
// described above. `d`/`index` always reflect the current top entry.
// `empty` is asserted when wr_ptr is 0 and `full` when wr_ptr is DEPTH-1.
module VX_ipdom_stack #(
    parameter WIDTH = 1,
    parameter DEPTH = 1
) (
    input  wire               clk,
    input  wire               reset,
    input  wire               pair,
    input  wire [WIDTH - 1:0] q1,
    input  wire [WIDTH - 1:0] q2,
    output wire [WIDTH - 1:0] d,
    input  wire               push,
    input  wire               pop,
    output wire               index,
    output wire               empty,
    output wire               full
);
    // Pointer width. $clog2(1) is 0, which for the default DEPTH = 1 would turn
    // the pointer declarations into [-1:0] ranges and the ADDRW'(...) casts into
    // zero-width casts; clamp the width to at least one bit.
    // NOTE(review): assumes VX_dp_ram also uses at least one address bit for
    // SIZE = 1 - confirm against that module.
    localparam ADDRW = (DEPTH > 1) ? $clog2(DEPTH) : 1;

    // is_part[i] = 1 when entry i has only its q1 half left to pop
    reg is_part [DEPTH-1:0];

    // wr_ptr: next free slot; rd_ptr: current top entry
    reg [ADDRW-1:0] rd_ptr, wr_ptr;

    wire [WIDTH-1:0] d1, d2;

    always @(posedge clk) begin
        if (reset) begin
            rd_ptr <= 0;
            wr_ptr <= 0;
        end else begin
            if (push) begin
                rd_ptr <= wr_ptr;
                wr_ptr <= wr_ptr + ADDRW'(1);
            end else if (pop) begin
                // only release the slot once its second half has been consumed
                wr_ptr <= wr_ptr - ADDRW'(is_part[rd_ptr]);
                rd_ptr <= rd_ptr - ADDRW'(is_part[rd_ptr]);
            end
        end
    end

    VX_dp_ram #(
        .DATAW  (WIDTH * 2),
        .SIZE   (DEPTH),
        .LUTRAM (1)
    ) store (
        .clk   (clk),
        .wren  (push),
        .waddr (wr_ptr),
        .wdata ({q2, q1}),
        .raddr (rd_ptr),
        .rdata ({d2, d1})
    );

    // per-entry progress flag (not reset; it is always written by the push that
    // creates the entry before it is read)
    always @(posedge clk) begin
        if (push) begin
            is_part[wr_ptr] <= ~pair;
        end else if (pop) begin
            is_part[rd_ptr] <= 1;
        end
    end

    assign index = is_part[rd_ptr];
    assign d     = index ? d1 : d2;
    assign empty = (ADDRW'(0) == wr_ptr);
    assign full  = (ADDRW'(DEPTH-1) == wr_ptr);

endmodule

View File

@@ -1,256 +0,0 @@
// VX_issue: issue stage. Instantiates the instruction buffer, the scoreboard, the
// register-file stage and the dispatcher, and wires them together: an instruction
// leaves the instruction buffer only when both the scoreboard and the dispatcher
// are ready for it. Also hosts the issue-stage performance counters and debug traces.
`include "VX_define.vh"
module VX_issue #(
parameter CORE_ID = 0
) (
`SCOPE_IO_VX_issue
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_perf_pipeline_if.issue perf_issue_if,
`endif
VX_decode_if.slave decode_if,
VX_writeback_if.slave writeback_if,
VX_alu_req_if.master alu_req_if,
VX_lsu_req_if.master lsu_req_if,
VX_csr_req_if.master csr_req_if,
`ifdef EXT_F_ENABLE
VX_fpu_req_if.master fpu_req_if,
`endif
VX_gpu_req_if.master gpu_req_if
);
// internal interfaces; scoreboard_if and dispatch_if are two views of the
// instruction at the head of ibuffer_if, each with its own valid/ready pair
VX_ibuffer_if ibuffer_if();
VX_gpr_req_if gpr_req_if();
VX_gpr_rsp_if gpr_rsp_if();
VX_writeback_if sboard_wb_if();
VX_ibuffer_if scoreboard_if();
VX_ibuffer_if dispatch_if();
// GPR request interface
assign gpr_req_if.wid = ibuffer_if.wid;
assign gpr_req_if.rs1 = ibuffer_if.rs1;
assign gpr_req_if.rs2 = ibuffer_if.rs2;
assign gpr_req_if.rs3 = ibuffer_if.rs3;
// scoreboard writeback interface
// (only the fields copied here are driven; tmask and data are not forwarded)
assign sboard_wb_if.valid = writeback_if.valid;
assign sboard_wb_if.uuid = writeback_if.uuid;
assign sboard_wb_if.wid = writeback_if.wid;
assign sboard_wb_if.PC = writeback_if.PC;
assign sboard_wb_if.rd = writeback_if.rd;
assign sboard_wb_if.eop = writeback_if.eop;
// scoreboard interface
// valid is qualified with the dispatcher's ready so the scoreboard only sees
// an instruction in a cycle where dispatch can take it
assign scoreboard_if.valid = ibuffer_if.valid && dispatch_if.ready;
assign scoreboard_if.uuid = ibuffer_if.uuid;
assign scoreboard_if.wid = ibuffer_if.wid;
assign scoreboard_if.PC = ibuffer_if.PC;
assign scoreboard_if.wb = ibuffer_if.wb;
assign scoreboard_if.rd = ibuffer_if.rd;
assign scoreboard_if.rd_n = ibuffer_if.rd_n;
assign scoreboard_if.rs1_n = ibuffer_if.rs1_n;
assign scoreboard_if.rs2_n = ibuffer_if.rs2_n;
assign scoreboard_if.rs3_n = ibuffer_if.rs3_n;
assign scoreboard_if.wid_n = ibuffer_if.wid_n;
// dispatch interface
// valid is qualified with the scoreboard's ready, mirroring the scoreboard side
assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready;
assign dispatch_if.uuid = ibuffer_if.uuid;
assign dispatch_if.wid = ibuffer_if.wid;
assign dispatch_if.tmask = ibuffer_if.tmask;
assign dispatch_if.PC = ibuffer_if.PC;
assign dispatch_if.ex_type = ibuffer_if.ex_type;
assign dispatch_if.op_type = ibuffer_if.op_type;
assign dispatch_if.op_mod = ibuffer_if.op_mod;
assign dispatch_if.wb = ibuffer_if.wb;
assign dispatch_if.rd = ibuffer_if.rd;
assign dispatch_if.rs1 = ibuffer_if.rs1;
assign dispatch_if.imm = ibuffer_if.imm;
assign dispatch_if.use_PC = ibuffer_if.use_PC;
assign dispatch_if.use_imm = ibuffer_if.use_imm;
// issue the instruction
assign ibuffer_if.ready = scoreboard_if.ready && dispatch_if.ready;
// one reset signal per sub-module, produced by the RESET_RELAY macro
`RESET_RELAY (ibuf_reset);
`RESET_RELAY (scoreboard_reset);
`RESET_RELAY (gpr_reset);
`RESET_RELAY (dispatch_reset);
VX_ibuffer #(
.CORE_ID(CORE_ID)
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
VX_scoreboard #(
.CORE_ID(CORE_ID)
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
.writeback_if(sboard_wb_if),
.ibuffer_if (scoreboard_if)
);
VX_gpr_stage #(
.CORE_ID(CORE_ID)
) gpr_stage (
.clk (clk),
.reset (gpr_reset),
.writeback_if (writeback_if),
.gpr_req_if (gpr_req_if),
.gpr_rsp_if (gpr_rsp_if)
);
VX_dispatch dispatch (
.clk (clk),
.reset (dispatch_reset),
.ibuffer_if (dispatch_if),
.gpr_rsp_if (gpr_rsp_if),
.alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if),
`ifdef EXT_F_ENABLE
.fpu_req_if (fpu_req_if),
`endif
.gpu_req_if (gpu_req_if)
);
// scope (debug probe) registration
`SCOPE_ASSIGN (issue_fire, ibuffer_if.valid && ibuffer_if.ready);
`SCOPE_ASSIGN (issue_uuid, ibuffer_if.uuid);
`SCOPE_ASSIGN (issue_tmask, ibuffer_if.tmask);
`SCOPE_ASSIGN (issue_ex_type, ibuffer_if.ex_type);
`SCOPE_ASSIGN (issue_op_type, ibuffer_if.op_type);
`SCOPE_ASSIGN (issue_op_mod, ibuffer_if.op_mod);
`SCOPE_ASSIGN (issue_wb, ibuffer_if.wb);
`SCOPE_ASSIGN (issue_rd, ibuffer_if.rd);
`SCOPE_ASSIGN (issue_rs1, ibuffer_if.rs1);
`SCOPE_ASSIGN (issue_rs2, ibuffer_if.rs2);
`SCOPE_ASSIGN (issue_rs3, ibuffer_if.rs3);
`SCOPE_ASSIGN (issue_imm, ibuffer_if.imm);
`SCOPE_ASSIGN (issue_use_pc, ibuffer_if.use_PC);
`SCOPE_ASSIGN (issue_use_imm, ibuffer_if.use_imm);
`SCOPE_ASSIGN (scoreboard_delay, !scoreboard_if.ready);
`SCOPE_ASSIGN (dispatch_delay, !dispatch_if.ready);
`SCOPE_ASSIGN (gpr_rs1, gpr_rsp_if.rs1_data);
`SCOPE_ASSIGN (gpr_rs2, gpr_rsp_if.rs2_data);
`SCOPE_ASSIGN (gpr_rs3, gpr_rsp_if.rs3_data);
`SCOPE_ASSIGN (writeback_valid, writeback_if.valid);
`SCOPE_ASSIGN (writeback_uuid, writeback_if.uuid);
`SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask);
`SCOPE_ASSIGN (writeback_rd, writeback_if.rd);
`SCOPE_ASSIGN (writeback_data, writeback_if.data);
`SCOPE_ASSIGN (writeback_eop, writeback_if.eop);
`ifdef PERF_ENABLE
// stall counters: each counts the cycles in which the corresponding interface
// has valid asserted but ready deasserted
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
reg [`PERF_CTR_BITS-1:0] perf_alu_stalls;
reg [`PERF_CTR_BITS-1:0] perf_lsu_stalls;
reg [`PERF_CTR_BITS-1:0] perf_csr_stalls;
reg [`PERF_CTR_BITS-1:0] perf_gpu_stalls;
`ifdef EXT_F_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_fpu_stalls;
`endif
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= 0;
perf_scb_stalls <= 0;
perf_alu_stalls <= 0;
perf_lsu_stalls <= 0;
perf_csr_stalls <= 0;
perf_gpu_stalls <= 0;
`ifdef EXT_F_ENABLE
perf_fpu_stalls <= 0;
`endif
end else begin
if (decode_if.valid & ~decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1;
end
if (scoreboard_if.valid & ~scoreboard_if.ready) begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1;
end
// dispatch stalls are attributed to the execution unit the stalled
// instruction targets; any type not listed explicitly is counted as GPU
if (dispatch_if.valid & ~dispatch_if.ready) begin
case (dispatch_if.ex_type)
`EX_ALU: perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1;
`ifdef EXT_F_ENABLE
`EX_FPU: perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1;
`endif
`EX_LSU: perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1;
`EX_CSR: perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1;
//`EX_GPU:
default: perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1;
endcase
end
end
end
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
assign perf_issue_if.scb_stalls = perf_scb_stalls;
assign perf_issue_if.alu_stalls = perf_alu_stalls;
assign perf_issue_if.lsu_stalls = perf_lsu_stalls;
assign perf_issue_if.csr_stalls = perf_csr_stalls;
assign perf_issue_if.gpu_stalls = perf_gpu_stalls;
`ifdef EXT_F_ENABLE
assign perf_issue_if.fpu_stalls = perf_fpu_stalls;
`endif
`endif
`ifdef DBG_TRACE_CORE_PIPELINE
// debug trace: one line per request accepted by an execution unit
always @(posedge clk) begin
if (alu_req_if.valid && alu_req_if.ready) begin
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=",
$time, CORE_ID, alu_req_if.wid, alu_req_if.PC, alu_req_if.tmask, alu_req_if.rd);
`TRACE_ARRAY1D(alu_req_if.rs1_data, `NUM_THREADS);
dpi_trace(", rs2_data=");
`TRACE_ARRAY1D(alu_req_if.rs2_data, `NUM_THREADS);
dpi_trace(" (#%0d)\n", alu_req_if.uuid);
end
if (lsu_req_if.valid && lsu_req_if.ready) begin
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, offset=%0h, addr=",
$time, CORE_ID, lsu_req_if.wid, lsu_req_if.PC, lsu_req_if.tmask, lsu_req_if.rd, lsu_req_if.offset);
`TRACE_ARRAY1D(lsu_req_if.base_addr, `NUM_THREADS);
dpi_trace(", data=");
`TRACE_ARRAY1D(lsu_req_if.store_data, `NUM_THREADS);
dpi_trace(" (#%0d)\n", lsu_req_if.uuid);
end
if (csr_req_if.valid && csr_req_if.ready) begin
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, rs1_data=",
$time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.addr);
`TRACE_ARRAY1D(csr_req_if.rs1_data, `NUM_THREADS);
dpi_trace(" (#%0d)\n", csr_req_if.uuid);
end
`ifdef EXT_F_ENABLE
if (fpu_req_if.valid && fpu_req_if.ready) begin
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rd=%0d, rs1_data=",
$time, CORE_ID, fpu_req_if.wid, fpu_req_if.PC, fpu_req_if.tmask, fpu_req_if.rd);
`TRACE_ARRAY1D(fpu_req_if.rs1_data, `NUM_THREADS);
dpi_trace(", rs2_data=");
`TRACE_ARRAY1D(fpu_req_if.rs2_data, `NUM_THREADS);
dpi_trace(", rs3_data=");
`TRACE_ARRAY1D(fpu_req_if.rs3_data, `NUM_THREADS);
dpi_trace(" (#%0d)\n", fpu_req_if.uuid);
end
`endif
if (gpu_req_if.valid && gpu_req_if.ready) begin
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rd=%0d, rs1_data=",
$time, CORE_ID, gpu_req_if.wid, gpu_req_if.PC, gpu_req_if.tmask, gpu_req_if.rd);
`TRACE_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS);
dpi_trace(", rs2_data=");
`TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS);
dpi_trace(", rs3_data=");
`TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS);
dpi_trace(" (#%0d)\n", gpu_req_if.uuid);
end
end
`endif
endmodule

View File

@@ -1,372 +0,0 @@
`include "VX_define.vh"
module VX_lsu_unit #(
parameter CORE_ID = 0
) (
`SCOPE_IO_VX_lsu_unit
input wire clk,
input wire reset,
// Dcache interface
VX_dcache_req_if.master dcache_req_if,
VX_dcache_rsp_if.slave dcache_rsp_if,
// inputs
VX_lsu_req_if.slave lsu_req_if,
// outputs
VX_commit_if.master ld_commit_if,
VX_commit_if.master st_commit_if
);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = 32 - MEM_ASHIFT;
localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE);
// The address-range checks in this module compare addresses shifted right by
// MEM_ASHIFT, so the region base addresses must be multiples of the memory block
// size. MEM_ASHIFT is the shift amount (log2 of the block size), not the block
// size itself, so it must not be used as the modulus here.
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter"))
wire req_valid;
wire [`UUID_BITS-1:0] req_uuid;
wire [`NUM_THREADS-1:0] req_tmask;
wire [`NUM_THREADS-1:0][31:0] req_addr;
wire [`INST_LSU_BITS-1:0] req_type;
wire [`NUM_THREADS-1:0][31:0] req_data;
wire [`NR_BITS-1:0] req_rd;
wire req_wb;
wire [`NW_BITS-1:0] req_wid;
wire [31:0] req_pc;
wire req_is_dup;
wire req_is_prefetch;
wire mbuf_empty;
wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type, req_addr_type;
// full address calculation
wire [`NUM_THREADS-1:0][31:0] full_addr;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign full_addr[i] = lsu_req_if.base_addr[i] + lsu_req_if.offset;
end
// detect duplicate addresses
wire [`NUM_THREADS-2:0] addr_matches;
for (genvar i = 0; i < (`NUM_THREADS-1); i++) begin
assign addr_matches[i] = (lsu_req_if.base_addr[i+1] == lsu_req_if.base_addr[0]) || ~lsu_req_if.tmask[i+1];
end
wire lsu_is_dup = lsu_req_if.tmask[0] && (& addr_matches);
for (genvar i = 0; i < `NUM_THREADS; i++) begin
// is non-cacheable address
wire is_addr_nc = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'(`IO_BASE_ADDR >> MEM_ASHIFT));
if (`SM_ENABLE) begin
// is shared memory address
wire is_addr_sm = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'((`SMEM_BASE_ADDR - `SMEM_SIZE) >> MEM_ASHIFT))
& (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT));
assign lsu_addr_type[i] = {is_addr_nc, is_addr_sm};
end else begin
assign lsu_addr_type[i] = is_addr_nc;
end
end
// fence stalls the pipeline until all pending requests are sent
wire fence_wait = lsu_req_if.is_fence && (req_valid || !mbuf_empty);
wire ready_in;
wire stall_in = ~ready_in && req_valid;
wire lsu_valid = lsu_req_if.valid && ~fence_wait;
wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
.RESETW (1)
) req_pipe_reg (
.clk (clk),
.reset (reset),
.enable (!stall_in),
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}),
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_uuid, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
);
// Can accept new request?
assign lsu_req_if.ready = ~stall_in && ~fence_wait;
wire [`UUID_BITS-1:0] rsp_uuid;
wire [`NW_BITS-1:0] rsp_wid;
wire [31:0] rsp_pc;
wire [`NR_BITS-1:0] rsp_rd;
wire rsp_wb;
wire [`INST_LSU_BITS-1:0] rsp_type;
wire rsp_is_dup;
wire rsp_is_prefetch;
reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask;
wire [`NUM_THREADS-1:0] rsp_rem_mask_n;
wire [`NUM_THREADS-1:0] rsp_tmask;
reg [`NUM_THREADS-1:0] req_sent_mask;
reg is_req_start;
wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr;
wire mbuf_full;
`UNUSED_VAR (rsp_type)
`UNUSED_VAR (rsp_is_prefetch)
wire [`NUM_THREADS-1:0][REQ_ASHIFT-1:0] req_offset, rsp_offset;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign req_offset[i] = req_addr[i][1:0];
end
wire [`NUM_THREADS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready;
wire dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;
wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1};
wire mbuf_push = ~mbuf_full
&& (| ({`NUM_THREADS{req_valid}} & req_tmask_dup & dcache_req_if.ready))
&& is_req_start // first submission only
&& req_wb; // loads only
wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n);
assign mbuf_raddr = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: `LSUQ_ADDR_BITS];
`UNUSED_VAR (dcache_rsp_if.tag)
// do not writeback from software prefetch
wire req_wb2 = req_wb && ~req_is_prefetch;
VX_index_buffer #(
.DATAW (`UUID_BITS + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1),
.SIZE (`LSUQ_SIZE)
) req_metadata (
.clk (clk),
.reset (reset),
.write_addr (mbuf_waddr),
.acquire_slot (mbuf_push),
.read_addr (mbuf_raddr),
.write_data ({req_uuid, req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}),
.read_data ({rsp_uuid, rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
.release_addr (mbuf_raddr),
.release_slot (mbuf_pop),
.full (mbuf_full),
.empty (mbuf_empty)
);
wire dcache_req_ready = &(dcache_req_if.ready | req_sent_mask | ~req_tmask_dup);
wire [`NUM_THREADS-1:0] req_sent_mask_n = req_sent_mask | dcache_req_fire;
always @(posedge clk) begin
if (reset) begin
req_sent_mask <= 0;
is_req_start <= 1;
end else begin
if (dcache_req_ready) begin
req_sent_mask <= 0;
is_req_start <= 1;
end else begin
req_sent_mask <= req_sent_mask_n;
is_req_start <= (0 == req_sent_mask_n);
end
end
end
// need to hold the acquired tag index until the full request is submitted
reg [`LSUQ_ADDR_BITS-1:0] req_tag_hold;
wire [`LSUQ_ADDR_BITS-1:0] req_tag = is_req_start ? mbuf_waddr : req_tag_hold;
always @(posedge clk) begin
if (mbuf_push) begin
req_tag_hold <= mbuf_waddr;
end
end
assign rsp_rem_mask_n = rsp_rem_mask[mbuf_raddr] & ~dcache_rsp_if.tmask;
always @(posedge clk) begin
if (mbuf_push) begin
rsp_rem_mask[mbuf_waddr] <= req_tmask_dup;
end
if (dcache_rsp_fire) begin
rsp_rem_mask[mbuf_raddr] <= rsp_rem_mask_n;
end
end
// ensure all dependencies for the requests are resolved
wire req_dep_ready = (req_wb && ~(mbuf_full && is_req_start))
|| (~req_wb && st_commit_if.ready);
// DCache Request
for (genvar i = 0; i < `NUM_THREADS; i++) begin
reg [3:0] mem_req_byteen;
reg [31:0] mem_req_data;
always @(*) begin
mem_req_byteen = {4{req_wb}};
case (`INST_LSU_WSIZE(req_type))
0: mem_req_byteen[req_offset[i]] = 1;
1: begin
mem_req_byteen[req_offset[i]] = 1;
mem_req_byteen[{req_offset[i][1], 1'b1}] = 1;
end
default : mem_req_byteen = {4{1'b1}};
endcase
end
always @(*) begin
mem_req_data = req_data[i];
case (req_offset[i])
1: mem_req_data[31:8] = req_data[i][23:0];
2: mem_req_data[31:16] = req_data[i][15:0];
3: mem_req_data[31:24] = req_data[i][7:0];
default:;
endcase
end
assign dcache_req_if.valid[i] = req_valid && req_dep_ready && req_tmask_dup[i] && !req_sent_mask[i];
assign dcache_req_if.rw[i] = ~req_wb;
assign dcache_req_if.addr[i] = req_addr[i][31:2];
assign dcache_req_if.byteen[i] = mem_req_byteen;
assign dcache_req_if.data[i] = mem_req_data;
assign dcache_req_if.tag[i] = {req_uuid, `LSU_TAG_ID_BITS'(req_tag), req_addr_type[i]};
end
assign ready_in = req_dep_ready && dcache_req_ready;
// send store commit
wire is_store_rsp = req_valid && ~req_wb && dcache_req_ready;
assign st_commit_if.valid = is_store_rsp;
assign st_commit_if.uuid = req_uuid;
assign st_commit_if.wid = req_wid;
assign st_commit_if.tmask = req_tmask;
assign st_commit_if.PC = req_pc;
assign st_commit_if.rd = 0;
assign st_commit_if.wb = 0;
assign st_commit_if.eop = 1'b1;
assign st_commit_if.data = 0;
// load response formatting
reg [`NUM_THREADS-1:0][31:0] rsp_data;
wire [`NUM_THREADS-1:0] rsp_tmask_qual;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [31:0] rsp_data32 = (i == 0 || rsp_is_dup) ? dcache_rsp_if.data[0] : dcache_rsp_if.data[i];
wire [15:0] rsp_data16 = rsp_offset[i][1] ? rsp_data32[31:16] : rsp_data32[15:0];
wire [7:0] rsp_data8 = rsp_offset[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];
always @(*) begin
case (`INST_LSU_FMT(rsp_type))
`INST_FMT_B: rsp_data[i] = 32'(signed'(rsp_data8));
`INST_FMT_H: rsp_data[i] = 32'(signed'(rsp_data16));
`INST_FMT_BU: rsp_data[i] = 32'(unsigned'(rsp_data8));
`INST_FMT_HU: rsp_data[i] = 32'(unsigned'(rsp_data16));
default: rsp_data[i] = rsp_data32;
endcase
end
end
assign rsp_tmask_qual = rsp_is_dup ? rsp_tmask : dcache_rsp_if.tmask;
// send load commit
wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid;
VX_pipe_register #(
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1),
.RESETW (1)
) rsp_pipe_reg (
.clk (clk),
.reset (reset),
.enable (!load_rsp_stall),
.data_in ({dcache_rsp_if.valid, rsp_uuid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}),
.data_out ({ld_commit_if.valid, ld_commit_if.uuid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop})
);
// Can accept new cache response?
assign dcache_rsp_if.ready = ~load_rsp_stall;
// scope registration
`SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire);
`SCOPE_ASSIGN (dcache_req_uuid, req_uuid);
`SCOPE_ASSIGN (dcache_req_addr, req_addr);
`SCOPE_ASSIGN (dcache_req_rw, ~req_wb);
`SCOPE_ASSIGN (dcache_req_byteen,dcache_req_if.byteen);
`SCOPE_ASSIGN (dcache_req_data, dcache_req_if.data);
`SCOPE_ASSIGN (dcache_req_tag, req_tag);
`SCOPE_ASSIGN (dcache_rsp_fire, dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_fire}});
`SCOPE_ASSIGN (dcache_rsp_uuid, rsp_uuid);
`SCOPE_ASSIGN (dcache_rsp_data, dcache_rsp_if.data);
`SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr);
`ifndef SYNTHESIS
    // Simulation-only response watchdog.
    //
    // One entry per LSUQ slot. An entry is written when mbuf_push fires (at
    // mbuf_waddr) and cleared when mbuf_pop fires (at mbuf_raddr). Every cycle
    // all valid entries are checked and an assertion fires if an entry has been
    // pending for delay_timeout time units or more.
    //
    // Entry layout, MSB to LSB:
    //   { wid[`NW_BITS], PC[32], rd[`NR_BITS], uuid[`UUID_BITS], push $time[64], valid[1] }
    reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + `UUID_BITS + 64 + 1)-1:0] pending_reqs;

    // NOTE(review): (1 ** n) is always 1, so the timeout is 10000 regardless of
    // L2_ENABLE / L3_ENABLE. The base was presumably meant to be 2 - confirm
    // with the owner before changing, since it loosens the assertion.
    wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));

    always @(posedge clk) begin
        if (reset) begin
            pending_reqs <= '0;
        end else begin
            // Fix: this branch used to be an unconditional block ("end begin"),
            // so a push during reset overrode the clear for that entry.
            if (mbuf_push) begin
                pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, req_uuid, $time, 1'b1};
            end
            if (mbuf_pop) begin
                pending_reqs[mbuf_raddr] <= '0;
            end
        end

        for (integer i = 0; i < `LSUQ_SIZE; ++i) begin
            if (pending_reqs[i][0]) begin
                `ASSERT(($time - pending_reqs[i][1 +: 64]) < delay_timeout,
                    ("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d (#%0d)",
                        $time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+`UUID_BITS+`NR_BITS+32 +: `NW_BITS],
                        pending_reqs[i][1+64+`UUID_BITS+`NR_BITS +: 32],
                        pending_reqs[i][1+64+`UUID_BITS +: `NR_BITS],
                        pending_reqs[i][1+64 +: `UUID_BITS]));
            end
        end
    end
`endif
`ifdef DBG_TRACE_CORE_DCACHE
wire dcache_req_fire_any = (| dcache_req_fire);
always @(posedge clk) begin
if (lsu_req_if.valid && fence_wait) begin
dpi_trace("%d: *** D$%0d fence wait\n", $time, CORE_ID);
end
if (dcache_req_fire_any) begin
if (dcache_req_if.rw[0]) begin
dpi_trace("%d: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
`TRACE_ARRAY1D(req_addr, `NUM_THREADS);
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
dpi_trace(", data=");
`TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
dpi_trace(", (#%0d)\n", req_uuid);
end else begin
dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire);
`TRACE_ARRAY1D(req_addr, `NUM_THREADS);
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
dpi_trace(", rd=%0d, is_dup=%b (#%0d)\n", req_rd, req_is_dup, req_uuid);
end
end
if (dcache_rsp_fire) begin
dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
$time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd);
`TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
dpi_trace(", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid);
end
end
`endif
endmodule

View File

@@ -1,146 +0,0 @@
`include "VX_define.vh"
// Memory request arbiter and response demultiplexer.
//
// NUM_REQS request channels are merged into one output channel through
// VX_stream_arbiter. The index of the issuing input is inserted into the
// outgoing tag at bit position TAG_SEL_IDX (VX_bits_insert), which widens the
// tag by LOG_NUM_REQS bits. On the way back, that index is read from the
// response tag to select the destination (VX_stream_demux) and is removed
// from the tag (VX_bits_remove) before delivery.
//
// With NUM_REQS == 1 the module degenerates to straight wiring.
module VX_mem_arb #(
    parameter NUM_REQS      = 1,    // number of request inputs
    parameter DATA_WIDTH    = 1,    // data width in bits
    parameter ADDR_WIDTH    = 1,    // address width in bits
    parameter TAG_IN_WIDTH  = 1,    // tag width on the input side
    parameter TAG_SEL_IDX   = 0,    // position of the source index in the output tag
    parameter BUFFERED_REQ  = 0,    // forwarded to VX_stream_arbiter (BUFFERED)
    parameter BUFFERED_RSP  = 0,    // forwarded to VX_stream_demux (BUFFERED)
    parameter TYPE          = "P",  // forwarded to VX_stream_arbiter (TYPE)

    parameter DATA_SIZE     = (DATA_WIDTH / 8),
    parameter LOG_NUM_REQS  = `CLOG2(NUM_REQS),
    parameter TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS
) (
    input wire clk,
    input wire reset,

    // input requests
    input wire [NUM_REQS-1:0]                       req_valid_in,
    input wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0]     req_tag_in,
    input wire [NUM_REQS-1:0][ADDR_WIDTH-1:0]       req_addr_in,
    input wire [NUM_REQS-1:0]                       req_rw_in,
    input wire [NUM_REQS-1:0][DATA_SIZE-1:0]        req_byteen_in,
    input wire [NUM_REQS-1:0][DATA_WIDTH-1:0]       req_data_in,
    output wire [NUM_REQS-1:0]                      req_ready_in,

    // output request
    output wire                                     req_valid_out,
    output wire [TAG_OUT_WIDTH-1:0]                 req_tag_out,
    output wire [ADDR_WIDTH-1:0]                    req_addr_out,
    output wire                                     req_rw_out,
    output wire [DATA_SIZE-1:0]                     req_byteen_out,
    output wire [DATA_WIDTH-1:0]                    req_data_out,
    input wire                                      req_ready_out,

    // input response
    input wire                                      rsp_valid_in,
    input wire [TAG_OUT_WIDTH-1:0]                  rsp_tag_in,
    input wire [DATA_WIDTH-1:0]                     rsp_data_in,
    output wire                                     rsp_ready_in,

    // output responses
    output wire [NUM_REQS-1:0]                      rsp_valid_out,
    output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0]    rsp_tag_out,
    output wire [NUM_REQS-1:0][DATA_WIDTH-1:0]      rsp_data_out,
    input wire [NUM_REQS-1:0]                       rsp_ready_out
);
    // Packed widths of one request / one response travelling through the
    // stream primitives.
    localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
    localparam RSP_DATAW = TAG_IN_WIDTH + DATA_WIDTH;

    if (NUM_REQS > 1) begin

        // ------------------------- request path -------------------------

        wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_bundle_in;
        wire [REQ_DATAW-1:0]               req_bundle_out;

        for (genvar r = 0; r < NUM_REQS; r++) begin
            // input tag with the source index r spliced in at TAG_SEL_IDX
            wire [TAG_OUT_WIDTH-1:0] tag_with_sel;

            VX_bits_insert #(
                .N   (TAG_IN_WIDTH),
                .S   (LOG_NUM_REQS),
                .POS (TAG_SEL_IDX)
            ) bits_insert (
                .data_in  (req_tag_in[r]),
                .sel_in   (LOG_NUM_REQS'(r)),
                .data_out (tag_with_sel)
            );

            assign req_bundle_in[r] = {tag_with_sel, req_addr_in[r], req_rw_in[r], req_byteen_in[r], req_data_in[r]};
        end

        VX_stream_arbiter #(
            .NUM_REQS (NUM_REQS),
            .DATAW    (REQ_DATAW),
            .BUFFERED (BUFFERED_REQ),
            .TYPE     (TYPE)
        ) req_arb (
            .clk       (clk),
            .reset     (reset),
            .valid_in  (req_valid_in),
            .data_in   (req_bundle_in),
            .ready_in  (req_ready_in),
            .valid_out (req_valid_out),
            .data_out  (req_bundle_out),
            .ready_out (req_ready_out)
        );

        // unpack in the same field order used when packing above
        assign {req_tag_out, req_addr_out, req_rw_out, req_byteen_out, req_data_out} = req_bundle_out;

        // ------------------------- response path ------------------------

        wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_bundle_out;
        wire [RSP_DATAW-1:0]               rsp_bundle_in;

        // destination input, recovered from the bits inserted on the request path
        wire [LOG_NUM_REQS-1:0] rsp_dest = rsp_tag_in[TAG_SEL_IDX +: LOG_NUM_REQS];

        // response tag with the source-index bits removed
        wire [TAG_IN_WIDTH-1:0] tag_without_sel;

        VX_bits_remove #(
            .N   (TAG_OUT_WIDTH),
            .S   (LOG_NUM_REQS),
            .POS (TAG_SEL_IDX)
        ) bits_remove (
            .data_in  (rsp_tag_in),
            .data_out (tag_without_sel)
        );

        assign rsp_bundle_in = {tag_without_sel, rsp_data_in};

        VX_stream_demux #(
            .NUM_REQS (NUM_REQS),
            .DATAW    (RSP_DATAW),
            .BUFFERED (BUFFERED_RSP)
        ) rsp_demux (
            .clk       (clk),
            .reset     (reset),
            .sel_in    (rsp_dest),
            .valid_in  (rsp_valid_in),
            .data_in   (rsp_bundle_in),
            .ready_in  (rsp_ready_in),
            .valid_out (rsp_valid_out),
            .data_out  (rsp_bundle_out),
            .ready_out (rsp_ready_out)
        );

        for (genvar r = 0; r < NUM_REQS; r++) begin
            assign {rsp_tag_out[r], rsp_data_out[r]} = rsp_bundle_out[r];
        end

    end else begin

        // Single requester: no arbitration and no tag rewriting.
        `UNUSED_VAR (clk)
        `UNUSED_VAR (reset)

        // request: input -> output
        assign req_valid_out  = req_valid_in;
        assign req_tag_out    = req_tag_in;
        assign req_addr_out   = req_addr_in;
        assign req_rw_out     = req_rw_in;
        assign req_byteen_out = req_byteen_in;
        assign req_data_out   = req_data_in;
        assign req_ready_in   = req_ready_out;

        // response: memory -> requester
        assign rsp_valid_out  = rsp_valid_in;
        assign rsp_tag_out    = rsp_tag_in;
        assign rsp_data_out   = rsp_data_in;
        assign rsp_ready_in   = rsp_ready_out;

    end

endmodule

View File

@@ -1,420 +0,0 @@
`include "VX_define.vh"
// Per-core memory subsystem.
//
// Instantiates the instruction cache and the data cache (both VX_cache), and
// when `SM_ENABLE is set, a shared memory (VX_shared_mem) fronted by a
// VX_smem_arb that splits core data requests between the data cache and the
// shared memory. The memory-side ports of the two caches are merged onto the
// single mem_req_if / mem_rsp_if pair by a VX_mem_arb.
//
// With PERF_ENABLE defined, cache counters are forwarded to perf_memsys_if and
// memory read/write/latency counters are maintained locally.
module VX_mem_unit #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_VX_mem_unit

    input wire              clk,
    input wire              reset,

`ifdef PERF_ENABLE
    VX_perf_memsys_if.master perf_memsys_if,
`endif

    // Core <-> Dcache
    VX_dcache_req_if.slave  dcache_req_if,
    VX_dcache_rsp_if.master dcache_rsp_if,

    // Core <-> Icache
    VX_icache_req_if.slave  icache_req_if,
    VX_icache_rsp_if.master icache_rsp_if,

    // Memory
    VX_mem_req_if.master    mem_req_if,
    VX_mem_rsp_if.slave     mem_rsp_if
);

`ifdef PERF_ENABLE
    VX_perf_cache_if perf_icache_if(), perf_dcache_if(), perf_smem_if();
`endif

    // ----------------------------------------------------------------------
    // Internal interfaces
    // ----------------------------------------------------------------------

    // I-cache <-> memory arbiter
    VX_mem_req_if #(
        .DATA_WIDTH (`ICACHE_MEM_DATA_WIDTH),
        .ADDR_WIDTH (`ICACHE_MEM_ADDR_WIDTH),
        .TAG_WIDTH  (`ICACHE_MEM_TAG_WIDTH)
    ) icache_mem_req_if();

    VX_mem_rsp_if #(
        .DATA_WIDTH (`ICACHE_MEM_DATA_WIDTH),
        .TAG_WIDTH  (`ICACHE_MEM_TAG_WIDTH)
    ) icache_mem_rsp_if();

    // D-cache <-> memory arbiter
    VX_mem_req_if #(
        .DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
        .ADDR_WIDTH (`DCACHE_MEM_ADDR_WIDTH),
        .TAG_WIDTH  (`DCACHE_MEM_TAG_WIDTH)
    ) dcache_mem_req_if();

    VX_mem_rsp_if #(
        .DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
        .TAG_WIDTH  (`DCACHE_MEM_TAG_WIDTH)
    ) dcache_mem_rsp_if();

    // Core-side D-cache ports. The tag is `SM_ENABLE bits narrower than the
    // core tag; that bit is consumed by the shared-memory arbiter when present.
    VX_dcache_req_if #(
        .NUM_REQS  (`DCACHE_NUM_REQS),
        .WORD_SIZE (`DCACHE_WORD_SIZE),
        .TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
    ) dcache_req_tmp_if();

    VX_dcache_rsp_if #(
        .NUM_REQS  (`DCACHE_NUM_REQS),
        .WORD_SIZE (`DCACHE_WORD_SIZE),
        .TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
    ) dcache_rsp_tmp_if();

    `RESET_RELAY (icache_reset);
    `RESET_RELAY (dcache_reset);
    `RESET_RELAY (mem_arb_reset);

    // ----------------------------------------------------------------------
    // Instruction cache (single bank, single requester, read-only)
    // ----------------------------------------------------------------------

    VX_cache #(
        .CACHE_ID           (`ICACHE_ID),
        .CACHE_SIZE         (`ICACHE_SIZE),
        .CACHE_LINE_SIZE    (`ICACHE_LINE_SIZE),
        .NUM_BANKS          (1),
        .WORD_SIZE          (`ICACHE_WORD_SIZE),
        .NUM_REQS           (1),
        .CREQ_SIZE          (`ICACHE_CREQ_SIZE),
        .CRSQ_SIZE          (`ICACHE_CRSQ_SIZE),
        .MSHR_SIZE          (`ICACHE_MSHR_SIZE),
        .MRSQ_SIZE          (`ICACHE_MRSQ_SIZE),
        .MREQ_SIZE          (`ICACHE_MREQ_SIZE),
        .WRITE_ENABLE       (0),
        .CORE_TAG_WIDTH     (`ICACHE_CORE_TAG_WIDTH),
        .CORE_TAG_ID_BITS   (`ICACHE_CORE_TAG_ID_BITS),
        .MEM_TAG_WIDTH      (`ICACHE_MEM_TAG_WIDTH)
    ) icache (
        `SCOPE_BIND_VX_mem_unit_icache

        .clk                (clk),
        .reset              (icache_reset),

        // Core request (fetch only: rw, byteen and data are tied off)
        .core_req_valid     (icache_req_if.valid),
        .core_req_rw        (1'b0),
        .core_req_byteen    ('b0),
        .core_req_addr      (icache_req_if.addr),
        .core_req_data      ('x),
        .core_req_tag       (icache_req_if.tag),
        .core_req_ready     (icache_req_if.ready),

        // Core response
        .core_rsp_valid     (icache_rsp_if.valid),
        .core_rsp_data      (icache_rsp_if.data),
        .core_rsp_tag       (icache_rsp_if.tag),
        .core_rsp_ready     (icache_rsp_if.ready),
        `UNUSED_PIN (core_rsp_tmask),

    `ifdef PERF_ENABLE
        .perf_cache_if      (perf_icache_if),
    `endif

        // Memory request
        .mem_req_valid      (icache_mem_req_if.valid),
        .mem_req_rw         (icache_mem_req_if.rw),
        .mem_req_byteen     (icache_mem_req_if.byteen),
        .mem_req_addr       (icache_mem_req_if.addr),
        .mem_req_data       (icache_mem_req_if.data),
        .mem_req_tag        (icache_mem_req_if.tag),
        .mem_req_ready      (icache_mem_req_if.ready),

        // Memory response
        .mem_rsp_valid      (icache_mem_rsp_if.valid),
        .mem_rsp_data       (icache_mem_rsp_if.data),
        .mem_rsp_tag        (icache_mem_rsp_if.tag),
        .mem_rsp_ready      (icache_mem_rsp_if.ready)
    );

    // ----------------------------------------------------------------------
    // Data cache (writable, NC_ENABLE set)
    // ----------------------------------------------------------------------

    VX_cache #(
        .CACHE_ID           (`DCACHE_ID),
        .CACHE_SIZE         (`DCACHE_SIZE),
        .CACHE_LINE_SIZE    (`DCACHE_LINE_SIZE),
        .NUM_BANKS          (`DCACHE_NUM_BANKS),
        .NUM_PORTS          (`DCACHE_NUM_PORTS),
        .WORD_SIZE          (`DCACHE_WORD_SIZE),
        .NUM_REQS           (`DCACHE_NUM_REQS),
        .CREQ_SIZE          (`DCACHE_CREQ_SIZE),
        .CRSQ_SIZE          (`DCACHE_CRSQ_SIZE),
        .MSHR_SIZE          (`DCACHE_MSHR_SIZE),
        .MRSQ_SIZE          (`DCACHE_MRSQ_SIZE),
        .MREQ_SIZE          (`DCACHE_MREQ_SIZE),
        .WRITE_ENABLE       (1),
        .CORE_TAG_WIDTH     (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE),
        .CORE_TAG_ID_BITS   (`DCACHE_CORE_TAG_ID_BITS-`SM_ENABLE),
        .MEM_TAG_WIDTH      (`DCACHE_MEM_TAG_WIDTH),
        .NC_ENABLE          (1)
    ) dcache (
        `SCOPE_BIND_VX_mem_unit_dcache

        .clk                (clk),
        .reset              (dcache_reset),

        // Core request
        .core_req_valid     (dcache_req_tmp_if.valid),
        .core_req_rw        (dcache_req_tmp_if.rw),
        .core_req_byteen    (dcache_req_tmp_if.byteen),
        .core_req_addr      (dcache_req_tmp_if.addr),
        .core_req_data      (dcache_req_tmp_if.data),
        .core_req_tag       (dcache_req_tmp_if.tag),
        .core_req_ready     (dcache_req_tmp_if.ready),

        // Core response
        .core_rsp_valid     (dcache_rsp_tmp_if.valid),
        .core_rsp_tmask     (dcache_rsp_tmp_if.tmask),
        .core_rsp_data      (dcache_rsp_tmp_if.data),
        .core_rsp_tag       (dcache_rsp_tmp_if.tag),
        .core_rsp_ready     (dcache_rsp_tmp_if.ready),

    `ifdef PERF_ENABLE
        .perf_cache_if      (perf_dcache_if),
    `endif

        // Memory request
        .mem_req_valid      (dcache_mem_req_if.valid),
        .mem_req_rw         (dcache_mem_req_if.rw),
        .mem_req_byteen     (dcache_mem_req_if.byteen),
        .mem_req_addr       (dcache_mem_req_if.addr),
        .mem_req_data       (dcache_mem_req_if.data),
        .mem_req_tag        (dcache_mem_req_if.tag),
        .mem_req_ready      (dcache_mem_req_if.ready),

        // Memory response
        .mem_rsp_valid      (dcache_mem_rsp_if.valid),
        .mem_rsp_data       (dcache_mem_rsp_if.data),
        .mem_rsp_tag        (dcache_mem_rsp_if.tag),
        .mem_rsp_ready      (dcache_mem_rsp_if.ready)
    );

    // ----------------------------------------------------------------------
    // Core data path: shared memory split, or plain skid buffering
    // ----------------------------------------------------------------------

    if (`SM_ENABLE) begin

        VX_dcache_req_if #(
            .NUM_REQS  (`DCACHE_NUM_REQS),
            .WORD_SIZE (`DCACHE_WORD_SIZE),
            .TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
        ) smem_req_if();

        VX_dcache_rsp_if #(
            .NUM_REQS  (`DCACHE_NUM_REQS),
            .WORD_SIZE (`DCACHE_WORD_SIZE),
            .TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
        ) smem_rsp_if();

        `RESET_RELAY (smem_arb_reset);
        `RESET_RELAY (smem_reset);

        // Output index 0 is the data cache, index 1 the shared memory
        // (concatenations below list the shared memory first, i.e. as MSBs).
        VX_smem_arb #(
            .NUM_REQS     (2),
            .LANES        (`NUM_THREADS),
            .DATA_SIZE    (4),
            .TAG_IN_WIDTH (`DCACHE_CORE_TAG_WIDTH),
            .TAG_SEL_IDX  (0), // SM flag
            .TYPE         ("P"),
            .BUFFERED_REQ (2),
            .BUFFERED_RSP (1)
        ) smem_arb (
            .clk            (clk),
            .reset          (smem_arb_reset),

            // input request
            .req_valid_in   (dcache_req_if.valid),
            .req_rw_in      (dcache_req_if.rw),
            .req_byteen_in  (dcache_req_if.byteen),
            .req_addr_in    (dcache_req_if.addr),
            .req_data_in    (dcache_req_if.data),
            .req_tag_in     (dcache_req_if.tag),
            .req_ready_in   (dcache_req_if.ready),

            // output requests
            .req_valid_out  ({smem_req_if.valid,  dcache_req_tmp_if.valid}),
            .req_rw_out     ({smem_req_if.rw,     dcache_req_tmp_if.rw}),
            .req_byteen_out ({smem_req_if.byteen, dcache_req_tmp_if.byteen}),
            .req_addr_out   ({smem_req_if.addr,   dcache_req_tmp_if.addr}),
            .req_data_out   ({smem_req_if.data,   dcache_req_tmp_if.data}),
            .req_tag_out    ({smem_req_if.tag,    dcache_req_tmp_if.tag}),
            .req_ready_out  ({smem_req_if.ready,  dcache_req_tmp_if.ready}),

            // input responses
            .rsp_valid_in   ({smem_rsp_if.valid,  dcache_rsp_tmp_if.valid}),
            .rsp_tmask_in   ({smem_rsp_if.tmask,  dcache_rsp_tmp_if.tmask}),
            .rsp_data_in    ({smem_rsp_if.data,   dcache_rsp_tmp_if.data}),
            .rsp_tag_in     ({smem_rsp_if.tag,    dcache_rsp_tmp_if.tag}),
            .rsp_ready_in   ({smem_rsp_if.ready,  dcache_rsp_tmp_if.ready}),

            // output response
            .rsp_valid_out  (dcache_rsp_if.valid),
            .rsp_tmask_out  (dcache_rsp_if.tmask),
            .rsp_tag_out    (dcache_rsp_if.tag),
            .rsp_data_out   (dcache_rsp_if.data),
            .rsp_ready_out  (dcache_rsp_if.ready)
        );

        VX_shared_mem #(
            .CACHE_ID           (`SMEM_ID),
            .CACHE_SIZE         (`SMEM_SIZE),
            .NUM_BANKS          (`SMEM_NUM_BANKS),
            .WORD_SIZE          (`SMEM_WORD_SIZE),
            .NUM_REQS           (`SMEM_NUM_REQS),
            .CREQ_SIZE          (`SMEM_CREQ_SIZE),
            .CRSQ_SIZE          (`SMEM_CRSQ_SIZE),
            .CORE_TAG_WIDTH     (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE),
            .CORE_TAG_ID_BITS   (`DCACHE_CORE_TAG_ID_BITS-`SM_ENABLE),
            .BANK_ADDR_OFFSET   (`SMEM_BANK_ADDR_OFFSET)
        ) smem (
            .clk                (clk),
            .reset              (smem_reset),

        `ifdef PERF_ENABLE
            .perf_cache_if      (perf_smem_if),
        `endif

            // Core request
            .core_req_valid     (smem_req_if.valid),
            .core_req_rw        (smem_req_if.rw),
            .core_req_byteen    (smem_req_if.byteen),
            .core_req_addr      (smem_req_if.addr),
            .core_req_data      (smem_req_if.data),
            .core_req_tag       (smem_req_if.tag),
            .core_req_ready     (smem_req_if.ready),

            // Core response
            .core_rsp_valid     (smem_rsp_if.valid),
            .core_rsp_tmask     (smem_rsp_if.tmask),
            .core_rsp_data      (smem_rsp_if.data),
            .core_rsp_tag       (smem_rsp_if.tag),
            .core_rsp_ready     (smem_rsp_if.ready)
        );

    end else begin

        // core to D-cache request: one skid buffer per request lane
        for (genvar lane = 0; lane < `DCACHE_NUM_REQS; ++lane) begin
            VX_skid_buffer #(
                .DATAW ((32-`CLOG2(`DCACHE_WORD_SIZE)) + 1 + `DCACHE_WORD_SIZE + (8*`DCACHE_WORD_SIZE) + `DCACHE_CORE_TAG_WIDTH)
            ) req_buf (
                .clk       (clk),
                .reset     (reset),
                .valid_in  (dcache_req_if.valid[lane]),
                .data_in   ({dcache_req_if.addr[lane], dcache_req_if.rw[lane], dcache_req_if.byteen[lane], dcache_req_if.data[lane], dcache_req_if.tag[lane]}),
                .ready_in  (dcache_req_if.ready[lane]),
                .valid_out (dcache_req_tmp_if.valid[lane]),
                .data_out  ({dcache_req_tmp_if.addr[lane], dcache_req_tmp_if.rw[lane], dcache_req_tmp_if.byteen[lane], dcache_req_tmp_if.data[lane], dcache_req_tmp_if.tag[lane]}),
                .ready_out (dcache_req_tmp_if.ready[lane])
            );
        end

        // D-cache to core response: passed straight through
        assign dcache_rsp_if.valid     = dcache_rsp_tmp_if.valid;
        assign dcache_rsp_if.tmask     = dcache_rsp_tmp_if.tmask;
        assign dcache_rsp_if.tag       = dcache_rsp_tmp_if.tag;
        assign dcache_rsp_if.data      = dcache_rsp_tmp_if.data;
        assign dcache_rsp_tmp_if.ready = dcache_rsp_if.ready;

    end

    // ----------------------------------------------------------------------
    // Memory-side arbitration (index 0 = I-cache, index 1 = D-cache)
    // ----------------------------------------------------------------------

    // The arbiter uses one tag width for both inputs, so the I-cache tag is
    // cast to the D-cache memory tag width on the way in and sliced back to
    // its own width on the way out.
    wire [`DCACHE_MEM_TAG_WIDTH-1:0] icache_req_tag_w = `DCACHE_MEM_TAG_WIDTH'(icache_mem_req_if.tag);
    wire [`DCACHE_MEM_TAG_WIDTH-1:0] icache_rsp_tag_w;

    assign icache_mem_rsp_if.tag = icache_rsp_tag_w[`ICACHE_MEM_TAG_WIDTH-1:0];
    `UNUSED_VAR (icache_rsp_tag_w)

    VX_mem_arb #(
        .NUM_REQS      (2),
        .DATA_WIDTH    (`DCACHE_MEM_DATA_WIDTH),
        .ADDR_WIDTH    (`DCACHE_MEM_ADDR_WIDTH),
        .TAG_IN_WIDTH  (`DCACHE_MEM_TAG_WIDTH),
        .TYPE          ("R"),
        .TAG_SEL_IDX   (1), // Skip 0 for NC flag
        .BUFFERED_REQ  (1),
        .BUFFERED_RSP  (2)
    ) mem_arb (
        .clk            (clk),
        .reset          (mem_arb_reset),

        // Source request
        .req_valid_in   ({dcache_mem_req_if.valid,  icache_mem_req_if.valid}),
        .req_rw_in      ({dcache_mem_req_if.rw,     icache_mem_req_if.rw}),
        .req_byteen_in  ({dcache_mem_req_if.byteen, icache_mem_req_if.byteen}),
        .req_addr_in    ({dcache_mem_req_if.addr,   icache_mem_req_if.addr}),
        .req_data_in    ({dcache_mem_req_if.data,   icache_mem_req_if.data}),
        .req_tag_in     ({dcache_mem_req_if.tag,    icache_req_tag_w}),
        .req_ready_in   ({dcache_mem_req_if.ready,  icache_mem_req_if.ready}),

        // Memory request
        .req_valid_out  (mem_req_if.valid),
        .req_rw_out     (mem_req_if.rw),
        .req_byteen_out (mem_req_if.byteen),
        .req_addr_out   (mem_req_if.addr),
        .req_data_out   (mem_req_if.data),
        .req_tag_out    (mem_req_if.tag),
        .req_ready_out  (mem_req_if.ready),

        // Source response
        .rsp_valid_out  ({dcache_mem_rsp_if.valid, icache_mem_rsp_if.valid}),
        .rsp_data_out   ({dcache_mem_rsp_if.data,  icache_mem_rsp_if.data}),
        .rsp_tag_out    ({dcache_mem_rsp_if.tag,   icache_rsp_tag_w}),
        .rsp_ready_out  ({dcache_mem_rsp_if.ready, icache_mem_rsp_if.ready}),

        // Memory response
        .rsp_valid_in   (mem_rsp_if.valid),
        .rsp_tag_in     (mem_rsp_if.tag),
        .rsp_data_in    (mem_rsp_if.data),
        .rsp_ready_in   (mem_rsp_if.ready)
    );

    // ----------------------------------------------------------------------
    // Performance counters
    // ----------------------------------------------------------------------

`ifdef PERF_ENABLE

    `UNUSED_VAR (perf_dcache_if.mem_stalls)
    `UNUSED_VAR (perf_dcache_if.crsp_stalls)

    assign perf_memsys_if.icache_reads       = perf_icache_if.reads;
    assign perf_memsys_if.icache_read_misses = perf_icache_if.read_misses;
    assign perf_memsys_if.dcache_reads       = perf_dcache_if.reads;
    assign perf_memsys_if.dcache_writes      = perf_dcache_if.writes;
    assign perf_memsys_if.dcache_read_misses = perf_dcache_if.read_misses;
    assign perf_memsys_if.dcache_write_misses= perf_dcache_if.write_misses;
    assign perf_memsys_if.dcache_bank_stalls = perf_dcache_if.bank_stalls;
    assign perf_memsys_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls;

    if (`SM_ENABLE) begin
        assign perf_memsys_if.smem_reads       = perf_smem_if.reads;
        assign perf_memsys_if.smem_writes      = perf_smem_if.writes;
        assign perf_memsys_if.smem_bank_stalls = perf_smem_if.bank_stalls;
    end else begin
        assign perf_memsys_if.smem_reads       = 0;
        assign perf_memsys_if.smem_writes      = 0;
        assign perf_memsys_if.smem_bank_stalls = 0;
    end

    // Handshake events on the external memory interface.
    wire mem_rd_req_fire = mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw;
    wire mem_wr_req_fire = mem_req_if.valid && mem_req_if.ready && mem_req_if.rw;
    wire mem_rsp_fire    = mem_rsp_if.valid && mem_rsp_if.ready;

    // Outstanding reads: +1 on a read request alone, -1 on a response alone,
    // unchanged when both or neither fire in the same cycle.
    reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;

    always @(posedge clk) begin
        if (reset) begin
            perf_mem_pending_reads <= 0;
        end else begin
            perf_mem_pending_reads <= perf_mem_pending_reads +
                `PERF_CTR_BITS'($signed(2'(mem_rd_req_fire && !mem_rsp_fire) -
                                        2'(mem_rsp_fire && !mem_rd_req_fire)));
        end
    end

    reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
    reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
    reg [`PERF_CTR_BITS-1:0] perf_mem_lat;

    always @(posedge clk) begin
        if (reset) begin
            perf_mem_reads  <= 0;
            perf_mem_writes <= 0;
            perf_mem_lat    <= 0;
        end else begin
            if (mem_rd_req_fire) begin
                perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'd1;
            end
            if (mem_wr_req_fire) begin
                perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'd1;
            end
            // accumulates the number of outstanding reads every cycle
            perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads;
        end
    end

    assign perf_memsys_if.mem_reads   = perf_mem_reads;
    assign perf_memsys_if.mem_writes  = perf_mem_writes;
    assign perf_memsys_if.mem_latency = perf_mem_lat;

`endif

endmodule

View File

@@ -1,226 +0,0 @@
`include "VX_define.vh"
// Integer multiply / divide unit.
//
// A request (valid_in/ready_in) is routed to the multiplier path or the
// divider path according to `INST_MUL_IS_DIV(alu_op). Both paths carry the
// request metadata (uuid, wid, tmask, PC, rd, wb) alongside the data so the
// result can be committed without external bookkeeping. The two paths share
// one registered output stage; when both have a result in the same cycle the
// multiplier result is taken and the divider is held off.
//
// Build-time options:
//   IMUL_DPI - compute products through the dpi_imul DPI call
//   IDIV_DPI - compute quotients/remainders through the dpi_idiv DPI call
module VX_muldiv (
    input wire clk,
    input wire reset,

    // Inputs
    input wire [`INST_MUL_BITS-1:0]     alu_op,
    input wire [`UUID_BITS-1:0]         uuid_in,
    input wire [`NW_BITS-1:0]           wid_in,
    input wire [`NUM_THREADS-1:0]       tmask_in,
    input wire [31:0]                   PC_in,
    input wire [`NR_BITS-1:0]           rd_in,
    input wire                          wb_in,
    input wire [`NUM_THREADS-1:0][31:0] alu_in1,
    input wire [`NUM_THREADS-1:0][31:0] alu_in2,

    // Outputs
    output wire [`UUID_BITS-1:0]        uuid_out,
    output wire [`NW_BITS-1:0]          wid_out,
    output wire [`NUM_THREADS-1:0]      tmask_out,
    output wire [31:0]                  PC_out,
    output wire [`NR_BITS-1:0]          rd_out,
    output wire                         wb_out,
    output wire [`NUM_THREADS-1:0][31:0] data_out,

    // handshake
    input wire  valid_in,
    output wire ready_in,
    output wire valid_out,
    input wire  ready_out
);
    wire is_div_op = `INST_MUL_IS_DIV(alu_op);

    // ------------------------------ multiplier ----------------------------

    wire [`NUM_THREADS-1:0][31:0] mul_result;
    wire [`UUID_BITS-1:0]         mul_uuid_out;
    wire [`NW_BITS-1:0]           mul_wid_out;
    wire [`NUM_THREADS-1:0]       mul_tmask_out;
    wire [31:0]                   mul_PC_out;
    wire [`NR_BITS-1:0]           mul_rd_out;
    wire                          mul_wb_out;

    wire stall_out;

    wire mul_valid_out;
    wire mul_valid_in = valid_in && !is_div_op;
    // the multiplier pipeline advances unless its head is blocked by the output stage
    wire mul_ready_in = ~stall_out || ~mul_valid_out;

    // Every opcode except MUL returns the upper 32 bits of the product.
    wire is_mulh_in      = (alu_op != `INST_MUL_MUL);
    // Operand A is unsigned only for MULHU; operand B is unsigned for MULHU and MULHSU.
    wire is_signed_mul_a = (alu_op != `INST_MUL_MULHU);
    wire is_signed_mul_b = (alu_op != `INST_MUL_MULHU && alu_op != `INST_MUL_MULHSU);

`ifdef IMUL_DPI

    wire [`NUM_THREADS-1:0][31:0] mul_result_tmp;

    wire mul_fire_in = mul_valid_in && mul_ready_in;

    for (genvar i = 0; i < `NUM_THREADS; i++) begin
        wire [31:0] mul_resultl, mul_resulth;
        always @(*) begin
            dpi_imul (mul_fire_in, alu_in1[i], alu_in2[i], is_signed_mul_a, is_signed_mul_b, mul_resultl, mul_resulth);
        end
        assign mul_result_tmp[i] = is_mulh_in ? mul_resulth : mul_resultl;
    end

    // The DPI result is available immediately; delay it together with the
    // metadata by LATENCY_IMUL stages.
    VX_shift_register #(
        .DATAW  (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
        .DEPTH  (`LATENCY_IMUL),
        .RESETW (1)
    ) mul_shift_reg (
        .clk(clk),
        .reset    (reset),
        .enable   (mul_ready_in),
        .data_in  ({mul_valid_in,  uuid_in,      wid_in,      tmask_in,      PC_in,      rd_in,      wb_in,      mul_result_tmp}),
        .data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result})
    );

`else

    wire is_mulh_out;

    for (genvar i = 0; i < `NUM_THREADS; i++) begin
        // Extend both operands to 33 bits so a single signed multiplier
        // covers the signed, unsigned and mixed variants.
        wire [32:0] mul_in1 = {is_signed_mul_a & alu_in1[i][31], alu_in1[i]};
        wire [32:0] mul_in2 = {is_signed_mul_b & alu_in2[i][31], alu_in2[i]};
    `IGNORE_UNUSED_BEGIN
        wire [65:0] mul_result_tmp;
    `IGNORE_UNUSED_END

        VX_multiplier #(
            .WIDTHA  (33),
            .WIDTHB  (33),
            .WIDTHP  (66),
            .SIGNED  (1),
            .LATENCY (`LATENCY_IMUL)
        ) multiplier (
            .clk    (clk),
            .enable (mul_ready_in),
            .dataa  (mul_in1),
            .datab  (mul_in2),
            .result (mul_result_tmp)
        );

        assign mul_result[i] = is_mulh_out ? mul_result_tmp[63:32] : mul_result_tmp[31:0];
    end

    // Metadata travels next to the multiplier with the same depth and enable.
    VX_shift_register #(
        .DATAW  (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1),
        .DEPTH  (`LATENCY_IMUL),
        .RESETW (1)
    ) mul_shift_reg (
        .clk(clk),
        .reset    (reset),
        .enable   (mul_ready_in),
        .data_in  ({mul_valid_in,  uuid_in,      wid_in,      tmask_in,      PC_in,      rd_in,      wb_in,      is_mulh_in}),
        .data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out})
    );

`endif

    ///////////////////////////////////////////////////////////////////////////

    // ------------------------------- divider ------------------------------

    wire [`NUM_THREADS-1:0][31:0] div_result;
    wire [`UUID_BITS-1:0]         div_uuid_out;
    wire [`NW_BITS-1:0]           div_wid_out;
    wire [`NUM_THREADS-1:0]       div_tmask_out;
    wire [31:0]                   div_PC_out;
    wire [`NR_BITS-1:0]           div_rd_out;
    wire                          div_wb_out;

    wire is_rem_op_in  = (alu_op == `INST_MUL_REM) || (alu_op == `INST_MUL_REMU);
    wire is_signed_div = (alu_op == `INST_MUL_DIV) || (alu_op == `INST_MUL_REM);
    wire div_valid_in  = valid_in && is_div_op;
    wire div_ready_out = ~stall_out && ~mul_valid_out; // arbitration prioritizes MUL
    wire div_ready_in;
    wire div_valid_out;

`ifdef IDIV_DPI

    wire [`NUM_THREADS-1:0][31:0] div_result_tmp;

    wire div_fire_in = div_valid_in && div_ready_in;

    for (genvar i = 0; i < `NUM_THREADS; i++) begin
        wire [31:0] div_quotient, div_remainder;
        always @(*) begin
            dpi_idiv (div_fire_in, alu_in1[i], alu_in2[i], is_signed_div, div_quotient, div_remainder);
        end
        assign div_result_tmp[i] = is_rem_op_in ? div_remainder : div_quotient;
    end

    // NOTE(review): the DPI divider reuses LATENCY_IMUL as its pipeline depth -
    // confirm this is intended rather than a divider-specific latency.
    VX_shift_register #(
        .DATAW  (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
        .DEPTH  (`LATENCY_IMUL),
        .RESETW (1)
    ) div_shift_reg (
        .clk(clk),
        .reset    (reset),
        .enable   (div_ready_in),
        .data_in  ({div_valid_in,  uuid_in,      wid_in,      tmask_in,      PC_in,      rd_in,      wb_in,      div_result_tmp}),
        .data_out ({div_valid_out, div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result})
    );

    assign div_ready_in = div_ready_out || ~div_valid_out;

`else

    wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp;
    wire is_rem_op_out;

    // Fix: TAGW used a hard-coded 64 for the uuid field, while the tag
    // concatenations below carry `UUID_BITS bits of uuid. Sizing it from
    // `UUID_BITS keeps the port width equal to the connected expressions and
    // matches the DATAW expressions used elsewhere in this module.
    VX_serial_div #(
        .WIDTHN (32),
        .WIDTHD (32),
        .WIDTHQ (32),
        .WIDTHR (32),
        .LANES  (`NUM_THREADS),
        .TAGW   (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1)
    ) divide (
        .clk        (clk),
        .reset      (reset),
        .valid_in   (div_valid_in),
        .ready_in   (div_ready_in),
        .signed_mode(is_signed_div),
        .tag_in     ({uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}),
        .numer      (alu_in1),
        .denom      (alu_in2),
        .quotient   (div_result_tmp),
        .remainder  (rem_result_tmp),
        .ready_out  (div_ready_out),
        .valid_out  (div_valid_out),
        .tag_out    ({div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out})
    );

    assign div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp;

`endif

    ///////////////////////////////////////////////////////////////////////////

    // ---------------------------- output stage ----------------------------

    // Multiplier results win whenever both paths are valid; div_ready_out
    // above keeps the divider result in place in that case.
    wire                          rsp_valid = mul_valid_out || div_valid_out;
    wire [`UUID_BITS-1:0]         rsp_uuid  = mul_valid_out ? mul_uuid_out  : div_uuid_out;
    wire [`NW_BITS-1:0]           rsp_wid   = mul_valid_out ? mul_wid_out   : div_wid_out;
    wire [`NUM_THREADS-1:0]       rsp_tmask = mul_valid_out ? mul_tmask_out : div_tmask_out;
    wire [31:0]                   rsp_PC    = mul_valid_out ? mul_PC_out    : div_PC_out;
    wire [`NR_BITS-1:0]           rsp_rd    = mul_valid_out ? mul_rd_out    : div_rd_out;
    wire                          rsp_wb    = mul_valid_out ? mul_wb_out    : div_wb_out;
    wire [`NUM_THREADS-1:0][31:0] rsp_data  = mul_valid_out ? mul_result    : div_result;

    // the output register holds its value while the consumer is not ready
    assign stall_out = ~ready_out && valid_out;

    VX_pipe_register #(
        .DATAW  (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
        .RESETW (1)
    ) pipe_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}),
        .data_out ({valid_out, uuid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out})
    );

    // can accept new request?
    assign ready_in = is_div_op ? div_ready_in : mul_ready_in;

endmodule

View File

@@ -1,261 +0,0 @@
`include "VX_define.vh"
// Core pipeline top level.
//
// Wraps the flat data-cache / instruction-cache ports into interface
// instances and chains the five stages: fetch -> decode -> issue -> execute
// -> commit. Each stage receives its own relayed copy of reset.
module VX_pipeline #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_VX_pipeline

    // Clock
    input wire                              clk,
    input wire                              reset,

    // Dcache core request
    output wire [`NUM_THREADS-1:0]          dcache_req_valid,
    output wire [`NUM_THREADS-1:0]          dcache_req_rw,
    output wire [`NUM_THREADS-1:0][3:0]     dcache_req_byteen,
    output wire [`NUM_THREADS-1:0][29:0]    dcache_req_addr,
    output wire [`NUM_THREADS-1:0][31:0]    dcache_req_data,
    output wire [`NUM_THREADS-1:0][`DCACHE_CORE_TAG_WIDTH-1:0] dcache_req_tag,
    input wire [`NUM_THREADS-1:0]           dcache_req_ready,

    // Dcache core response
    input wire                              dcache_rsp_valid,
    input wire [`NUM_THREADS-1:0]           dcache_rsp_tmask,
    input wire [`NUM_THREADS-1:0][31:0]     dcache_rsp_data,
    input wire [`DCACHE_CORE_TAG_WIDTH-1:0] dcache_rsp_tag,
    output wire                             dcache_rsp_ready,

    // Icache core request
    output wire                             icache_req_valid,
    output wire [29:0]                      icache_req_addr,
    output wire [`ICACHE_CORE_TAG_WIDTH-1:0] icache_req_tag,
    input wire                              icache_req_ready,

    // Icache core response
    input wire                              icache_rsp_valid,
    input wire [31:0]                       icache_rsp_data,
    input wire [`ICACHE_CORE_TAG_WIDTH-1:0] icache_rsp_tag,
    output wire                             icache_rsp_ready,

`ifdef PERF_ENABLE
    VX_perf_memsys_if.slave                 perf_memsys_if,
`endif

    // Status
    output wire                             busy
);
    // ----------------------------------------------------------------------
    // Flat cache ports <-> interface instances
    // ----------------------------------------------------------------------

    // Dcache request: driven by the pipeline, exported on the flat ports
    VX_dcache_req_if #(
        .NUM_REQS  (`NUM_THREADS),
        .WORD_SIZE (4),
        .TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
    ) dcache_req_if();

    assign dcache_req_if.ready = dcache_req_ready;
    assign dcache_req_valid    = dcache_req_if.valid;
    assign dcache_req_rw       = dcache_req_if.rw;
    assign dcache_req_byteen   = dcache_req_if.byteen;
    assign dcache_req_addr     = dcache_req_if.addr;
    assign dcache_req_data     = dcache_req_if.data;
    assign dcache_req_tag      = dcache_req_if.tag;

    // Dcache response: taken from the flat ports, consumed by the pipeline
    VX_dcache_rsp_if #(
        .NUM_REQS  (`NUM_THREADS),
        .WORD_SIZE (4),
        .TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
    ) dcache_rsp_if();

    assign dcache_rsp_ready    = dcache_rsp_if.ready;
    assign dcache_rsp_if.valid = dcache_rsp_valid;
    assign dcache_rsp_if.tmask = dcache_rsp_tmask;
    assign dcache_rsp_if.data  = dcache_rsp_data;
    assign dcache_rsp_if.tag   = dcache_rsp_tag;

    // Icache request
    VX_icache_req_if #(
        .WORD_SIZE (4),
        .TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
    ) icache_req_if();

    assign icache_req_if.ready = icache_req_ready;
    assign icache_req_valid    = icache_req_if.valid;
    assign icache_req_addr     = icache_req_if.addr;
    assign icache_req_tag      = icache_req_if.tag;

    // Icache response
    VX_icache_rsp_if #(
        .WORD_SIZE (4),
        .TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
    ) icache_rsp_if();

    assign icache_rsp_ready    = icache_rsp_if.ready;
    assign icache_rsp_if.valid = icache_rsp_valid;
    assign icache_rsp_if.data  = icache_rsp_data;
    assign icache_rsp_if.tag   = icache_rsp_tag;

    ///////////////////////////////////////////////////////////////////////////

    // ----------------------------------------------------------------------
    // Inter-stage interfaces
    // ----------------------------------------------------------------------

    VX_fetch_to_csr_if  fetch_to_csr_if();
    VX_cmt_to_csr_if    cmt_to_csr_if();
    VX_decode_if        decode_if();
    VX_branch_ctl_if    branch_ctl_if();
    VX_warp_ctl_if      warp_ctl_if();
    VX_ifetch_rsp_if    ifetch_rsp_if();
    VX_alu_req_if       alu_req_if();
    VX_lsu_req_if       lsu_req_if();
    VX_csr_req_if       csr_req_if();
`ifdef EXT_F_ENABLE
    VX_fpu_req_if       fpu_req_if();
`endif
    VX_gpu_req_if       gpu_req_if();
    VX_writeback_if     writeback_if();
    VX_wstall_if        wstall_if();
    VX_join_if          join_if();
    VX_commit_if        alu_commit_if();
    VX_commit_if        ld_commit_if();
    VX_commit_if        st_commit_if();
    VX_commit_if        csr_commit_if();
`ifdef EXT_F_ENABLE
    VX_commit_if        fpu_commit_if();
`endif
    VX_commit_if        gpu_commit_if();

`ifdef PERF_ENABLE
    VX_perf_pipeline_if perf_pipeline_if();
`endif

    // one reset relay per stage
    `RESET_RELAY (fetch_reset);
    `RESET_RELAY (decode_reset);
    `RESET_RELAY (issue_reset);
    `RESET_RELAY (execute_reset);
    `RESET_RELAY (commit_reset);

    // ----------------------------------------------------------------------
    // Stages
    // ----------------------------------------------------------------------

    VX_fetch #(
        .CORE_ID(CORE_ID)
    ) fetch (
        `SCOPE_BIND_VX_pipeline_fetch
        .clk            (clk),
        .reset          (fetch_reset),
        .icache_req_if  (icache_req_if),
        .icache_rsp_if  (icache_rsp_if),
        .wstall_if      (wstall_if),
        .join_if        (join_if),
        .warp_ctl_if    (warp_ctl_if),
        .branch_ctl_if  (branch_ctl_if),
        .ifetch_rsp_if  (ifetch_rsp_if),
        .fetch_to_csr_if(fetch_to_csr_if),
        .busy           (busy)
    );

    VX_decode #(
        .CORE_ID(CORE_ID)
    ) decode (
        .clk            (clk),
        .reset          (decode_reset),
    `ifdef PERF_ENABLE
        .perf_decode_if (perf_pipeline_if.decode),
    `endif
        .ifetch_rsp_if  (ifetch_rsp_if),
        .decode_if      (decode_if),
        .wstall_if      (wstall_if),
        .join_if        (join_if)
    );

    VX_issue #(
        .CORE_ID(CORE_ID)
    ) issue (
        `SCOPE_BIND_VX_pipeline_issue

        .clk            (clk),
        .reset          (issue_reset),

    `ifdef PERF_ENABLE
        .perf_issue_if  (perf_pipeline_if.issue),
    `endif

        .decode_if      (decode_if),
        .writeback_if   (writeback_if),

        .alu_req_if     (alu_req_if),
        .lsu_req_if     (lsu_req_if),
        .csr_req_if     (csr_req_if),
    `ifdef EXT_F_ENABLE
        .fpu_req_if     (fpu_req_if),
    `endif
        .gpu_req_if     (gpu_req_if)
    );

    VX_execute #(
        .CORE_ID(CORE_ID)
    ) execute (
        `SCOPE_BIND_VX_pipeline_execute

        .clk            (clk),
        .reset          (execute_reset),

    `ifdef PERF_ENABLE
        .perf_memsys_if (perf_memsys_if),
        .perf_pipeline_if (perf_pipeline_if),
    `endif

        .dcache_req_if  (dcache_req_if),
        .dcache_rsp_if  (dcache_rsp_if),

        .cmt_to_csr_if  (cmt_to_csr_if),
        .fetch_to_csr_if(fetch_to_csr_if),

        .alu_req_if     (alu_req_if),
        .lsu_req_if     (lsu_req_if),
        .csr_req_if     (csr_req_if),
    `ifdef EXT_F_ENABLE
        .fpu_req_if     (fpu_req_if),
    `endif
        .gpu_req_if     (gpu_req_if),

        .warp_ctl_if    (warp_ctl_if),
        .branch_ctl_if  (branch_ctl_if),
        .alu_commit_if  (alu_commit_if),
        .ld_commit_if   (ld_commit_if),
        .st_commit_if   (st_commit_if),
        .csr_commit_if  (csr_commit_if),
    `ifdef EXT_F_ENABLE
        .fpu_commit_if  (fpu_commit_if),
    `endif
        .gpu_commit_if  (gpu_commit_if),

        .busy           (busy)
    );

    VX_commit #(
        .CORE_ID(CORE_ID)
    ) commit (
        .clk            (clk),
        .reset          (commit_reset),

        .alu_commit_if  (alu_commit_if),
        .ld_commit_if   (ld_commit_if),
        .st_commit_if   (st_commit_if),
        .csr_commit_if  (csr_commit_if),
    `ifdef EXT_F_ENABLE
        .fpu_commit_if  (fpu_commit_if),
    `endif
        .gpu_commit_if  (gpu_commit_if),

        .writeback_if   (writeback_if),
        .cmt_to_csr_if  (cmt_to_csr_if)
    );

endmodule

View File

@@ -1,5 +1,18 @@
`ifndef VX_PLATFORM
`define VX_PLATFORM
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_PLATFORM_VH
`define VX_PLATFORM_VH
`ifndef SYNTHESIS
`include "util_dpi.vh"
@@ -9,8 +22,36 @@
///////////////////////////////////////////////////////////////////////////////
`ifndef SYNTHESIS
`ifdef VIVADO
`define STRING
`else
`define STRING string
`endif
`ifdef SYNTHESIS
`define TRACING_ON
`define TRACING_OFF
`ifndef NDEBUG
`define DEBUG_BLOCK(x) x
`else
`define DEBUG_BLOCK(x)
`endif
`define IGNORE_UNOPTFLAT_BEGIN
`define IGNORE_UNOPTFLAT_END
`define IGNORE_UNUSED_BEGIN
`define IGNORE_UNUSED_END
`define IGNORE_WARNINGS_BEGIN
`define IGNORE_WARNINGS_END
`define UNUSED_PARAM(x)
`define UNUSED_SPARAM(x)
`define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x ()
`define UNUSED_ARG(x) x
`define TRACE(level, args) $write args
`else
`ifdef VERILATOR
`define TRACING_ON /* verilator tracing_on */
`define TRACING_OFF /* verilator tracing_off */
`ifndef NDEBUG
`define DEBUG_BLOCK(x) /* verilator lint_off UNUSED */ \
x \
@@ -19,6 +60,10 @@
`define DEBUG_BLOCK(x)
`endif
// Bracket a region in which Verilator's UNOPTFLAT warning is suppressed.
// BEGIN turns the warning off; END must turn it back on so the suppression
// does not leak into the rest of the compilation unit.
`define IGNORE_UNOPTFLAT_BEGIN /* verilator lint_off UNOPTFLAT */
`define IGNORE_UNOPTFLAT_END /* verilator lint_on UNOPTFLAT */
`define IGNORE_UNUSED_BEGIN /* verilator lint_off UNUSED */
`define IGNORE_UNUSED_END /* verilator lint_on UNUSED */
@@ -30,7 +75,9 @@
/* verilator lint_off UNDRIVEN */ \
/* verilator lint_off DECLFILENAME */ \
/* verilator lint_off IMPLICIT */ \
/* verilator lint_off IMPORTSTAR */
/* verilator lint_off PINMISSING */ \
/* verilator lint_off IMPORTSTAR */ \
/* verilator lint_off UNSIGNED */
`define IGNORE_WARNINGS_END /* verilator lint_on UNUSED */ \
/* verilator lint_on PINCONNECTEMPTY */ \
@@ -39,68 +86,80 @@
/* verilator lint_on UNDRIVEN */ \
/* verilator lint_on DECLFILENAME */ \
/* verilator lint_on IMPLICIT */ \
/* verilator lint_on IMPORTSTAR */
/* verilator lint_on PINMISSING */ \
/* verilator lint_on IMPORTSTAR */ \
/* verilator lint_on UNSIGNED */
`define UNUSED_PARAM(x) /* verilator lint_off UNUSED */ \
localparam __``x = x; \
/* verilator lint_on UNUSED */
`define UNUSED_VAR(x) always @(x) begin end
`define UNUSED_SPARAM(x) /* verilator lint_off UNUSED */ \
localparam `STRING __``x = x; \
/* verilator lint_on UNUSED */
`define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \
. x () \
/* verilator lint_on PINCONNECTEMPTY */
`define UNUSED_VAR(x) if (1) begin \
/* verilator lint_off UNUSED */ \
wire [$bits(x)-1:0] __x = x; \
/* verilator lint_on UNUSED */ \
end
`define ERROR(msg) \
$error msg
`define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \
. x () \
/* verilator lint_on PINCONNECTEMPTY */
`define UNUSED_ARG(x) /* verilator lint_off UNUSED */ \
x \
/* verilator lint_on UNUSED */
`define TRACE(level, args) dpi_trace(level, $sformatf args)
`endif
`endif
`define ASSERT(cond, msg) \
assert(cond) else $error msg
`define STATIC_ASSERT(cond, msg) \
`ifdef SIMULATION
`define STATIC_ASSERT(cond, msg) \
generate \
if (!(cond)) $error msg; \
endgenerate
`define RUNTIME_ASSERT(cond, msg) \
always @(posedge clk) begin \
assert(cond) else $error msg; \
end
`define ERROR(msg) \
$error msg
`define TRACING_ON /* verilator tracing_on */
`define TRACING_OFF /* verilator tracing_off */
`define ASSERT(cond, msg) \
assert(cond) else $error msg
`else // SYNTHESIS
`define DEBUG_BLOCK(x)
`define IGNORE_UNUSED_BEGIN
`define IGNORE_UNUSED_END
`define IGNORE_WARNINGS_BEGIN
`define IGNORE_WARNINGS_END
`define UNUSED_PARAM(x)
`define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x ()
`define ERROR(msg)
`define ASSERT(cond, msg) if (cond);
`define STATIC_ASSERT(cond, msg)
`define RUNTIME_ASSERT(cond, msg)
`define TRACING_ON
`define TRACING_OFF
`endif // SYNTHESIS
`define RUNTIME_ASSERT(cond, msg) \
always @(posedge clk) begin \
assert(cond) else $error msg; \
end
`else
`define STATIC_ASSERT(cond, msg)
`define ERROR(msg) //
`define ASSERT(cond, msg) //
`define RUNTIME_ASSERT(cond, msg)
`endif
///////////////////////////////////////////////////////////////////////////////
`ifdef QUARTUS
`define MAX_FANOUT 4
`define IF_DATA_SIZE(x) $bits(x.data)
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
`define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *)
`define DISABLE_BRAM (* ramstyle = "logic" *)
`define PRESERVE_REG (* preserve *)
`define PRESERVE_NET (* preserve *)
`elsif VIVADO
`define MAX_FANOUT 4
`define IF_DATA_SIZE(x) $bits(x.data)
`define USE_FAST_BRAM (* ram_style = "distributed" *)
`define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *)
`define DISABLE_BRAM (* ram_style = "registers" *)
`define PRESERVE_NET (* keep = "true" *)
`else
`define MAX_FANOUT 4
`define IF_DATA_SIZE(x) x.DATA_WIDTH
`define USE_FAST_BRAM
`define NO_RW_RAM_CHECK
`define DISABLE_BRAM
`define PRESERVE_REG
`define PRESERVE_NET
`endif
///////////////////////////////////////////////////////////////////////////////
@@ -112,52 +171,105 @@
`define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1)
`define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1))))
`define ABS(x) (($signed(x) < 0) ? (-$signed(x)) : (x));
`define ABS(x) (((x) < 0) ? (-(x)) : (x));
`ifndef MIN
`define MIN(x, y) (((x) < (y)) ? (x) : (y))
`define MAX(x, y) (((x) > (y)) ? (x) : (y))
`endif
`define UP(x) (((x) > 0) ? (x) : 1)
`ifndef MAX
`define MAX(x, y) (((x) > (y)) ? (x) : (y))
`endif
`ifndef CLAMP
`define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
`endif
`ifndef UP
`define UP(x) (((x) != 0) ? (x) : 1)
`endif
// RTRIM(x, s): part-select of the s most-significant bits of x.
// LTRIM(x, s): part-select of the s least-significant bits of x.
// The size argument is parenthesized so expression arguments (e.g. a+b)
// keep their intended precedence inside the bound arithmetic.
`define RTRIM(x, s) x[$bits(x)-1:($bits(x)-(s))]
`define LTRIM(x, s) x[(s)-1:0]
`define TRACE_ARRAY1D(a, m) \
dpi_trace("{"); \
for (integer i = (m-1); i >= 0; --i) begin \
if (i != (m-1)) dpi_trace(", "); \
dpi_trace("0x%0h", a[i]); \
`define TRACE_ARRAY1D(lvl, arr, m) \
`TRACE(lvl, ("{")); \
for (integer __i = (m-1); __i >= 0; --__i) begin \
if (__i != (m-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, ("0x%0h", arr[__i])); \
end \
dpi_trace("}"); \
`TRACE(lvl, ("}"));
`define TRACE_ARRAY2D(a, m, n) \
dpi_trace("{"); \
for (integer i = n-1; i >= 0; --i) begin \
if (i != (n-1)) dpi_trace(", "); \
dpi_trace("{"); \
for (integer j = (m-1); j >= 0; --j) begin \
if (j != (m-1)) dpi_trace(", "); \
dpi_trace("0x%0h", a[i][j]); \
`define TRACE_ARRAY2D(lvl, arr, m, n) \
`TRACE(lvl, ("{")); \
for (integer __i = n-1; __i >= 0; --__i) begin \
if (__i != (n-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, ("{")); \
for (integer __j = (m-1); __j >= 0; --__j) begin \
if (__j != (m-1)) `TRACE(lvl, (", "));\
`TRACE(lvl, ("0x%0h", arr[__i][__j])); \
end \
dpi_trace("}"); \
`TRACE(lvl, ("}")); \
end \
dpi_trace("}")
`TRACE(lvl, ("}"))
`define RESET_RELAY(signal) \
wire signal; \
VX_reset_relay __``signal ( \
.clk (clk), \
.reset (reset), \
.reset_o (signal) \
`define RESET_RELAY_EX(dst, src, size, fanout) \
wire [size-1:0] dst; \
VX_reset_relay #(.N(size), .MAX_FANOUT(fanout)) __``dst ( \
.clk (clk), \
.reset (src), \
.reset_o (dst) \
)
`define POP_COUNT(out, in) \
VX_popcount #( \
.N ($bits(in)) \
) __``out ( \
.in_i (in), \
.cnt_o (out) \
)
`define RESET_RELAY_EN(dst, src, enable) \
`RESET_RELAY_EX (dst, src, 1, ((enable) ? 0 : -1))
`endif
`define RESET_RELAY(dst, src) \
`RESET_RELAY_EX (dst, src, 1, 0)
// Map an OUT_REG setting to an elastic-buffer size.
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4 -> 2
`define OUT_REG_TO_EB_SIZE(out_reg) `MIN(out_reg, 2)
// Map an OUT_REG setting to an elastic-buffer register mode.
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2
// The argument is parenthesized so that expression arguments such as
// (cond ? 2 : 0) are not re-associated by the & and >> operators.
`define OUT_REG_TO_EB_REG(out_reg) (((out_reg) & 1) + (((out_reg) >> 2) << 1))
// REPEAT(n, f, s): preprocessor-level unrolling helper.
//   n - repetition count; it is pasted textually onto `_REPEAT_`, so only the
//       counts that have an explicit `_REPEAT_<n>` definition below (0..32) work.
//   f - NAME of a one-argument macro; it is invoked as `f(index).
//   s - NAME of a separator macro emitted between consecutive invocations
//       (see REPEAT_COMMA / REPEAT_SEMICOLON at the end of the table).
// Expansion order is descending: `f(n-1) `s `f(n-2) `s ... `f(0).
// A count of 0 expands to nothing.
`define REPEAT(n,f,s) `_REPEAT_``n(f,s)
`define _REPEAT_0(f,s)
`define _REPEAT_1(f,s) `f(0)
`define _REPEAT_2(f,s) `f(1) `s `_REPEAT_1(f,s)
`define _REPEAT_3(f,s) `f(2) `s `_REPEAT_2(f,s)
`define _REPEAT_4(f,s) `f(3) `s `_REPEAT_3(f,s)
`define _REPEAT_5(f,s) `f(4) `s `_REPEAT_4(f,s)
`define _REPEAT_6(f,s) `f(5) `s `_REPEAT_5(f,s)
`define _REPEAT_7(f,s) `f(6) `s `_REPEAT_6(f,s)
`define _REPEAT_8(f,s) `f(7) `s `_REPEAT_7(f,s)
`define _REPEAT_9(f,s) `f(8) `s `_REPEAT_8(f,s)
`define _REPEAT_10(f,s) `f(9) `s `_REPEAT_9(f,s)
`define _REPEAT_11(f,s) `f(10) `s `_REPEAT_10(f,s)
`define _REPEAT_12(f,s) `f(11) `s `_REPEAT_11(f,s)
`define _REPEAT_13(f,s) `f(12) `s `_REPEAT_12(f,s)
`define _REPEAT_14(f,s) `f(13) `s `_REPEAT_13(f,s)
`define _REPEAT_15(f,s) `f(14) `s `_REPEAT_14(f,s)
`define _REPEAT_16(f,s) `f(15) `s `_REPEAT_15(f,s)
`define _REPEAT_17(f,s) `f(16) `s `_REPEAT_16(f,s)
`define _REPEAT_18(f,s) `f(17) `s `_REPEAT_17(f,s)
`define _REPEAT_19(f,s) `f(18) `s `_REPEAT_18(f,s)
`define _REPEAT_20(f,s) `f(19) `s `_REPEAT_19(f,s)
`define _REPEAT_21(f,s) `f(20) `s `_REPEAT_20(f,s)
`define _REPEAT_22(f,s) `f(21) `s `_REPEAT_21(f,s)
`define _REPEAT_23(f,s) `f(22) `s `_REPEAT_22(f,s)
`define _REPEAT_24(f,s) `f(23) `s `_REPEAT_23(f,s)
`define _REPEAT_25(f,s) `f(24) `s `_REPEAT_24(f,s)
`define _REPEAT_26(f,s) `f(25) `s `_REPEAT_25(f,s)
`define _REPEAT_27(f,s) `f(26) `s `_REPEAT_26(f,s)
`define _REPEAT_28(f,s) `f(27) `s `_REPEAT_27(f,s)
`define _REPEAT_29(f,s) `f(28) `s `_REPEAT_28(f,s)
`define _REPEAT_30(f,s) `f(29) `s `_REPEAT_29(f,s)
`define _REPEAT_31(f,s) `f(30) `s `_REPEAT_30(f,s)
`define _REPEAT_32(f,s) `f(31) `s `_REPEAT_31(f,s)
// Ready-made separator macros for the `s` argument of REPEAT.
`define REPEAT_COMMA ,
`define REPEAT_SEMICOLON ;
`endif // VX_PLATFORM_VH

View File

@@ -1,89 +1,68 @@
`ifndef VX_SCOPE
`define VX_SCOPE
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_SCOPE_VH
`define VX_SCOPE_VH
`ifdef SCOPE
`include "scope-defs.vh"
`define SCOPE_IO_DECL \
input wire scope_reset, \
input wire scope_bus_in, \
output wire scope_bus_out,
`define SCOPE_ASSIGN(d,s) assign scope_``d = s
// SCOPE_IO_SWITCH(__count): fan the module-level scope bus out to __count taps.
// Declares the per-tap wire arrays scope_bus_in_w / scope_bus_out_w, creates a
// __count-bit relayed reset named scope_reset_w (via RESET_RELAY_EX, fanout
// argument 4), and instantiates VX_scope_switch to connect scope_bus_in /
// scope_bus_out to those per-tap wires. Note that the switch instance itself
// is reset by the unrelayed scope_reset.
`define SCOPE_IO_SWITCH(__count) \
wire scope_bus_in_w [__count]; \
wire scope_bus_out_w [__count]; \
`RESET_RELAY_EX(scope_reset_w, scope_reset, __count, 4); \
VX_scope_switch #( \
.N (__count) \
) scope_switch ( \
.clk (clk), \
.reset (scope_reset), \
.req_in (scope_bus_in), \
.rsp_out (scope_bus_out), \
.req_out (scope_bus_in_w), \
.rsp_in (scope_bus_out_w) \
);
`define SCOPE_SIZE 1024
// SCOPE_IO_BIND(__i): named port connections that hook a child instance's
// scope ports to tap __i of scope_reset_w / scope_bus_in_w / scope_bus_out_w.
// The expansion ends with a trailing comma, so it must be followed by further
// port connections in the instantiation.
`define SCOPE_IO_BIND(__i) \
.scope_reset (scope_reset_w[__i]), \
.scope_bus_in (scope_bus_in_w[__i]), \
.scope_bus_out (scope_bus_out_w[__i]),
// SCOPE_IO_UNUSED(): tie off the module-level scope ports when the module has
// no scope taps: the two inputs are marked unused and scope_bus_out is driven 0.
`define SCOPE_IO_UNUSED() \
`UNUSED_VAR (scope_reset); \
`UNUSED_VAR (scope_bus_in); \
assign scope_bus_out = 0;
// SCOPE_IO_UNUSED_W(__i): same tie-off, applied to tap __i of the switched
// wires (scope_reset_w / scope_bus_in_w / scope_bus_out_w).
`define SCOPE_IO_UNUSED_W(__i) \
`UNUSED_VAR (scope_reset_w[__i]); \
`UNUSED_VAR (scope_bus_in_w[__i]); \
assign scope_bus_out_w[__i] = 0;
`else
`define SCOPE_IO_VX_icache_stage
`define SCOPE_IO_DECL
`define SCOPE_IO_VX_fetch
`define SCOPE_IO_SWITCH(__count)
`define SCOPE_BIND_VX_fetch_icache_stage
`define SCOPE_IO_BIND(__i)
`define SCOPE_BIND_VX_fetch_warp_sched
`define SCOPE_IO_UNUSED_W(__i)
`define SCOPE_IO_VX_warp_sched
`define SCOPE_IO_VX_pipeline
`define SCOPE_BIND_VX_pipeline_fetch
`define SCOPE_IO_VX_core
`define SCOPE_BIND_VX_core_pipeline
`define SCOPE_IO_VX_cluster
`define SCOPE_BIND_VX_cluster_core(__i__)
`define SCOPE_IO_Vortex
`define SCOPE_BIND_Vortex_cluster(__i__)
`define SCOPE_BIND_afu_vortex
`define SCOPE_IO_VX_lsu_unit
`define SCOPE_IO_VX_gpu_unit
`define SCOPE_IO_VX_execute
`define SCOPE_BIND_VX_execute_lsu_unit
`define SCOPE_BIND_VX_execute_gpu_unit
`define SCOPE_BIND_VX_pipeline_execute
`define SCOPE_IO_VX_issue
`define SCOPE_BIND_VX_pipeline_issue
`define SCOPE_IO_VX_bank
`define SCOPE_IO_VX_cache
`define SCOPE_BIND_VX_cache_bank(__i__)
`define SCOPE_BIND_Vortex_l3cache
`define SCOPE_BIND_VX_cluster_l2cache
`define SCOPE_IO_VX_mem_unit
`define SCOPE_BIND_VX_mem_unit_dcache
`define SCOPE_BIND_VX_core_mem_unit
`define SCOPE_BIND_VX_mem_unit_icache
`define SCOPE_BIND_VX_mem_unit_smem
`define SCOPE_DECL_SIGNALS
`define SCOPE_DATA_LIST
`define SCOPE_UPDATE_LIST
`define SCOPE_TRIGGER
`define SCOPE_ASSIGN(d,s)
`define SCOPE_IO_UNUSED(__i)
`endif
`endif
`endif // VX_SCOPE_VH

View File

@@ -1,85 +0,0 @@
`include "VX_define.vh"
// Register scoreboard.
//
// Keeps one "in use" bit per (warp, register). A bit is set when an
// instruction that writes back (ibuffer_if.wb) is accepted from the
// instruction buffer and cleared when the matching writeback with eop
// arrives. The instruction buffer is stalled (ibuffer_if.ready low) while
// the destination or any of the three source registers of the next
// instruction is still marked in use.
module VX_scoreboard #(
    parameter CORE_ID = 0
) (
    input wire              clk,
    input wire              reset,

    VX_ibuffer_if.slave     ibuffer_if,
    VX_writeback_if.slave   writeback_if
);
    // Current and next-state in-use bitmaps, indexed [warp][register].
    reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;

    // Handshake shorthands on the instruction-buffer side.
    wire ibuf_fire  = ibuffer_if.valid && ibuffer_if.ready;
    wire ibuf_stall = ibuffer_if.valid && ~ibuffer_if.ready;

    wire reserve_reg = ibuf_fire && ibuffer_if.wb;
    wire release_reg = writeback_if.valid && writeback_if.ready && writeback_if.eop;

    // Next-state bitmap. The release is applied after the reserve, so it
    // wins if both hit the same bit in the same cycle.
    always @(*) begin
        inuse_regs_n = inuse_regs;
        if (reserve_reg)
            inuse_regs_n[ibuffer_if.wid][ibuffer_if.rd] = 1'b1;
        if (release_reg)
            inuse_regs_n[writeback_if.wid][writeback_if.rd] = 1'b0;
    end

    always @(posedge clk) begin
        if (reset)
            inuse_regs <= '0;
        else
            inuse_regs <= inuse_regs_n;
    end

    // Registered hazard flags for the instruction the buffer will present
    // next (the *_n fields), looked up in the next-state bitmap.
    // Bit layout: {rd, rs1, rs2, rs3}.
    reg [3:0] deq_inuse;

    always @(posedge clk) begin
        deq_inuse <= {inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rd_n],
                      inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs1_n],
                      inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs2_n],
                      inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs3_n]};
    end

    // Writebacks are always accepted; issue is blocked on any hazard flag.
    assign writeback_if.ready = 1'b1;
    assign ibuffer_if.ready   = ~(| deq_inuse);

    `UNUSED_VAR (writeback_if.PC)

    // Stall watchdog: counts consecutive stalled cycles and asserts when the
    // count reaches the timeout.
    // NOTE(review): 1 ** n is 1 for every n, so the timeout is always 10000
    // regardless of L2_ENABLE / L3_ENABLE - confirm whether a different base
    // was intended.
    reg [31:0] deadlock_ctr;
    wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));

    always @(posedge clk) begin
        if (reset) begin
            deadlock_ctr <= 0;
        end else begin
        `ifdef DBG_TRACE_CORE_PIPELINE
            if (ibuf_stall) begin
                dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)\n",
                    $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
                    deq_inuse[3], deq_inuse[2], deq_inuse[1], deq_inuse[0], ibuffer_if.uuid);
            end
        `endif
            // A released register must have been reserved beforehand.
            if (release_reg) begin
                `ASSERT(inuse_regs[writeback_if.wid][writeback_if.rd] != 0,
                    ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d (#%0d)",
                        $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd,writeback_if.uuid));
            end
            if (ibuf_stall) begin
                deadlock_ctr <= deadlock_ctr + 1;
                `ASSERT(deadlock_ctr < deadlock_timeout,
                    ("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)",
                        $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
                        deq_inuse[3], deq_inuse[2], deq_inuse[1], deq_inuse[0], ibuffer_if.uuid));
            end else if (ibuf_fire) begin
                deadlock_ctr <= 0;
            end
        end
    end

endmodule

View File

@@ -1,160 +0,0 @@
`include "VX_define.vh"
// Request demultiplexer / response arbiter.
//
// Request path: each lane of the single input request stream is steered to
// one of NUM_REQS outputs. The destination is taken from LOG_NUM_REQS bits of
// the lane's tag starting at TAG_SEL_IDX; those bits are removed from the tag
// that is forwarded (TAG_OUT_WIDTH = TAG_IN_WIDTH - LOG_NUM_REQS).
//
// Response path: the NUM_REQS response streams are arbitrated into a single
// output stream, and the index of the granted input is re-inserted into the
// tag at TAG_SEL_IDX so the output tag is TAG_IN_WIDTH wide again.
//
// When NUM_REQS is 1 the module is a pure combinational pass-through.
module VX_smem_arb #(
parameter NUM_REQS = 1,
parameter LANES = 1,
parameter DATA_SIZE = 1,
parameter TAG_IN_WIDTH = 1,
parameter TAG_SEL_IDX = 0,
parameter BUFFERED_REQ = 0,
parameter BUFFERED_RSP = 0,
parameter TYPE = "P",
parameter ADDR_WIDTH = (32-`CLOG2(DATA_SIZE)),
parameter DATA_WIDTH = (8 * DATA_SIZE),
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
parameter TAG_OUT_WIDTH = TAG_IN_WIDTH - LOG_NUM_REQS
) (
input wire clk,
input wire reset,
// input request
input wire [LANES-1:0] req_valid_in,
input wire [LANES-1:0] req_rw_in,
input wire [LANES-1:0][DATA_SIZE-1:0] req_byteen_in,
input wire [LANES-1:0][ADDR_WIDTH-1:0] req_addr_in,
input wire [LANES-1:0][DATA_WIDTH-1:0] req_data_in,
input wire [LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in,
output wire [LANES-1:0] req_ready_in,
// output requests
output wire [NUM_REQS-1:0][LANES-1:0] req_valid_out,
output wire [NUM_REQS-1:0][LANES-1:0] req_rw_out,
output wire [NUM_REQS-1:0][LANES-1:0][DATA_SIZE-1:0] req_byteen_out,
output wire [NUM_REQS-1:0][LANES-1:0][ADDR_WIDTH-1:0] req_addr_out,
output wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] req_data_out,
output wire [NUM_REQS-1:0][LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out,
input wire [NUM_REQS-1:0][LANES-1:0] req_ready_out,
// input responses
input wire [NUM_REQS-1:0] rsp_valid_in,
input wire [NUM_REQS-1:0][LANES-1:0] rsp_tmask_in,
input wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] rsp_data_in,
input wire [NUM_REQS-1:0][TAG_OUT_WIDTH-1:0] rsp_tag_in,
output wire [NUM_REQS-1:0] rsp_ready_in,
// output response
output wire rsp_valid_out,
output wire [LANES-1:0] rsp_tmask_out,
output wire [LANES-1:0][DATA_WIDTH-1:0] rsp_data_out,
output wire [TAG_IN_WIDTH-1:0] rsp_tag_out,
input wire rsp_ready_out
);
// Per-lane request payload: {tag (select bits removed), addr, rw, byteen, data}
localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
// Whole response payload: {tag (select bits restored), per-lane tmask, per-lane data}
localparam RSP_DATAW = LANES * (1 + DATA_WIDTH) + TAG_IN_WIDTH;
if (NUM_REQS > 1) begin
wire [LANES-1:0][REQ_DATAW-1:0] req_data_in_merged;
wire [NUM_REQS-1:0][LANES-1:0][REQ_DATAW-1:0] req_data_out_merged;
wire [LANES-1:0][LOG_NUM_REQS-1:0] req_sel;
wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_in_w;
// Per lane: extract the destination select from the tag, strip it from the
// forwarded tag, and pack all request fields into one vector for the demux.
for (genvar i = 0; i < LANES; ++i) begin
assign req_sel[i] = req_tag_in[i][TAG_SEL_IDX +: LOG_NUM_REQS];
VX_bits_remove #(
.N (TAG_IN_WIDTH),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_remove (
.data_in (req_tag_in[i]),
.data_out (req_tag_in_w[i])
);
assign req_data_in_merged[i] = {req_tag_in_w[i], req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]};
end
VX_stream_demux #(
.NUM_REQS (NUM_REQS),
.LANES (LANES),
.DATAW (REQ_DATAW),
.BUFFERED (BUFFERED_REQ)
) req_demux (
.clk (clk),
.reset (reset),
.sel_in (req_sel),
.valid_in (req_valid_in),
.data_in (req_data_in_merged),
.ready_in (req_ready_in),
.valid_out (req_valid_out),
.data_out (req_data_out_merged),
.ready_out (req_ready_out)
);
// Unpack the demuxed vectors; field order must mirror the packing above.
for (genvar i = 0; i < NUM_REQS; i++) begin
for (genvar j = 0; j < LANES; ++j) begin
assign {req_tag_out[i][j], req_addr_out[i][j], req_rw_out[i][j], req_byteen_out[i][j], req_data_out[i][j]} = req_data_out_merged[i][j];
end
end
///////////////////////////////////////////////////////////////////////
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_in_merged;
// Per response input: re-insert that input's index into the tag at
// TAG_SEL_IDX, then pack tag, thread mask and data for the arbiter.
for (genvar i = 0; i < NUM_REQS; i++) begin
wire [TAG_IN_WIDTH-1:0] rsp_tag_in_w;
VX_bits_insert #(
.N (TAG_OUT_WIDTH),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_insert (
.data_in (rsp_tag_in[i]),
.sel_in (LOG_NUM_REQS'(i)),
.data_out (rsp_tag_in_w)
);
assign rsp_data_in_merged[i] = {rsp_tag_in_w, rsp_tmask_in[i], rsp_data_in[i]};
end
// The response is arbitrated as a single-lane stream whose payload carries
// all LANES of data plus the thread mask.
VX_stream_arbiter #(
.NUM_REQS (NUM_REQS),
.LANES (1),
.DATAW (RSP_DATAW),
.BUFFERED (BUFFERED_RSP),
.TYPE (TYPE)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (rsp_valid_in),
.data_in (rsp_data_in_merged),
.ready_in (rsp_ready_in),
.valid_out (rsp_valid_out),
.data_out ({rsp_tag_out, rsp_tmask_out, rsp_data_out}),
.ready_out (rsp_ready_out)
);
end else begin
// Single destination: wire requests and responses straight through.
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
assign req_valid_out = req_valid_in;
assign req_tag_out = req_tag_in;
assign req_addr_out = req_addr_in;
assign req_rw_out = req_rw_in;
assign req_byteen_out = req_byteen_in;
assign req_data_out = req_data_in;
assign req_ready_in = req_ready_out;
assign rsp_valid_out = rsp_valid_in;
assign rsp_tmask_out = rsp_tmask_in;
assign rsp_tag_out = rsp_tag_in;
assign rsp_data_out = rsp_data_in;
assign rsp_ready_in = rsp_ready_out;
end
endmodule

187
hw/rtl/VX_socket.sv Normal file
View File

@@ -0,0 +1,187 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Socket: a group of `SOCKET_SIZE cores that share one set of outgoing
// dcache/icache (and optionally global-barrier) buses.
//
// The module instantiates the cores, arbitrates their per-core memory buses
// onto the socket-level ports, forwards the DCR bus to every core, and
// reports an aggregated busy status. Core IDs are derived from SOCKET_ID.
module VX_socket import VX_gpu_pkg::*; #(
parameter SOCKET_ID = 0
) (
`SCOPE_IO_DECL
// Clock
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
`endif
VX_dcr_bus_if.slave dcr_bus_if,
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
VX_mem_bus_if.master icache_bus_if,
`ifdef GBAR_ENABLE
VX_gbar_bus_if.master gbar_bus_if,
`endif
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status
output wire busy
);
`ifdef GBAR_ENABLE
// Global barrier: merge the per-core barrier buses onto the socket port.
VX_gbar_bus_if per_core_gbar_bus_if[`SOCKET_SIZE]();
`RESET_RELAY (gbar_arb_reset, reset);
VX_gbar_arb #(
.NUM_REQS (`SOCKET_SIZE),
.OUT_REG ((`SOCKET_SIZE > 1) ? 2 : 0)
) gbar_arb (
.clk (clk),
.reset (gbar_arb_reset),
.bus_in_if (per_core_gbar_bus_if),
.bus_out_if (gbar_bus_if)
);
`endif
///////////////////////////////////////////////////////////////////////////
// Data cache: flat array holding DCACHE_NUM_REQS buses for each core;
// core j's channel i lives at index (j * DCACHE_NUM_REQS + i).
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
`RESET_RELAY (dcache_arb_reset, reset);
// One arbiter per request channel: channel i of every core is gathered and
// arbitrated onto socket-level dcache_bus_if[i].
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_ARB_TAG_WIDTH)
) dcache_bus_tmp_if[1]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) per_core_dcache_bus_tmp_if[`SOCKET_SIZE]();
for (genvar j = 0; j < `SOCKET_SIZE; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (per_core_dcache_bus_tmp_if[j], per_core_dcache_bus_if[j * DCACHE_NUM_REQS + i]);
end
// NOTE(review): ARBITER "R" presumably selects round-robin - confirm in VX_mem_arb.
// Output registers are only requested when there is more than one core.
VX_mem_arb #(
.NUM_INPUTS (`SOCKET_SIZE),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH),
.TAG_SEL_IDX (`CACHE_ADDR_TYPE_BITS),
.ARBITER ("R"),
.OUT_REG_REQ ((`SOCKET_SIZE > 1) ? 2 : 0),
.OUT_REG_RSP ((`SOCKET_SIZE > 1) ? 2 : 0)
) dcache_arb (
.clk (clk),
.reset (dcache_arb_reset),
.bus_in_if (per_core_dcache_bus_tmp_if),
.bus_out_if (dcache_bus_tmp_if)
);
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[0]);
end
///////////////////////////////////////////////////////////////////////////
// Instruction cache: one bus per core, arbitrated onto the single socket port.
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_WORD_SIZE),
.TAG_WIDTH (ICACHE_TAG_WIDTH)
) per_core_icache_bus_if[`SOCKET_SIZE]();
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_WORD_SIZE),
.TAG_WIDTH (ICACHE_ARB_TAG_WIDTH)
) icache_bus_tmp_if[1]();
`RESET_RELAY (icache_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (`SOCKET_SIZE),
.NUM_OUTPUTS (1),
.DATA_SIZE (ICACHE_WORD_SIZE),
.TAG_WIDTH (ICACHE_TAG_WIDTH),
.TAG_SEL_IDX (0),
.ARBITER ("R"),
.OUT_REG_REQ ((`SOCKET_SIZE > 1) ? 2 : 0),
.OUT_REG_RSP ((`SOCKET_SIZE > 1) ? 2 : 0)
) icache_arb (
.clk (clk),
.reset (icache_arb_reset),
.bus_in_if (per_core_icache_bus_if),
.bus_out_if (icache_bus_tmp_if)
);
`ASSIGN_VX_MEM_BUS_IF (icache_bus_if, icache_bus_tmp_if[0]);
///////////////////////////////////////////////////////////////////////////
// Simulation helpers: only core 0's values are exported from the socket;
// the remaining per-core entries are intentionally left unused.
wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak;
wire [`SOCKET_SIZE-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_core_sim_wb_value;
assign sim_ebreak = per_core_sim_ebreak[0];
assign sim_wb_value = per_core_sim_wb_value[0];
`UNUSED_VAR (per_core_sim_ebreak)
`UNUSED_VAR (per_core_sim_wb_value)
wire [`SOCKET_SIZE-1:0] per_core_busy;
// NOTE(review): the last macro argument (`SOCKET_SIZE > 1) presumably enables
// the buffering stage only for multi-core sockets - confirm in the macro definition.
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1));
`SCOPE_IO_SWITCH (`SOCKET_SIZE)
// Generate all cores
// Each core gets its own relayed reset and a socket-unique CORE_ID.
for (genvar i = 0; i < `SOCKET_SIZE; ++i) begin
`RESET_RELAY (core_reset, reset);
VX_core #(
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + i)
) core (
`SCOPE_IO_BIND (i)
.clk (clk),
.reset (core_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
`endif
.dcr_bus_if (core_dcr_bus_if),
.dcache_bus_if (per_core_dcache_bus_if[i * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
.icache_bus_if (per_core_icache_bus_if[i]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_core_gbar_bus_if[i]),
`endif
.sim_ebreak (per_core_sim_ebreak[i]),
.sim_wb_value (per_core_sim_wb_value[i]),
.busy (per_core_busy[i])
);
end
// The socket is busy while any of its cores is busy.
`BUFFER_BUSY (busy, (| per_core_busy), (`SOCKET_SIZE > 1));
endmodule

View File

@@ -1,148 +0,0 @@
`ifndef VX_TRACE_INSTR
`define VX_TRACE_INSTR
`include "VX_define.vh"
// Emit the short name of an execution-unit type through dpi_trace.
//   ex_type - execution unit selector (`EX_ALU, `EX_LSU, `EX_CSR, `EX_FPU, `EX_GPU)
// Any other value is traced as "NOP".
task trace_ex_type (
input [`EX_BITS-1:0] ex_type
);
case (ex_type)
`EX_ALU: dpi_trace("ALU");
`EX_LSU: dpi_trace("LSU");
`EX_CSR: dpi_trace("CSR");
`EX_FPU: dpi_trace("FPU");
`EX_GPU: dpi_trace("GPU");
default: dpi_trace("NOP");
endcase
endtask
// Emit the mnemonic of a decoded operation through dpi_trace.
//   ex_type - execution unit the operation targets (`EX_ALU, `EX_LSU, ...)
//   op_type - operation code; narrowed to the unit-specific width before decoding
//   op_mod  - operation modifier; selects the ALU sub-class (branch / multiply /
//             plain ALU), the LSU sub-class (0: load/store, 1: fence) and the
//             FPU "misc" sub-operation
// Every encoding that is not recognized is traced as "?", including LSU
// modifiers other than 0/1 and FPU misc modifiers outside 0..6, so that a
// trace line never ends up with a silently missing mnemonic.
task trace_ex_op (
    input [`EX_BITS-1:0]       ex_type,
    input [`INST_OP_BITS-1:0]  op_type,
    input [`INST_MOD_BITS-1:0] op_mod
);
    case (ex_type)
    `EX_ALU: begin
        if (`INST_ALU_IS_BR(op_mod)) begin
            case (`INST_BR_BITS'(op_type))
                `INST_BR_EQ:     dpi_trace("BEQ");
                `INST_BR_NE:     dpi_trace("BNE");
                `INST_BR_LT:     dpi_trace("BLT");
                `INST_BR_GE:     dpi_trace("BGE");
                `INST_BR_LTU:    dpi_trace("BLTU");
                `INST_BR_GEU:    dpi_trace("BGEU");
                `INST_BR_JAL:    dpi_trace("JAL");
                `INST_BR_JALR:   dpi_trace("JALR");
                `INST_BR_ECALL:  dpi_trace("ECALL");
                `INST_BR_EBREAK: dpi_trace("EBREAK");
                `INST_BR_URET:   dpi_trace("URET");
                `INST_BR_SRET:   dpi_trace("SRET");
                `INST_BR_MRET:   dpi_trace("MRET");
                default:         dpi_trace("?");
            endcase
        end else if (`INST_ALU_IS_MUL(op_mod)) begin
            case (`INST_MUL_BITS'(op_type))
                `INST_MUL_MUL:    dpi_trace("MUL");
                `INST_MUL_MULH:   dpi_trace("MULH");
                `INST_MUL_MULHSU: dpi_trace("MULHSU");
                `INST_MUL_MULHU:  dpi_trace("MULHU");
                `INST_MUL_DIV:    dpi_trace("DIV");
                `INST_MUL_DIVU:   dpi_trace("DIVU");
                `INST_MUL_REM:    dpi_trace("REM");
                `INST_MUL_REMU:   dpi_trace("REMU");
                default:          dpi_trace("?");
            endcase
        end else begin
            case (`INST_ALU_BITS'(op_type))
                `INST_ALU_ADD:   dpi_trace("ADD");
                `INST_ALU_SUB:   dpi_trace("SUB");
                `INST_ALU_SLL:   dpi_trace("SLL");
                `INST_ALU_SRL:   dpi_trace("SRL");
                `INST_ALU_SRA:   dpi_trace("SRA");
                `INST_ALU_SLT:   dpi_trace("SLT");
                `INST_ALU_SLTU:  dpi_trace("SLTU");
                `INST_ALU_XOR:   dpi_trace("XOR");
                `INST_ALU_OR:    dpi_trace("OR");
                `INST_ALU_AND:   dpi_trace("AND");
                `INST_ALU_LUI:   dpi_trace("LUI");
                `INST_ALU_AUIPC: dpi_trace("AUIPC");
                default:         dpi_trace("?");
            endcase
        end
    end
    `EX_LSU: begin
        if (op_mod == 0) begin
            case (`INST_LSU_BITS'(op_type))
                `INST_LSU_LB:  dpi_trace("LB");
                `INST_LSU_LH:  dpi_trace("LH");
                `INST_LSU_LW:  dpi_trace("LW");
                `INST_LSU_LBU: dpi_trace("LBU");
                `INST_LSU_LHU: dpi_trace("LHU");
                `INST_LSU_SB:  dpi_trace("SB");
                `INST_LSU_SH:  dpi_trace("SH");
                `INST_LSU_SW:  dpi_trace("SW");
                default:       dpi_trace("?");
            endcase
        end else if (op_mod == 1) begin
            case (`INST_FENCE_BITS'(op_type))
                `INST_FENCE_D: dpi_trace("DFENCE");
                `INST_FENCE_I: dpi_trace("IFENCE");
                default:       dpi_trace("?");
            endcase
        end else begin
            // Unknown LSU modifier: trace a placeholder like every other decoder.
            dpi_trace("?");
        end
    end
    `EX_CSR: begin
        case (`INST_CSR_BITS'(op_type))
            `INST_CSR_RW: dpi_trace("CSRW");
            `INST_CSR_RS: dpi_trace("CSRS");
            `INST_CSR_RC: dpi_trace("CSRC");
            default:      dpi_trace("?");
        endcase
    end
    `EX_FPU: begin
        case (`INST_FPU_BITS'(op_type))
            `INST_FPU_ADD:    dpi_trace("ADD");
            `INST_FPU_SUB:    dpi_trace("SUB");
            `INST_FPU_MUL:    dpi_trace("MUL");
            `INST_FPU_DIV:    dpi_trace("DIV");
            `INST_FPU_SQRT:   dpi_trace("SQRT");
            `INST_FPU_MADD:   dpi_trace("MADD");
            `INST_FPU_NMSUB:  dpi_trace("NMSUB");
            `INST_FPU_NMADD:  dpi_trace("NMADD");
            `INST_FPU_CVTWS:  dpi_trace("CVTWS");
            `INST_FPU_CVTWUS: dpi_trace("CVTWUS");
            `INST_FPU_CVTSW:  dpi_trace("CVTSW");
            `INST_FPU_CVTSWU: dpi_trace("CVTSWU");
            `INST_FPU_CLASS:  dpi_trace("CLASS");
            `INST_FPU_CMP:    dpi_trace("CMP");
            `INST_FPU_MISC: begin
                case (op_mod)
                    0: dpi_trace("SGNJ");
                    1: dpi_trace("SGNJN");
                    2: dpi_trace("SGNJX");
                    3: dpi_trace("MIN");
                    4: dpi_trace("MAX");
                    5: dpi_trace("MVXW");
                    6: dpi_trace("MVWX");
                    // Unknown misc modifier: previously traced nothing at all.
                    default: dpi_trace("?");
                endcase
            end
            default:          dpi_trace("?");
        endcase
    end
    `EX_GPU: begin
        case (`INST_GPU_BITS'(op_type))
            `INST_GPU_TMC:    dpi_trace("TMC");
            `INST_GPU_WSPAWN: dpi_trace("WSPAWN");
            `INST_GPU_SPLIT:  dpi_trace("SPLIT");
            `INST_GPU_JOIN:   dpi_trace("JOIN");
            `INST_GPU_BAR:    dpi_trace("BAR");
            `INST_GPU_PRED:   dpi_trace("PRED");
            `INST_GPU_TEX:    dpi_trace("TEX");
            default:          dpi_trace("?");
        endcase
    end
    default: dpi_trace("?");
    endcase
endtask
`endif

177
hw/rtl/VX_types.vh Normal file
View File

@@ -0,0 +1,177 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_TYPES_VH
`define VX_TYPES_VH
// Device configuration registers
`define VX_CSR_ADDR_BITS 12
`define VX_DCR_ADDR_BITS 12
`define VX_DCR_BASE_STATE_BEGIN 12'h001
`define VX_DCR_BASE_STARTUP_ADDR0 12'h001
`define VX_DCR_BASE_STARTUP_ADDR1 12'h002
`define VX_DCR_BASE_MPM_CLASS 12'h003
`define VX_DCR_BASE_STATE_END 12'h004
`define VX_DCR_BASE_STATE(addr) ((addr) - `VX_DCR_BASE_STATE_BEGIN)
`define VX_DCR_BASE_STATE_COUNT (`VX_DCR_BASE_STATE_END-`VX_DCR_BASE_STATE_BEGIN)
// Machine Performance-monitoring counters classes
`define VX_DCR_MPM_CLASS_NONE 0
`define VX_DCR_MPM_CLASS_CORE 1
`define VX_DCR_MPM_CLASS_MEM 2
`define VX_DCR_MPM_CLASS_TEX 3
`define VX_DCR_MPM_CLASS_RASTER 4
`define VX_DCR_MPM_CLASS_ROP 5
// User Floating-Point CSRs
`define VX_CSR_FFLAGS 12'h001
`define VX_CSR_FRM 12'h002
`define VX_CSR_FCSR 12'h003
`define VX_CSR_SATP 12'h180
`define VX_CSR_PMPCFG0 12'h3A0
`define VX_CSR_PMPADDR0 12'h3B0
`define VX_CSR_MSTATUS 12'h300
`define VX_CSR_MISA 12'h301
`define VX_CSR_MEDELEG 12'h302
`define VX_CSR_MIDELEG 12'h303
`define VX_CSR_MIE 12'h304
`define VX_CSR_MTVEC 12'h305
`define VX_CSR_MEPC 12'h341
`define VX_CSR_MNSTATUS 12'h744
`define VX_CSR_MPM_BASE 12'hB00
`define VX_CSR_MPM_BASE_H 12'hB80
// Machine Performance-monitoring core counters
// PERF: Standard
`define VX_CSR_MCYCLE 12'hB00
`define VX_CSR_MCYCLE_H 12'hB80
`define VX_CSR_MPM_RESERVED 12'hB01
`define VX_CSR_MPM_RESERVED_H 12'hB81
`define VX_CSR_MINSTRET 12'hB02
`define VX_CSR_MINSTRET_H 12'hB82
// PERF: pipeline
`define VX_CSR_MPM_IBUF_ST 12'hB03
`define VX_CSR_MPM_IBUF_ST_H 12'hB83
`define VX_CSR_MPM_SCRB_ST 12'hB04
`define VX_CSR_MPM_SCRB_ST_H 12'hB84
`define VX_CSR_MPM_ALU_ST 12'hB05
`define VX_CSR_MPM_ALU_ST_H 12'hB85
`define VX_CSR_MPM_LSU_ST 12'hB06
`define VX_CSR_MPM_LSU_ST_H 12'hB86
`define VX_CSR_MPM_FPU_ST 12'hB07
`define VX_CSR_MPM_FPU_ST_H 12'hB87
`define VX_CSR_MPM_SFU_ST 12'hB08
`define VX_CSR_MPM_SFU_ST_H 12'hB88
// PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0A
`define VX_CSR_MPM_IFETCHES_H 12'hB8A
`define VX_CSR_MPM_LOADS 12'hB0B
`define VX_CSR_MPM_LOADS_H 12'hB8B
`define VX_CSR_MPM_STORES 12'hB0C
`define VX_CSR_MPM_STORES_H 12'hB8C
`define VX_CSR_MPM_IFETCH_LAT 12'hB0D
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB8D
`define VX_CSR_MPM_LOAD_LAT 12'hB0E
`define VX_CSR_MPM_LOAD_LAT_H 12'hB8E
// Machine Performance-monitoring memory counters
// PERF: icache
`define VX_CSR_MPM_ICACHE_READS 12'hB03 // total reads
`define VX_CSR_MPM_ICACHE_READS_H 12'hB83
`define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses
`define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84
// PERF: dcache
`define VX_CSR_MPM_DCACHE_READS 12'hB05 // total reads
`define VX_CSR_MPM_DCACHE_READS_H 12'hB85
`define VX_CSR_MPM_DCACHE_WRITES 12'hB06 // total writes
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB86
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB87
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB88
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB89
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A
// PERF: smem
`define VX_CSR_MPM_SMEM_READS 12'hB0B // memory reads
`define VX_CSR_MPM_SMEM_READS_H 12'hB8B
`define VX_CSR_MPM_SMEM_WRITES 12'hB0C // memory writes
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB8C
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB8D
// PERF: l2cache
`define VX_CSR_MPM_L2CACHE_READS 12'hB0E // total reads
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8E
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8F
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB90
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB91
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB92
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93
// PERF: l3cache
`define VX_CSR_MPM_L3CACHE_READS 12'hB14 // total reads
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB94
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB95
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB96
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB97
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB98
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99
// PERF: memory
`define VX_CSR_MPM_MEM_READS 12'hB1A // total reads
`define VX_CSR_MPM_MEM_READS_H 12'hB9A
`define VX_CSR_MPM_MEM_WRITES 12'hB1B // total writes
`define VX_CSR_MPM_MEM_WRITES_H 12'hB9B
`define VX_CSR_MPM_MEM_LAT 12'hB1C // memory latency
`define VX_CSR_MPM_MEM_LAT_H 12'hB9C
// Machine Information Registers
`define VX_CSR_MVENDORID 12'hF11
`define VX_CSR_MARCHID 12'hF12
`define VX_CSR_MIMPID 12'hF13
`define VX_CSR_MHARTID 12'hF14
// GPGPU CSRs
`define VX_CSR_THREAD_ID 12'hCC0
`define VX_CSR_WARP_ID 12'hCC1
`define VX_CSR_CORE_ID 12'hCC2
`define VX_CSR_WARP_MASK 12'hCC3
`define VX_CSR_THREAD_MASK 12'hCC4 // warning! this value is also used in LLVM
`define VX_CSR_NUM_THREADS 12'hFC0
`define VX_CSR_NUM_WARPS 12'hFC1
`define VX_CSR_NUM_CORES 12'hFC2
`endif // VX_TYPES_VH

View File

@@ -1,254 +0,0 @@
`include "VX_define.vh"
// VX_warp_sched: per-core warp scheduler.
// Keeps, for every warp, an active bit, a stall bit, a thread mask and a PC;
// selects one ready warp with VX_lzc and emits a fetch request for it through
// a pipeline register. Also maintains barrier wait masks and one split/join
// (IPDOM) stack per warp.
module VX_warp_sched #(
parameter CORE_ID = 0
) (
`SCOPE_IO_VX_warp_sched
input wire clk,
input wire reset,
// warp control requests (wspawn / tmc / barrier / split), qualified by .valid and .wid
VX_warp_ctl_if.slave warp_ctl_if,
// explicit stall update for one warp
VX_wstall_if.slave wstall_if,
// join request: pops the IPDOM stack of join_if.wid
VX_join_if.slave join_if,
// branch resolution: optional PC redirect, always releases the warp's stall
VX_branch_ctl_if.slave branch_ctl_if,
// outgoing fetch request (valid/ready handshake)
VX_ifetch_req_if.master ifetch_req_if,
// exports the thread mask registers
VX_fetch_to_csr_if.master fetch_to_csr_if,
// asserted while any warp is active
output wire busy
);
// NOTE(review): CORE_ID is flagged unused here but is referenced by instr_uuid below.
`UNUSED_PARAM (CORE_ID)
// top entry of the IPDOM stack selected by join_if.wid (see ipdom_data / ipdom_index below)
wire join_else;
wire [31:0] join_pc;
wire [`NUM_THREADS-1:0] join_tmask;
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // real active warps (updated when a warp is activated or disabled)
reg [`NUM_WARPS-1:0] stalled_warps; // asserted when a branch/gpgpu instructions are issued
reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks;
reg [`NUM_WARPS-1:0][31:0] warp_pcs;
// barriers
reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks; // warps waiting on barrier
wire reached_barrier_limit; // the expected number of warps reached the barrier
// wspawn
reg [31:0] wspawn_pc;
reg [`NUM_WARPS-1:0] use_wspawn;
// warp selected for this cycle and the mask/PC it will be fetched with
wire [`NW_BITS-1:0] schedule_wid;
wire [`NUM_THREADS-1:0] schedule_tmask;
wire [31:0] schedule_pc;
wire schedule_valid;
wire warp_scheduled;
// count of scheduled warps; used to derive the per-instruction uuid
reg [`UUID_BITS-1:0] issued_instrs;
wire ifetch_req_fire = ifetch_req_if.valid && ifetch_req_if.ready;
// a tmc request with an all-zero thread mask deactivates the warp
wire tmc_active = (warp_ctl_if.tmc.tmask != 0);
// Next-state of the active-warp mask.
// wspawn replaces the whole mask with wmask; a tmc in the same request then
// overrides the bit of the issuing warp (second 'if' is evaluated last).
always @(*) begin
active_warps_n = active_warps;
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
active_warps_n = warp_ctl_if.wspawn.wmask;
end
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
active_warps_n[warp_ctl_if.wid] = tmc_active;
end
end
// Scheduler state update.
// Several branches below write the same stalled_warps / thread_masks / warp_pcs
// entries with non-blocking assignments; when more than one fires in a cycle
// the textually last one takes effect, so the statement order is significant.
always @(posedge clk) begin
if (reset) begin
barrier_masks <= '0;
use_wspawn <= '0;
stalled_warps <= '0;
warp_pcs <= '0;
active_warps <= '0;
thread_masks <= '0;
issued_instrs <= '0;
// activate first warp
// (these writes override the '0 defaults above for warp 0: one thread at STARTUP_ADDR)
warp_pcs[0] <= `STARTUP_ADDR;
active_warps[0] <= 1;
thread_masks[0] <= 1;
end else begin
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
// bit 0 is masked out, so warp 0 never picks up the wspawn PC
use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1));
wspawn_pc <= warp_ctl_if.wspawn.pc;
end
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
stalled_warps[warp_ctl_if.wid] <= 0;
if (reached_barrier_limit) begin
// last expected warp arrived: release every warp waiting on this barrier
barrier_masks[warp_ctl_if.barrier.id] <= 0;
end else begin
// mark this warp as waiting; barrier_stalls keeps it out of ready_warps
barrier_masks[warp_ctl_if.barrier.id][warp_ctl_if.wid] <= 1;
end
end
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.tmc.tmask;
stalled_warps[warp_ctl_if.wid] <= 0;
end
if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
stalled_warps[warp_ctl_if.wid] <= 0;
if (warp_ctl_if.split.diverged) begin
// continue with the 'then' threads; the 'else' side is pushed on the IPDOM stack below
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.split.then_tmask;
end
end
// Branch
if (branch_ctl_if.valid) begin
if (branch_ctl_if.taken) begin
warp_pcs[branch_ctl_if.wid] <= branch_ctl_if.dest;
end
stalled_warps[branch_ctl_if.wid] <= 0;
end
if (warp_scheduled) begin
// stall the warp until decode stage
stalled_warps[schedule_wid] <= 1;
// release wspawn
use_wspawn[schedule_wid] <= 0;
if (use_wspawn[schedule_wid]) begin
// a freshly spawned warp starts with only thread 0 enabled
thread_masks[schedule_wid] <= 1;
end
issued_instrs <= issued_instrs + 1;
end
if (ifetch_req_fire) begin
// sequential next PC for the warp whose fetch request was accepted
warp_pcs[ifetch_req_if.wid] <= ifetch_req_if.PC + 4;
end
if (wstall_if.valid) begin
stalled_warps[wstall_if.wid] <= wstall_if.stalled;
end
// join handling
if (join_if.valid) begin
if (join_else) begin
// redirect to the PC stored in the popped stack entry
warp_pcs[join_if.wid] <= join_pc;
end
thread_masks[join_if.wid] <= join_tmask;
end
active_warps <= active_warps_n;
end
end
// export thread mask register
assign fetch_to_csr_if.thread_masks = thread_masks;
// calculate active barrier status
`IGNORE_UNUSED_BEGIN
wire [`NW_BITS:0] active_barrier_count;
`IGNORE_UNUSED_END
wire [`NUM_WARPS-1:0] barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
`POP_COUNT(active_barrier_count, barrier_mask);
// compares the number of warps already waiting (low NW_BITS only) with size_m1
assign reached_barrier_limit = (active_barrier_count[`NW_BITS-1:0] == warp_ctl_if.barrier.size_m1);
// union of all barrier wait masks: a warp waiting on any barrier is not schedulable
reg [`NUM_WARPS-1:0] barrier_stalls;
always @(*) begin
barrier_stalls = barrier_masks[0];
for (integer i = 1; i < `NUM_BARRIERS; ++i) begin
barrier_stalls |= barrier_masks[i];
end
end
// split/join stack management
// one VX_ipdom_stack per warp; each entry is {pc[31:0], tmask}
wire [(32+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
wire ipdom_index [`NUM_WARPS-1:0];
for (genvar i = 0; i < `NUM_WARPS; i++) begin
// push on a split issued by warp i, pop on a join for warp i
wire push = warp_ctl_if.valid
&& warp_ctl_if.split.valid
&& (i == warp_ctl_if.wid);
wire pop = join_if.valid && (i == join_if.wid);
wire [`NUM_THREADS-1:0] else_tmask = warp_ctl_if.split.else_tmask;
// indexed by the requesting warp, which equals i whenever push is asserted
wire [`NUM_THREADS-1:0] orig_tmask = thread_masks[warp_ctl_if.wid];
// q_else: resume point of the 'else' threads; q_end: pre-split mask with a zero PC
wire [(32+`NUM_THREADS)-1:0] q_else = {warp_ctl_if.split.pc, else_tmask};
wire [(32+`NUM_THREADS)-1:0] q_end = {32'b0, orig_tmask};
// NOTE(review): presumably 'pair' makes the stack hold both q1 and q2 and
// 'index' tells which of the two is being returned - confirm in VX_ipdom_stack.
VX_ipdom_stack #(
.WIDTH (32+`NUM_THREADS),
.DEPTH (2 ** (`NT_BITS+1))
) ipdom_stack (
.clk (clk),
.reset (reset),
.push (push),
.pop (pop),
.pair (warp_ctl_if.split.diverged),
.q1 (q_end),
.q2 (q_else),
.d (ipdom_data[i]),
.index (ipdom_index[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (full)
);
end
assign {join_pc, join_tmask} = ipdom_data[join_if.wid];
// the PC redirect on join is taken only when the stack reports index == 0
assign join_else = ~ipdom_index[join_if.wid];
// schedule the next ready warp
// ready = active, not stalled, and not waiting on any barrier
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
VX_lzc #(
.N (`NUM_WARPS)
) wid_select (
.in_i (ready_warps),
.cnt_o (schedule_wid),
.valid_o (schedule_valid)
);
// per-warp {tmask, pc} candidates; warps flagged in use_wspawn start at
// wspawn_pc with only thread 0 enabled
wire [`NUM_WARPS-1:0][(`NUM_THREADS + 32)-1:0] schedule_data;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign schedule_data[i] = {(use_wspawn[i] ? `NUM_THREADS'(1) : thread_masks[i]),
(use_wspawn[i] ? wspawn_pc : warp_pcs[i])};
end
assign {schedule_tmask, schedule_pc} = schedule_data[schedule_wid];
// the output register is held while a pending request has not been accepted
wire stall_out = ~ifetch_req_if.ready && ifetch_req_if.valid;
assign warp_scheduled = schedule_valid && ~stall_out;
// uuid = issue count scaled by the total core count, offset by CORE_ID
wire [`UUID_BITS-1:0] instr_uuid = (issued_instrs * `NUM_CORES * `NUM_CLUSTERS) + `UUID_BITS'(CORE_ID);
// output stage: RESETW(1) covers the leading valid bit of the payload
VX_pipe_register #(
.DATAW (1 + `UUID_BITS + `NUM_THREADS + 32 + `NW_BITS),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (!stall_out),
.data_in ({schedule_valid, instr_uuid, schedule_tmask, schedule_pc, schedule_wid}),
.data_out ({ifetch_req_if.valid, ifetch_req_if.uuid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid})
);
assign busy = (active_warps != 0);
// debug scope taps
`SCOPE_ASSIGN (wsched_scheduled, warp_scheduled);
`SCOPE_ASSIGN (wsched_schedule_uuid, instr_uuid);
`SCOPE_ASSIGN (wsched_active_warps, active_warps);
`SCOPE_ASSIGN (wsched_stalled_warps, stalled_warps);
`SCOPE_ASSIGN (wsched_schedule_wid, schedule_wid);
`SCOPE_ASSIGN (wsched_schedule_tmask, schedule_tmask);
`SCOPE_ASSIGN (wsched_schedule_pc, schedule_pc);
endmodule

View File

@@ -1,113 +0,0 @@
`include "VX_define.vh"
// VX_writeback: merges the register-writeback traffic of all commit sources
// (load, optional FPU, ALU, CSR, GPU) into the single writeback_if channel.
// Only commits with .wb set take part in arbitration; the selected commit is
// registered before being driven on writeback_if.
module VX_writeback #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_commit_if.slave alu_commit_if,
VX_commit_if.slave ld_commit_if,
VX_commit_if.slave csr_commit_if,
`ifdef EXT_F_ENABLE
VX_commit_if.slave fpu_commit_if,
`endif
VX_commit_if.slave gpu_commit_if,
// outputs
VX_writeback_if.master writeback_if
);
`UNUSED_PARAM (CORE_ID)
// payload width: wid + PC(32) + tmask + rd + per-thread data (32 bits each) + eop
localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1;
// number of arbitrated sources (the FPU is present only with EXT_F_ENABLE)
`ifdef EXT_F_ENABLE
localparam NUM_RSPS = 5;
`else
localparam NUM_RSPS = 4;
`endif
// arbiter output (selected commit)
wire wb_valid;
wire [`NW_BITS-1:0] wb_wid;
wire [31:0] wb_PC;
wire [`NUM_THREADS-1:0] wb_tmask;
wire [`NR_BITS-1:0] wb_rd;
wire [`NUM_THREADS-1:0][31:0] wb_data;
wire wb_eop;
// arbiter inputs, one slot per source
wire [NUM_RSPS-1:0] rsp_valid;
wire [NUM_RSPS-1:0][DATAW-1:0] rsp_data;
wire [NUM_RSPS-1:0] rsp_ready;
wire stall;
// Slot order follows the concatenation: the last element is index 0.
// ld=0, then (fpu=1, alu=2, csr=3, gpu=4) with EXT_F_ENABLE,
// or (alu=1, csr=2, gpu=3) without it. rsp_data and the ready
// assignments below must use the same order.
assign rsp_valid = {
gpu_commit_if.valid && gpu_commit_if.wb,
csr_commit_if.valid && csr_commit_if.wb,
alu_commit_if.valid && alu_commit_if.wb,
`ifdef EXT_F_ENABLE
fpu_commit_if.valid && fpu_commit_if.wb,
`endif
ld_commit_if.valid && ld_commit_if.wb
};
assign rsp_data = {
{gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop},
{csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop},
{alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop},
`ifdef EXT_F_ENABLE
{fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.rd, fpu_commit_if.data, fpu_commit_if.eop},
`endif
{ ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.rd, ld_commit_if.data, ld_commit_if.eop}
};
// NOTE(review): TYPE "R" presumably selects round-robin arbitration - confirm in VX_stream_arbiter.
VX_stream_arbiter #(
.NUM_REQS (NUM_RSPS),
.DATAW (DATAW),
.BUFFERED (1),
.TYPE ("R")
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (rsp_valid),
.data_in (rsp_data),
.ready_in (rsp_ready),
.valid_out (wb_valid),
.data_out ({wb_wid, wb_PC, wb_tmask, wb_rd, wb_data, wb_eop}),
.ready_out (~stall)
);
// A source is ready when the arbiter accepts its slot, or unconditionally
// when its commit does not write a register (it never entered arbitration).
assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb;
`ifdef EXT_F_ENABLE
assign fpu_commit_if.ready = rsp_ready[1] || ~fpu_commit_if.wb;
assign alu_commit_if.ready = rsp_ready[2] || ~alu_commit_if.wb;
assign csr_commit_if.ready = rsp_ready[3] || ~csr_commit_if.wb;
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
`else
assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb;
assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb;
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
`endif
// hold the output register while a valid writeback has not been accepted
assign stall = ~writeback_if.ready && writeback_if.valid;
// output stage: RESETW(1) covers the leading valid bit of the payload
VX_pipe_register #(
.DATAW (1 + DATAW),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({wb_valid, wb_wid, wb_PC, wb_tmask, wb_rd, wb_data, wb_eop}),
.data_out ({writeback_if.valid, writeback_if.wid, writeback_if.PC, writeback_if.tmask, writeback_if.rd, writeback_if.data, writeback_if.eop})
);
// special workaround to get RISC-V tests Pass/Fail status
// Records, per destination register, the thread-0 value of the most recent
// accepted writeback; exposed to the simulator via 'verilator public'.
// Not reset, and not qualified by tmask or wid.
reg [31:0] last_wb_value [`NUM_REGS-1:0] /* verilator public */;
always @(posedge clk) begin
if (writeback_if.valid && writeback_if.ready) begin
last_wb_value[writeback_if.rd] <= writeback_if.data[0];
end
end
endmodule

View File

@@ -1,7 +1,20 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module Vortex (
`SCOPE_IO_Vortex
module Vortex import VX_gpu_pkg::*; (
`SCOPE_IO_DECL
// Clock
input wire clk,
@@ -22,204 +35,224 @@ module Vortex (
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// DCR write request
input wire dcr_wr_valid,
input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
input wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data,
// Status
output wire busy
);
`STATIC_ASSERT((`L3_ENABLE == 0 || `NUM_CLUSTERS > 1), ("invalid parameter"))
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_valid;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_rw;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_BYTEEN_WIDTH-1:0] per_cluster_mem_req_byteen;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_ADDR_WIDTH-1:0] per_cluster_mem_req_addr;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_DATA_WIDTH-1:0] per_cluster_mem_req_data;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_TAG_WIDTH-1:0] per_cluster_mem_req_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_ready;
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if[`NUM_CLUSTERS]();
VX_mem_perf_if perf_memsys_total_if();
VX_cache_perf_if perf_l3cache_if();
`endif
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_valid;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_DATA_WIDTH-1:0] per_cluster_mem_rsp_data;
wire [`NUM_CLUSTERS-1:0][`L2_MEM_TAG_WIDTH-1:0] per_cluster_mem_rsp_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_ready;
VX_mem_bus_if #(
.DATA_SIZE (`L3_LINE_SIZE),
.TAG_WIDTH (L3_MEM_TAG_WIDTH)
) mem_bus_if();
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen= mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
`RESET_RELAY (cluster_reset);
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
wire sim_ebreak /* verilator public */;
wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value /* verilator public */;
wire [`NUM_CLUSTERS-1:0] per_cluster_sim_ebreak;
wire [`NUM_CLUSTERS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_cluster_sim_wb_value;
assign sim_ebreak = per_cluster_sim_ebreak[0];
assign sim_wb_value = per_cluster_sim_wb_value[0];
`UNUSED_VAR (per_cluster_sim_ebreak)
`UNUSED_VAR (per_cluster_sim_wb_value)
VX_mem_bus_if #(
.DATA_SIZE (`L2_LINE_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH)
) per_cluster_mem_bus_if[`NUM_CLUSTERS]();
VX_dcr_bus_if dcr_bus_if();
assign dcr_bus_if.write_valid = dcr_wr_valid;
assign dcr_bus_if.write_addr = dcr_wr_addr;
assign dcr_bus_if.write_data = dcr_wr_data;
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
`SCOPE_IO_SWITCH (`NUM_CLUSTERS)
// Generate all clusters
for (genvar i = 0; i < `NUM_CLUSTERS; ++i) begin
`RESET_RELAY (cluster_reset, reset);
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1));
VX_cluster #(
.CLUSTER_ID(i)
.CLUSTER_ID (i)
) cluster (
`SCOPE_BIND_Vortex_cluster(i)
`SCOPE_IO_BIND (i)
.clk (clk),
.reset (cluster_reset),
.mem_req_valid (per_cluster_mem_req_valid [i]),
.mem_req_rw (per_cluster_mem_req_rw [i]),
.mem_req_byteen (per_cluster_mem_req_byteen[i]),
.mem_req_addr (per_cluster_mem_req_addr [i]),
.mem_req_data (per_cluster_mem_req_data [i]),
.mem_req_tag (per_cluster_mem_req_tag [i]),
.mem_req_ready (per_cluster_mem_req_ready [i]),
.mem_rsp_valid (per_cluster_mem_rsp_valid [i]),
.mem_rsp_data (per_cluster_mem_rsp_data [i]),
.mem_rsp_tag (per_cluster_mem_rsp_tag [i]),
.mem_rsp_ready (per_cluster_mem_rsp_ready [i]),
.busy (per_cluster_busy [i])
);
end
assign busy = (| per_cluster_busy);
if (`L3_ENABLE) begin
`ifdef PERF_ENABLE
VX_perf_cache_if perf_l3cache_if();
`endif
`RESET_RELAY (l3_reset);
VX_cache #(
.CACHE_ID (`L3_CACHE_ID),
.CACHE_SIZE (`L3_CACHE_SIZE),
.CACHE_LINE_SIZE (`L3_CACHE_LINE_SIZE),
.NUM_BANKS (`L3_NUM_BANKS),
.NUM_PORTS (`L3_NUM_PORTS),
.WORD_SIZE (`L3_WORD_SIZE),
.NUM_REQS (`L3_NUM_REQS),
.CREQ_SIZE (`L3_CREQ_SIZE),
.CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE),
.MREQ_SIZE (`L3_MREQ_SIZE),
.WRITE_ENABLE (1),
.CORE_TAG_WIDTH (`L2_MEM_TAG_WIDTH),
.CORE_TAG_ID_BITS (0),
.MEM_TAG_WIDTH (`L3_MEM_TAG_WIDTH),
.NC_ENABLE (1)
) l3cache (
`SCOPE_BIND_Vortex_l3cache
.clk (clk),
.reset (l3_reset),
.reset (cluster_reset),
`ifdef PERF_ENABLE
.perf_cache_if (perf_l3cache_if),
.mem_perf_if (mem_perf_if[i]),
.perf_memsys_total_if (perf_memsys_total_if),
`endif
// Core request
.core_req_valid (per_cluster_mem_req_valid),
.core_req_rw (per_cluster_mem_req_rw),
.core_req_byteen (per_cluster_mem_req_byteen),
.core_req_addr (per_cluster_mem_req_addr),
.core_req_data (per_cluster_mem_req_data),
.core_req_tag (per_cluster_mem_req_tag),
.core_req_ready (per_cluster_mem_req_ready),
// Core response
.core_rsp_valid (per_cluster_mem_rsp_valid),
.core_rsp_data (per_cluster_mem_rsp_data),
.core_rsp_tag (per_cluster_mem_rsp_tag),
.core_rsp_ready (per_cluster_mem_rsp_ready),
`UNUSED_PIN (core_rsp_tmask),
// Memory request
.mem_req_valid (mem_req_valid),
.mem_req_rw (mem_req_rw),
.mem_req_byteen (mem_req_byteen),
.mem_req_addr (mem_req_addr),
.mem_req_data (mem_req_data),
.mem_req_tag (mem_req_tag),
.mem_req_ready (mem_req_ready),
// Memory response
.mem_rsp_valid (mem_rsp_valid),
.mem_rsp_data (mem_rsp_data),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready)
);
end else begin
`RESET_RELAY (mem_arb_reset);
VX_mem_arb #(
.NUM_REQS (`NUM_CLUSTERS),
.DATA_WIDTH (`L3_MEM_DATA_WIDTH),
.ADDR_WIDTH (`L3_MEM_ADDR_WIDTH),
.TAG_IN_WIDTH (`L2_MEM_TAG_WIDTH),
.TYPE ("R"),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
// Core request
.req_valid_in (per_cluster_mem_req_valid),
.req_rw_in (per_cluster_mem_req_rw),
.req_byteen_in (per_cluster_mem_req_byteen),
.req_addr_in (per_cluster_mem_req_addr),
.req_data_in (per_cluster_mem_req_data),
.req_tag_in (per_cluster_mem_req_tag),
.req_ready_in (per_cluster_mem_req_ready),
// Memory request
.req_valid_out (mem_req_valid),
.req_rw_out (mem_req_rw),
.req_byteen_out (mem_req_byteen),
.req_addr_out (mem_req_addr),
.req_data_out (mem_req_data),
.req_tag_out (mem_req_tag),
.req_ready_out (mem_req_ready),
// Core response
.rsp_valid_out (per_cluster_mem_rsp_valid),
.rsp_data_out (per_cluster_mem_rsp_data),
.rsp_tag_out (per_cluster_mem_rsp_tag),
.rsp_ready_out (per_cluster_mem_rsp_ready),
// Memory response
.rsp_valid_in (mem_rsp_valid),
.rsp_tag_in (mem_rsp_tag),
.rsp_data_in (mem_rsp_data),
.rsp_ready_in (mem_rsp_ready)
);
.dcr_bus_if (cluster_dcr_bus_if),
.mem_bus_if (per_cluster_mem_bus_if[i]),
.sim_ebreak (per_cluster_sim_ebreak[i]),
.sim_wb_value (per_cluster_sim_wb_value[i]),
.busy (per_cluster_busy[i])
);
end
`SCOPE_ASSIGN (reset, reset);
`SCOPE_ASSIGN (mem_req_fire, mem_req_valid && mem_req_ready);
`SCOPE_ASSIGN (mem_req_addr, `TO_FULL_ADDR(mem_req_addr));
`SCOPE_ASSIGN (mem_req_rw, mem_req_rw);
`SCOPE_ASSIGN (mem_req_byteen, mem_req_byteen);
`SCOPE_ASSIGN (mem_req_data, mem_req_data);
`SCOPE_ASSIGN (mem_req_tag, mem_req_tag);
`SCOPE_ASSIGN (mem_rsp_fire, mem_rsp_valid && mem_rsp_ready);
`SCOPE_ASSIGN (mem_rsp_data, mem_rsp_data);
`SCOPE_ASSIGN (mem_rsp_tag, mem_rsp_tag);
`SCOPE_ASSIGN (busy, busy);
`BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1));
`RESET_RELAY (l3_reset, reset);
VX_cache_wrap #(
.INSTANCE_ID ("l3cache"),
.CACHE_SIZE (`L3_CACHE_SIZE),
.LINE_SIZE (`L3_LINE_SIZE),
.NUM_BANKS (`L3_NUM_BANKS),
.NUM_WAYS (`L3_NUM_WAYS),
.WORD_SIZE (L3_WORD_SIZE),
.NUM_REQS (L3_NUM_REQS),
.CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE),
.MREQ_SIZE (`L3_MREQ_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
.WRITE_ENABLE (1),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_REG (2),
.MEM_OUT_REG (2),
.NC_ENABLE (1),
.PASSTHRU (!`L3_ENABLED)
) l3cache (
.clk (clk),
.reset (l3_reset),
`ifdef PERF_ENABLE
.cache_perf_if (perf_l3cache_if),
`endif
.core_bus_if (per_cluster_mem_bus_if),
.mem_bus_if (mem_bus_if)
);
`ifdef PERF_ENABLE
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, icache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, icache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_writes, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_write_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_mshr_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_writes, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_writes, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_write_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_mshr_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
`ifdef L3_ENABLE
assign perf_memsys_total_if.l3cache_reads = perf_l3cache_if.reads;
assign perf_memsys_total_if.l3cache_writes = perf_l3cache_if.writes;
assign perf_memsys_total_if.l3cache_read_misses = perf_l3cache_if.read_misses;
assign perf_memsys_total_if.l3cache_write_misses= perf_l3cache_if.write_misses;
assign perf_memsys_total_if.l3cache_bank_stalls = perf_l3cache_if.bank_stalls;
assign perf_memsys_total_if.l3cache_mshr_stalls = perf_l3cache_if.mshr_stalls;
`else
assign perf_memsys_total_if.l3cache_reads = '0;
assign perf_memsys_total_if.l3cache_writes = '0;
assign perf_memsys_total_if.l3cache_read_misses = '0;
assign perf_memsys_total_if.l3cache_write_misses= '0;
assign perf_memsys_total_if.l3cache_bank_stalls = '0;
assign perf_memsys_total_if.l3cache_mshr_stalls = '0;
`endif
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
always @(posedge clk) begin
if (reset) begin
perf_mem_pending_reads <= '0;
end else begin
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
`PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire)));
end
end
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
always @(posedge clk) begin
if (reset) begin
perf_mem_reads <= '0;
perf_mem_writes <= '0;
perf_mem_lat <= '0;
end else begin
if (mem_req_fire && ~mem_bus_if.req_data.rw) begin
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(1);
end
if (mem_req_fire && mem_bus_if.req_data.rw) begin
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'(1);
end
perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads;
end
end
assign perf_memsys_total_if.mem_reads = perf_mem_reads;
assign perf_memsys_total_if.mem_writes = perf_mem_writes;
assign perf_memsys_total_if.mem_latency = perf_mem_lat;
`endif
`ifdef DBG_TRACE_CORE_MEM
always @(posedge clk) begin
if (mem_req_valid && mem_req_ready) begin
if (mem_req_fire) begin
if (mem_req_rw)
dpi_trace("%d: MEM Wr Req: addr=%0h, tag=%0h, byteen=%0h data=%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data);
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h data=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
else
dpi_trace("%d: MEM Rd Req: addr=%0h, tag=%0h, byteen=%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen);
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
end
if (mem_rsp_valid && mem_rsp_ready) begin
dpi_trace("%d: MEM Rsp: tag=%0h, data=%0h\n", $time, mem_rsp_tag, mem_rsp_data);
if (mem_rsp_fire) begin
`TRACE(1, ("%d: MEM Rsp: tag=0x%0h, data=0x%0h\n", $time, mem_rsp_tag, mem_rsp_data));
end
end
`endif
`ifndef NDEBUG
`ifdef SIMULATION
always @(posedge clk) begin
$fflush(); // flush stdout buffer
end
`endif
endmodule
endmodule

View File

@@ -1,65 +1,91 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module Vortex_axi #(
parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH,
parameter AXI_ADDR_WIDTH = 32,
parameter AXI_TID_WIDTH = `VX_MEM_TAG_WIDTH,
parameter AXI_STROBE_WIDTH = (`VX_MEM_DATA_WIDTH / 8)
module Vortex_axi import VX_gpu_pkg::*; #(
parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH,
parameter AXI_ADDR_WIDTH = `XLEN,
parameter AXI_TID_WIDTH = `VX_MEM_TAG_WIDTH,
parameter AXI_NUM_BANKS = 1
)(
`SCOPE_IO_DECL
// Clock
input wire clk,
input wire reset,
// AXI write request address channel
output wire [AXI_TID_WIDTH-1:0] m_axi_awid,
output wire [AXI_ADDR_WIDTH-1:0] m_axi_awaddr,
output wire [7:0] m_axi_awlen,
output wire [2:0] m_axi_awsize,
output wire [1:0] m_axi_awburst,
output wire m_axi_awlock,
output wire [3:0] m_axi_awcache,
output wire [2:0] m_axi_awprot,
output wire [3:0] m_axi_awqos,
output wire m_axi_awvalid,
input wire m_axi_awready,
output wire m_axi_awvalid [AXI_NUM_BANKS],
input wire m_axi_awready [AXI_NUM_BANKS],
output wire [AXI_ADDR_WIDTH-1:0] m_axi_awaddr [AXI_NUM_BANKS],
output wire [AXI_TID_WIDTH-1:0] m_axi_awid [AXI_NUM_BANKS],
output wire [7:0] m_axi_awlen [AXI_NUM_BANKS],
output wire [2:0] m_axi_awsize [AXI_NUM_BANKS],
output wire [1:0] m_axi_awburst [AXI_NUM_BANKS],
output wire [1:0] m_axi_awlock [AXI_NUM_BANKS],
output wire [3:0] m_axi_awcache [AXI_NUM_BANKS],
output wire [2:0] m_axi_awprot [AXI_NUM_BANKS],
output wire [3:0] m_axi_awqos [AXI_NUM_BANKS],
output wire [3:0] m_axi_awregion [AXI_NUM_BANKS],
// AXI write request data channel
output wire [AXI_DATA_WIDTH-1:0] m_axi_wdata,
output wire [AXI_STROBE_WIDTH-1:0] m_axi_wstrb,
output wire m_axi_wlast,
output wire m_axi_wvalid,
input wire m_axi_wready,
output wire m_axi_wvalid [AXI_NUM_BANKS],
input wire m_axi_wready [AXI_NUM_BANKS],
output wire [AXI_DATA_WIDTH-1:0] m_axi_wdata [AXI_NUM_BANKS],
output wire [AXI_DATA_WIDTH/8-1:0] m_axi_wstrb [AXI_NUM_BANKS],
output wire m_axi_wlast [AXI_NUM_BANKS],
// AXI write response channel
input wire [AXI_TID_WIDTH-1:0] m_axi_bid,
input wire [1:0] m_axi_bresp,
input wire m_axi_bvalid,
output wire m_axi_bready,
input wire m_axi_bvalid [AXI_NUM_BANKS],
output wire m_axi_bready [AXI_NUM_BANKS],
input wire [AXI_TID_WIDTH-1:0] m_axi_bid [AXI_NUM_BANKS],
input wire [1:0] m_axi_bresp [AXI_NUM_BANKS],
// AXI read request channel
output wire [AXI_TID_WIDTH-1:0] m_axi_arid,
output wire [AXI_ADDR_WIDTH-1:0] m_axi_araddr,
output wire [7:0] m_axi_arlen,
output wire [2:0] m_axi_arsize,
output wire [1:0] m_axi_arburst,
output wire m_axi_arlock,
output wire [3:0] m_axi_arcache,
output wire [2:0] m_axi_arprot,
output wire [3:0] m_axi_arqos,
output wire m_axi_arvalid,
input wire m_axi_arready,
output wire m_axi_arvalid [AXI_NUM_BANKS],
input wire m_axi_arready [AXI_NUM_BANKS],
output wire [AXI_ADDR_WIDTH-1:0] m_axi_araddr [AXI_NUM_BANKS],
output wire [AXI_TID_WIDTH-1:0] m_axi_arid [AXI_NUM_BANKS],
output wire [7:0] m_axi_arlen [AXI_NUM_BANKS],
output wire [2:0] m_axi_arsize [AXI_NUM_BANKS],
output wire [1:0] m_axi_arburst [AXI_NUM_BANKS],
output wire [1:0] m_axi_arlock [AXI_NUM_BANKS],
output wire [3:0] m_axi_arcache [AXI_NUM_BANKS],
output wire [2:0] m_axi_arprot [AXI_NUM_BANKS],
output wire [3:0] m_axi_arqos [AXI_NUM_BANKS],
output wire [3:0] m_axi_arregion [AXI_NUM_BANKS],
// AXI read response channel
input wire [AXI_TID_WIDTH-1:0] m_axi_rid,
input wire [AXI_DATA_WIDTH-1:0] m_axi_rdata,
input wire [1:0] m_axi_rresp,
input wire m_axi_rlast,
input wire m_axi_rvalid,
output wire m_axi_rready,
input wire m_axi_rvalid [AXI_NUM_BANKS],
output wire m_axi_rready [AXI_NUM_BANKS],
input wire [AXI_DATA_WIDTH-1:0] m_axi_rdata [AXI_NUM_BANKS],
input wire m_axi_rlast [AXI_NUM_BANKS],
input wire [AXI_TID_WIDTH-1:0] m_axi_rid [AXI_NUM_BANKS],
input wire [1:0] m_axi_rresp [AXI_NUM_BANKS],
// DCR write request
input wire dcr_wr_valid,
input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
input wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data,
// Status
output wire busy
);
// Elaboration-time checks on the AXI parameters.
// The AXI data bus must match the Vortex memory data width exactly.
`STATIC_ASSERT((AXI_DATA_WIDTH == `VX_MEM_DATA_WIDTH), ("invalid memory data size: current=%0d, expected=%0d", AXI_DATA_WIDTH, `VX_MEM_DATA_WIDTH))
// The AXI address bus must be able to carry a full XLEN-bit byte address;
// report the bound that is actually checked (XLEN) as the expected value.
`STATIC_ASSERT((AXI_ADDR_WIDTH >= `XLEN), ("invalid memory address size: current=%0d, expected=%0d", AXI_ADDR_WIDTH, `XLEN))
//`STATIC_ASSERT((AXI_TID_WIDTH >= `VX_MEM_TAG_WIDTH), ("invalid memory tag size: current=%0d, expected=%0d", AXI_TID_WIDTH, `VX_MEM_TAG_WIDTH))
wire mem_req_valid;
wire mem_req_rw;
wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen;
@@ -72,16 +98,33 @@ module Vortex_axi #(
wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data;
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_ready;
wire [`XLEN-1:0] m_axi_awaddr_unqual [AXI_NUM_BANKS];
wire [`XLEN-1:0] m_axi_araddr_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_awid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_arid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_bid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_rid_unqual [AXI_NUM_BANKS];
// Width adaptation between the adapter-side ("_unqual") buses and the AXI ports
// of this module, one set per bank. Every cast targets the width of the
// signal being driven so that any resize is explicit.
for (genvar i = 0; i < AXI_NUM_BANKS; ++i) begin
// Adapter addresses are XLEN bits wide while the ports are AXI_ADDR_WIDTH bits
// wide (AXI_ADDR_WIDTH >= XLEN is allowed); casting to XLEN would be a no-op
// and leave an implicit width mismatch on the assignment.
assign m_axi_awaddr[i] = AXI_ADDR_WIDTH'(m_axi_awaddr_unqual[i]);
assign m_axi_araddr[i] = AXI_ADDR_WIDTH'(m_axi_araddr_unqual[i]);
// outgoing transaction IDs: memory tag width -> AXI ID width
assign m_axi_awid[i] = AXI_TID_WIDTH'(m_axi_awid_unqual[i]);
assign m_axi_arid[i] = AXI_TID_WIDTH'(m_axi_arid_unqual[i]);
// returning transaction IDs: AXI ID width -> memory tag width
assign m_axi_rid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_rid[i]);
assign m_axi_bid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_bid[i]);
end
VX_axi_adapter #(
.VX_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.VX_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
.VX_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.VX_BYTEEN_WIDTH (AXI_STROBE_WIDTH),
.AXI_DATA_WIDTH (AXI_DATA_WIDTH),
.AXI_ADDR_WIDTH (AXI_ADDR_WIDTH),
.AXI_TID_WIDTH (AXI_TID_WIDTH),
.AXI_STROBE_WIDTH (AXI_STROBE_WIDTH)
.DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.ADDR_WIDTH (`XLEN),
.TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.NUM_BANKS (AXI_NUM_BANKS),
.OUT_REG_RSP((AXI_NUM_BANKS > 1) ? 2 : 0)
) axi_adapter (
.clk (clk),
.reset (reset),
@@ -98,9 +141,11 @@ module Vortex_axi #(
.mem_rsp_data (mem_rsp_data),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready),
.m_axi_awid (m_axi_awid),
.m_axi_awaddr (m_axi_awaddr),
.m_axi_awvalid (m_axi_awvalid),
.m_axi_awready (m_axi_awready),
.m_axi_awaddr (m_axi_awaddr_unqual),
.m_axi_awid (m_axi_awid_unqual),
.m_axi_awlen (m_axi_awlen),
.m_axi_awsize (m_axi_awsize),
.m_axi_awburst (m_axi_awburst),
@@ -108,22 +153,23 @@ module Vortex_axi #(
.m_axi_awcache (m_axi_awcache),
.m_axi_awprot (m_axi_awprot),
.m_axi_awqos (m_axi_awqos),
.m_axi_awvalid (m_axi_awvalid),
.m_axi_awready (m_axi_awready),
.m_axi_awregion (m_axi_awregion),
.m_axi_wvalid (m_axi_wvalid),
.m_axi_wready (m_axi_wready),
.m_axi_wdata (m_axi_wdata),
.m_axi_wstrb (m_axi_wstrb),
.m_axi_wlast (m_axi_wlast),
.m_axi_wvalid (m_axi_wvalid),
.m_axi_wready (m_axi_wready),
.m_axi_bid (m_axi_bid),
.m_axi_bresp (m_axi_bresp),
.m_axi_bvalid (m_axi_bvalid),
.m_axi_bready (m_axi_bready),
.m_axi_bid (m_axi_bid_unqual),
.m_axi_bresp (m_axi_bresp),
.m_axi_arid (m_axi_arid),
.m_axi_araddr (m_axi_araddr),
.m_axi_arvalid (m_axi_arvalid),
.m_axi_arready (m_axi_arready),
.m_axi_araddr (m_axi_araddr_unqual),
.m_axi_arid (m_axi_arid_unqual),
.m_axi_arlen (m_axi_arlen),
.m_axi_arsize (m_axi_arsize),
.m_axi_arburst (m_axi_arburst),
@@ -131,18 +177,21 @@ module Vortex_axi #(
.m_axi_arcache (m_axi_arcache),
.m_axi_arprot (m_axi_arprot),
.m_axi_arqos (m_axi_arqos),
.m_axi_arvalid (m_axi_arvalid),
.m_axi_arready (m_axi_arready),
.m_axi_arregion (m_axi_arregion),
.m_axi_rid (m_axi_rid),
.m_axi_rdata (m_axi_rdata),
.m_axi_rresp (m_axi_rresp),
.m_axi_rlast (m_axi_rlast),
.m_axi_rvalid (m_axi_rvalid),
.m_axi_rready (m_axi_rready)
.m_axi_rready (m_axi_rready),
.m_axi_rdata (m_axi_rdata),
.m_axi_rlast (m_axi_rlast) ,
.m_axi_rid (m_axi_rid_unqual),
.m_axi_rresp (m_axi_rresp)
);
`SCOPE_IO_SWITCH (1)
Vortex vortex (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (reset),
@@ -159,7 +208,11 @@ module Vortex_axi #(
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready),
.dcr_wr_valid (dcr_wr_valid),
.dcr_wr_addr (dcr_wr_addr),
.dcr_wr_data (dcr_wr_data),
.busy (busy)
);
endmodule
endmodule

View File

@@ -1,176 +0,0 @@
`include "VX_define.vh"
// VX_avs_wrapper
//
// Bridges one valid/ready memory request/response port onto AVS_BANKS
// Avalon-style bank ports (the avs_* unpacked arrays).
//
// Request path:
//   - The target bank is taken from the low BANK_ADDRW bits of mem_req_addr
//     (always bank 0 when AVS_BANKS == 1).
//   - The request address, byte enables and write data are broadcast to all
//     banks unchanged; only avs_read / avs_write are qualified per bank.
//   - Every accepted read pushes its tag into a per-bank tag FIFO and bumps a
//     per-bank pending counter. While that counter reports "full", both reads
//     and writes to that bank are held off.
//   - avs_burstcount is tied to 1.
//
// Response path:
//   - Read data returned by a bank is queued in a per-bank data FIFO.
//   - A bank's data FIFO and tag FIFO are popped together when the response
//     arbiter accepts that bank, so data and tag are paired in issue order
//     per bank.
//
// NOTE(review): the behaviour of VX_pending_size, VX_fifo_queue and
// VX_stream_arbiter is defined outside this file; the description above only
// reflects how they are wired here.
module VX_avs_wrapper #(
    parameter AVS_DATA_WIDTH      = 1,
    parameter AVS_ADDR_WIDTH      = 1,
    parameter AVS_BURST_WIDTH     = 1,
    parameter AVS_BANKS           = 1,
    parameter REQ_TAG_WIDTH       = 1,
    parameter RD_QUEUE_SIZE       = 1,
    parameter AVS_BYTEENW         = (AVS_DATA_WIDTH / 8),
    parameter RD_QUEUE_ADDR_WIDTH = $clog2(RD_QUEUE_SIZE+1)
) (
    input wire                          clk,
    input wire                          reset,

    // Memory request
    input wire                          mem_req_valid,
    input wire                          mem_req_rw,
    input wire [AVS_BYTEENW-1:0]        mem_req_byteen,
    input wire [AVS_ADDR_WIDTH-1:0]     mem_req_addr,
    input wire [AVS_DATA_WIDTH-1:0]     mem_req_data,
    input wire [REQ_TAG_WIDTH-1:0]      mem_req_tag,
    output wire                         mem_req_ready,

    // Memory response
    output wire                         mem_rsp_valid,
    output wire [AVS_DATA_WIDTH-1:0]    mem_rsp_data,
    output wire [REQ_TAG_WIDTH-1:0]     mem_rsp_tag,
    input wire                          mem_rsp_ready,

    // AVS bus
    output wire [AVS_DATA_WIDTH-1:0]    avs_writedata [AVS_BANKS],
    input wire [AVS_DATA_WIDTH-1:0]     avs_readdata [AVS_BANKS],
    output wire [AVS_ADDR_WIDTH-1:0]    avs_address [AVS_BANKS],
    input wire                          avs_waitrequest [AVS_BANKS],
    output wire                         avs_write [AVS_BANKS],
    output wire                         avs_read [AVS_BANKS],
    output wire [AVS_BYTEENW-1:0]       avs_byteenable [AVS_BANKS],
    output wire [AVS_BURST_WIDTH-1:0]   avs_burstcount [AVS_BANKS],
    // Explicit net type, matching every other port of this module
    // (the original declared this one as a bare `input`).
    input wire                          avs_readdatavalid [AVS_BANKS]
);
    localparam BANK_ADDRW = `LOG2UP(AVS_BANKS);

    // Requests handling

    wire [AVS_BANKS-1:0] avs_reqq_push, avs_reqq_pop, avs_reqq_ready;
    wire [AVS_BANKS-1:0][REQ_TAG_WIDTH-1:0] avs_reqq_tag_out;
    wire [AVS_BANKS-1:0] req_queue_going_full;
    wire [AVS_BANKS-1:0][RD_QUEUE_ADDR_WIDTH-1:0] req_queue_size;
    wire [BANK_ADDRW-1:0] req_bank_sel;

    // Bank index = low address bits; degenerate to 0 for a single bank.
    if (AVS_BANKS >= 2) begin
        assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0];
    end else begin
        assign req_bank_sel = 0;
    end

    for (genvar i = 0; i < AVS_BANKS; i++) begin
        // A bank can take a request when its pending counter is not full and
        // the bank is not asserting waitrequest.
        assign avs_reqq_ready[i] = !req_queue_going_full[i] && !avs_waitrequest[i];
        // Only reads are tracked: a tag is pushed when a read to this bank is accepted.
        assign avs_reqq_push[i]  = mem_req_valid && !mem_req_rw && avs_reqq_ready[i] && (req_bank_sel == i);
    end

    for (genvar i = 0; i < AVS_BANKS; i++) begin
        // Counts reads issued to bank i that have not yet been returned upstream.
        VX_pending_size #(
            .SIZE (RD_QUEUE_SIZE)
        ) pending_size (
            .clk   (clk),
            .reset (reset),
            .incr  (avs_reqq_push[i]),
            .decr  (avs_reqq_pop[i]),
            .full  (req_queue_going_full[i]),
            .size  (req_queue_size[i]),
            `UNUSED_PIN (empty)
        );
        // req_queue_size is only consumed by the debug trace at the bottom.
        `UNUSED_VAR (req_queue_size)

        // Holds the tags of outstanding reads for bank i, in issue order.
        VX_fifo_queue #(
            .DATAW (REQ_TAG_WIDTH),
            .SIZE  (RD_QUEUE_SIZE)
        ) rd_req_queue (
            .clk      (clk),
            .reset    (reset),
            .push     (avs_reqq_push[i]),
            .pop      (avs_reqq_pop[i]),
            .data_in  (mem_req_tag),
            .data_out (avs_reqq_tag_out[i]),
            `UNUSED_PIN (empty),
            `UNUSED_PIN (full),
            `UNUSED_PIN (alm_empty),
            `UNUSED_PIN (alm_full),
            `UNUSED_PIN (size)
        );
    end

    for (genvar i = 0; i < AVS_BANKS; i++) begin
        // read/write strobes are not qualified by waitrequest: they stay
        // asserted for as long as the upstream request is held valid.
        assign avs_read[i]       = mem_req_valid && !mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i);
        assign avs_write[i]      = mem_req_valid && mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i);
        assign avs_address[i]    = mem_req_addr;
        assign avs_byteenable[i] = mem_req_byteen;
        assign avs_writedata[i]  = mem_req_data;
        assign avs_burstcount[i] = AVS_BURST_WIDTH'(1);
    end

    // Upstream ready mirrors the readiness of the addressed bank.
    if (AVS_BANKS >= 2) begin
        assign mem_req_ready = avs_reqq_ready[req_bank_sel];
    end else begin
        assign mem_req_ready = avs_reqq_ready;
    end

    // Responses handling

    wire [AVS_BANKS-1:0] rsp_arb_valid_in;
    wire [AVS_BANKS-1:0][AVS_DATA_WIDTH+REQ_TAG_WIDTH-1:0] rsp_arb_data_in;
    wire [AVS_BANKS-1:0] rsp_arb_ready_in;

    wire [AVS_BANKS-1:0][AVS_DATA_WIDTH-1:0] avs_rspq_data_out;
    wire [AVS_BANKS-1:0] avs_rspq_empty;

    for (genvar i = 0; i < AVS_BANKS; i++) begin
        // Buffers read data from bank i until the arbiter forwards it upstream.
        VX_fifo_queue #(
            .DATAW (AVS_DATA_WIDTH),
            .SIZE  (RD_QUEUE_SIZE)
        ) rd_rsp_queue (
            .clk      (clk),
            .reset    (reset),
            .push     (avs_readdatavalid[i]),
            .pop      (avs_reqq_pop[i]),
            .data_in  (avs_readdata[i]),
            .data_out (avs_rspq_data_out[i]),
            .empty    (avs_rspq_empty[i]),
            `UNUSED_PIN (full),
            `UNUSED_PIN (alm_empty),
            `UNUSED_PIN (alm_full),
            `UNUSED_PIN (size)
        );
    end

    for (genvar i = 0; i < AVS_BANKS; i++) begin
        assign rsp_arb_valid_in[i] = !avs_rspq_empty[i];
        assign rsp_arb_data_in[i]  = {avs_rspq_data_out[i], avs_reqq_tag_out[i]};
        // One pop retires both the data entry and its matching tag entry.
        assign avs_reqq_pop[i]     = rsp_arb_valid_in[i] && rsp_arb_ready_in[i];
    end

    // Merges the per-bank response streams into the single upstream response port.
    VX_stream_arbiter #(
        .NUM_REQS (AVS_BANKS),
        .DATAW    (AVS_DATA_WIDTH + REQ_TAG_WIDTH),
        .TYPE     ("R")
    ) rsp_arb (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (rsp_arb_valid_in),
        .data_in   (rsp_arb_data_in),
        .ready_in  (rsp_arb_ready_in),
        .valid_out (mem_rsp_valid),
        .data_out  ({mem_rsp_data, mem_rsp_tag}),
        .ready_out (mem_rsp_ready)
    );

`ifdef DBG_TRACE_AFU
    always @(posedge clk) begin
        if (mem_req_valid && mem_req_ready) begin
            if (mem_req_rw) begin
                dpi_trace("%d: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, mem_req_data);
            end else begin
                dpi_trace("%d: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, req_queue_size);
            end
        end
        if (mem_rsp_valid && mem_rsp_ready) begin
            dpi_trace("%d: AVS Rd Rsp: tag=%0h, data=%0h, pending=%0d\n", $time, mem_rsp_tag, mem_rsp_data, req_queue_size);
        end
    end
`endif

endmodule

View File

@@ -1,181 +0,0 @@
`include "VX_define.vh"
// VX_to_mem
//
// Width adapter between a "source" memory port (SRC_* parameters, *_in request
// side / *_out response side) and a "destination" memory port (DST_*).
// Three cases are selected at elaboration time by comparing
// $clog2(DST_DATA_WIDTH) with $clog2(SRC_DATA_WIDTH):
//   - destination wider   : purely combinational; the source word is placed in
//                           one lane of the destination word.
//   - destination narrower: each source request is issued as P destination
//                           beats and the P response beats are reassembled.
//   - equal widths        : pass-through (addresses and tags are only resized).
// In every case the address is truncated or extended to DST_ADDR_WIDTH.
module VX_to_mem #(
parameter SRC_DATA_WIDTH = 1,
parameter SRC_ADDR_WIDTH = 1,
parameter DST_DATA_WIDTH = 1,
parameter DST_ADDR_WIDTH = 1,
parameter SRC_TAG_WIDTH = 1,
parameter DST_TAG_WIDTH = 1,
parameter SRC_DATA_SIZE = (SRC_DATA_WIDTH / 8),
parameter DST_DATA_SIZE = (DST_DATA_WIDTH / 8)
) (
input wire clk,
input wire reset,
// Source-side request (accepted by this module)
input wire mem_req_valid_in,
input wire [SRC_ADDR_WIDTH-1:0] mem_req_addr_in,
input wire mem_req_rw_in,
input wire [SRC_DATA_SIZE-1:0] mem_req_byteen_in,
input wire [SRC_DATA_WIDTH-1:0] mem_req_data_in,
input wire [SRC_TAG_WIDTH-1:0] mem_req_tag_in,
output wire mem_req_ready_in,
// Destination-side request (issued by this module)
output wire mem_req_valid_out,
output wire [DST_ADDR_WIDTH-1:0] mem_req_addr_out,
output wire mem_req_rw_out,
output wire [DST_DATA_SIZE-1:0] mem_req_byteen_out,
output wire [DST_DATA_WIDTH-1:0] mem_req_data_out,
output wire [DST_TAG_WIDTH-1:0] mem_req_tag_out,
input wire mem_req_ready_out,
// Destination-side response (received by this module)
input wire mem_rsp_valid_in,
input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_in,
input wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in,
output wire mem_rsp_ready_in,
// Source-side response (returned by this module)
output wire mem_rsp_valid_out,
output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_out,
output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_out,
input wire mem_rsp_ready_out
);
// NOTE(review): this only checks DST_TAG_WIDTH >= SRC_TAG_WIDTH, but the
// destination-wider branch below packs {tag, req_idx} (SRC_TAG_WIDTH + D bits)
// into the destination tag and later slices mem_rsp_tag_in[SRC_TAG_WIDTH+D-1:D].
// Confirm that callers always provide DST_TAG_WIDTH >= SRC_TAG_WIDTH + D there.
`STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!"))
localparam DST_LDATAW = $clog2(DST_DATA_WIDTH);
localparam SRC_LDATAW = $clog2(SRC_DATA_WIDTH);
// D = log2 of the width ratio (presumably `ABS yields the absolute value — macro defined elsewhere);
// P = number of narrow words per wide word.
localparam D = `ABS(DST_LDATAW - SRC_LDATAW);
localparam P = 2**D;
`UNUSED_VAR (mem_rsp_tag_in)
if (DST_LDATAW > SRC_LDATAW) begin
// ---- Destination wider than source: combinational lane select ----
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
// Low D bits of the source address pick the lane inside the destination word;
// the same index travels in the low D bits of the destination tag and is
// recovered from the response tag.
wire [D-1:0] req_idx = mem_req_addr_in[D-1:0];
wire [D-1:0] rsp_idx = mem_rsp_tag_in[D-1:0];
wire [SRC_ADDR_WIDTH-D-1:0] mem_req_addr_in_qual = mem_req_addr_in[SRC_ADDR_WIDTH-1:D];
// Response word viewed as P source-width lanes.
wire [P-1:0][SRC_DATA_WIDTH-1:0] mem_rsp_data_in_w = mem_rsp_data_in;
if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH - D)) begin
`UNUSED_VAR (mem_req_addr_in_qual)
assign mem_req_addr_out = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0];
end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH - D)) begin
assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in_qual);
end else begin
assign mem_req_addr_out = mem_req_addr_in_qual;
end
assign mem_req_valid_out = mem_req_valid_in;
assign mem_req_rw_out = mem_req_rw_in;
// Shift byte enables / data into the selected lane:
// byte offset = req_idx << (SRC_LDATAW-3), bit offset = req_idx << SRC_LDATAW.
assign mem_req_byteen_out = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3));
assign mem_req_data_out = DST_DATA_WIDTH'(mem_req_data_in) << ((DST_LDATAW'(req_idx)) << SRC_LDATAW);
assign mem_req_tag_out = DST_TAG_WIDTH'({mem_req_tag_in, req_idx});
assign mem_req_ready_in = mem_req_ready_out;
assign mem_rsp_valid_out = mem_rsp_valid_in;
assign mem_rsp_data_out = mem_rsp_data_in_w[rsp_idx];
assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in[SRC_TAG_WIDTH+D-1:D]);
assign mem_rsp_ready_in = mem_rsp_ready_out;
end else if (DST_LDATAW < SRC_LDATAW) begin
// ---- Destination narrower than source: P beats per source transfer ----
// req_ctr / rsp_ctr index the current request / response beat (wrap after P beats).
reg [D-1:0] req_ctr, rsp_ctr;
// Response beats collected so far (_r = registered, _n = with the current beat merged in).
reg [P-1:0][DST_DATA_WIDTH-1:0] mem_rsp_data_out_r, mem_rsp_data_out_n;
wire mem_req_out_fire = mem_req_valid_out && mem_req_ready_out;
wire mem_rsp_in_fire = mem_rsp_valid_in && mem_rsp_ready_in;
// Source request data / byte enables viewed as P destination-width slices.
wire [P-1:0][DST_DATA_WIDTH-1:0] mem_req_data_in_w = mem_req_data_in;
wire [P-1:0][DST_DATA_SIZE-1:0] mem_req_byteen_in_w = mem_req_byteen_in;
always @(*) begin
mem_rsp_data_out_n = mem_rsp_data_out_r;
if (mem_rsp_in_fire) begin
mem_rsp_data_out_n[rsp_ctr] = mem_rsp_data_in;
end
end
always @(posedge clk) begin
if (reset) begin
req_ctr <= 0;
rsp_ctr <= 0;
end else begin
if (mem_req_out_fire) begin
req_ctr <= req_ctr + 1;
end
if (mem_rsp_in_fire) begin
rsp_ctr <= rsp_ctr + 1;
end
end
// The data register is updated every cycle and is not cleared by reset.
mem_rsp_data_out_r <= mem_rsp_data_out_n;
end
// Remember the tag of the last accepted response beat so that the following
// beats of the same transfer can be checked against it.
reg [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_r;
wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_w;
always @(posedge clk) begin
if (mem_rsp_in_fire) begin
mem_rsp_tag_in_r <= mem_rsp_tag_in;
end
end
// On the first beat (rsp_ctr == 0) the comparison below is trivially true.
assign mem_rsp_tag_in_w = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_in;
`RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in),
("%t: *** out-of-order memory reponse! cur=%d, expected=%d", $time, mem_rsp_tag_in_w, mem_rsp_tag_in))
// Beat counter becomes the low address bits of the destination request.
wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr};
if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH + D)) begin
`UNUSED_VAR (mem_req_addr_in_qual)
assign mem_req_addr_out = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0];
end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH + D)) begin
assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in_qual);
end else begin
assign mem_req_addr_out = mem_req_addr_in_qual;
end
assign mem_req_valid_out = mem_req_valid_in;
assign mem_req_rw_out = mem_req_rw_in;
assign mem_req_byteen_out = mem_req_byteen_in_w[req_ctr];
assign mem_req_data_out = mem_req_data_in_w[req_ctr];
assign mem_req_tag_out = DST_TAG_WIDTH'(mem_req_tag_in);
// The source request is only released once its last beat has been accepted.
assign mem_req_ready_in = mem_req_ready_out && (req_ctr == (P-1));
// The source response is only presented on the last beat, using the merged data.
assign mem_rsp_valid_out = mem_rsp_valid_in && (rsp_ctr == (P-1));
assign mem_rsp_data_out = mem_rsp_data_out_n;
assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in);
// NOTE(review): intermediate beats are also gated by mem_rsp_ready_out even
// though nothing is presented upstream on those beats — confirm the upstream
// consumer does not wait for mem_rsp_valid_out before raising ready.
assign mem_rsp_ready_in = mem_rsp_ready_out;
end else begin
// ---- Equal data widths: pass-through ----
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
if (DST_ADDR_WIDTH < SRC_ADDR_WIDTH) begin
`UNUSED_VAR (mem_req_addr_in)
assign mem_req_addr_out = mem_req_addr_in[DST_ADDR_WIDTH-1:0];
end else if (DST_ADDR_WIDTH > SRC_ADDR_WIDTH) begin
assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in);
end else begin
assign mem_req_addr_out = mem_req_addr_in;
end
assign mem_req_valid_out = mem_req_valid_in;
assign mem_req_rw_out = mem_req_rw_in;
assign mem_req_byteen_out = mem_req_byteen_in;
assign mem_req_data_out = mem_req_data_in;
assign mem_req_tag_out = DST_TAG_WIDTH'(mem_req_tag_in);
assign mem_req_ready_in = mem_req_ready_out;
assign mem_rsp_valid_out = mem_rsp_valid_in;
assign mem_rsp_data_out = mem_rsp_data_in;
assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in);
assign mem_rsp_ready_in = mem_rsp_ready_out;
end
endmodule

View File

@@ -241,4 +241,4 @@ typedef union packed {
t_ccip_c0_ReqMmioHdr reqMmioHdr;
} t_if_ccip_c0_RxHdr;
endpackage
endpackage

View File

@@ -45,4 +45,4 @@ begin
pck_af2cp_sTx_T1 = pck_af2cp_sTx_T0_q;
end
endmodule
endmodule

View File

@@ -58,4 +58,4 @@ package local_mem_cfg_pkg;
endpackage // local_mem_cfg_pkg
`endif // PLATFORM_PROVIDES_LOCAL_MEMORY
`endif // PLATFORM_PROVIDES_LOCAL_MEMORY

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,39 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Include guard for the AFU constants shared by the Vortex AFU RTL.
`ifndef VORTEX_AFU_VH
`define VORTEX_AFU_VH
// Accelerator identification: name string and 128-bit UUID.
`define AFU_ACCEL_NAME "vortex_afu"
`define AFU_ACCEL_UUID 128'h35F9452B_25C2_434C_93D5_6F8C60DB361C
// Command codes; AFU_IMAGE_CMD_MAX_VALUE equals the highest code defined here.
`define AFU_IMAGE_CMD_MEM_READ 1
`define AFU_IMAGE_CMD_MEM_WRITE 2
`define AFU_IMAGE_CMD_RUN 3
`define AFU_IMAGE_CMD_DCR_WRITE 4
`define AFU_IMAGE_CMD_MAX_VALUE 4
// MMIO register indices for the command type, its three arguments, status,
// scope access and capability words.
// NOTE(review): the addressing unit of these indices is not visible here — confirm against the AFU MMIO decoder.
`define AFU_IMAGE_MMIO_CMD_TYPE 10
`define AFU_IMAGE_MMIO_CMD_ARG0 12
`define AFU_IMAGE_MMIO_CMD_ARG1 14
`define AFU_IMAGE_MMIO_CMD_ARG2 16
`define AFU_IMAGE_MMIO_STATUS 18
`define AFU_IMAGE_MMIO_SCOPE_READ 20
`define AFU_IMAGE_MMIO_SCOPE_WRITE 22
`define AFU_IMAGE_MMIO_DEV_CAPS 24
`define AFU_IMAGE_MMIO_ISA_CAPS 26
// Image metadata; presumably consumed by the platform build tooling — TODO confirm.
`define AFU_IMAGE_POWER 0
`define AFU_TOP_IFC "ccip_std_afu_avalon_mm"
`endif // VORTEX_AFU_VH

File diff suppressed because it is too large Load Diff

View File

@@ -1,44 +0,0 @@
// Include guard for the AFU constants and platform defaults.
`ifndef __VORTEX_AFU__
`define __VORTEX_AFU__
`include "ccip_if_pkg.sv"
// Local-memory platform parameters: each value below is only a default and
// can be overridden by defining the macro before this header is included.
`define PLATFORM_PROVIDES_LOCAL_MEMORY
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
`define PLATFORM_PARAM_LOCAL_MEMORY_BANKS 2
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH 26
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH 512
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
`endif
// Included after the parameters above have been defined.
`include "local_mem_cfg_pkg.sv"
// Accelerator identification: name string and 128-bit UUID.
`define AFU_ACCEL_NAME "vortex_afu"
`define AFU_ACCEL_UUID 128'h35f9452b_25c2_434c_93d5_6f8c60db361c
// Command codes.
`define AFU_IMAGE_CMD_MEM_READ 1
`define AFU_IMAGE_CMD_MEM_WRITE 2
`define AFU_IMAGE_CMD_RUN 3
// MMIO register indices.
// NOTE(review): the addressing unit of these indices is not visible here — confirm against the AFU MMIO decoder.
`define AFU_IMAGE_MMIO_CMD_TYPE 10
`define AFU_IMAGE_MMIO_DATA_SIZE 16
`define AFU_IMAGE_MMIO_IO_ADDR 12
`define AFU_IMAGE_MMIO_MEM_ADDR 14
`define AFU_IMAGE_MMIO_SCOPE_READ 20
`define AFU_IMAGE_MMIO_SCOPE_WRITE 22
`define AFU_IMAGE_MMIO_DEV_CAPS 24
`define AFU_IMAGE_MMIO_STATUS 18
// Image metadata; presumably consumed by the platform build tooling — TODO confirm.
`define AFU_IMAGE_POWER 0
`define AFU_TOP_IFC "ccip_std_afu_avalon_mm"
`endif

View File

@@ -0,0 +1,419 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "vortex_afu.vh"
// VX_afu_ctrl
//
// AXI4-Lite slave register file for the AFU control plane. It exposes:
//   - ap_ctrl style control/status bits (ap_start, ap_done, ap_idle, ap_ready,
//     ap_reset, auto_restart) and the GIE / IER / ISR interrupt registers;
//   - read-only DEV_CAPS / ISA_CAPS words built from configuration macros;
//   - a DCR write port: writing the DCR low word stores the address, writing
//     the DCR high word stores the data and pulses dcr_wr_valid for one
//     enabled cycle;
//   - one 64-bit base-address register per memory bank (mem_base[i]);
//   - with SCOPE defined, a bit-serial bridge to the scope bus.
//
// Writes follow a three-state sequence (address -> data -> response) and reads
// a two-state sequence (address -> data); both always respond OKAY.
//
// Notes:
//   - Register accesses use 32-bit data (wmask and rdata are 32 bits wide).
//   - Only the registers listed in the read case below return data; every
//     other address (including DCR and MEM) reads back as zero.
//   - ap_reset and auto_restart can only be set by a write; within this module
//     they are cleared solely by `reset`.
//   - Interrupt status bits are latched from the *level* of ap_done / ap_ready
//     on every enabled cycle (when the matching IER bit is set), independent of
//     AXI write activity. NOTE(review): a source that holds ap_done/ap_ready
//     high will re-latch the bit right after software clears it — confirm how
//     the instantiating logic drives these inputs.
module VX_afu_ctrl #(
    parameter AXI_ADDR_WIDTH = 8,
    parameter AXI_DATA_WIDTH = 32,
    parameter AXI_NUM_BANKS  = 1
) (
    // axi4 lite slave signals
    input  wire                         clk,
    input  wire                         reset,
    input  wire                         clk_en,

    input  wire                         s_axi_awvalid,
    input  wire [AXI_ADDR_WIDTH-1:0]    s_axi_awaddr,
    output wire                         s_axi_awready,

    input  wire                         s_axi_wvalid,
    input  wire [AXI_DATA_WIDTH-1:0]    s_axi_wdata,
    input  wire [AXI_DATA_WIDTH/8-1:0]  s_axi_wstrb,
    output wire                         s_axi_wready,

    output wire                         s_axi_bvalid,
    output wire [1:0]                   s_axi_bresp,
    input  wire                         s_axi_bready,

    input  wire                         s_axi_arvalid,
    input  wire [AXI_ADDR_WIDTH-1:0]    s_axi_araddr,
    output wire                         s_axi_arready,

    output wire                         s_axi_rvalid,
    output wire [AXI_DATA_WIDTH-1:0]    s_axi_rdata,
    output wire [1:0]                   s_axi_rresp,
    input  wire                         s_axi_rready,

    output wire                         ap_reset,
    output wire                         ap_start,
    input  wire                         ap_done,
    input  wire                         ap_ready,
    input  wire                         ap_idle,
    output wire                         interrupt,

`ifdef SCOPE
    input  wire                         scope_bus_in,
    output wire                         scope_bus_out,
`endif

    output wire [63:0]                  mem_base [AXI_NUM_BANKS],

    output wire                         dcr_wr_valid,
    output wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
    output wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data
);

    // Address Info
    // 0x00 : Control signals
    //        bit 0  - ap_start (Read/Write/COH)
    //        bit 1  - ap_done (Read/COR)
    //        bit 2  - ap_idle (Read)
    //        bit 3  - ap_ready (Read)
    //        bit 4  - ap_reset (Write)
    //        bit 7  - auto_restart (Read/Write)
    //        others - reserved
    // 0x04 : Global Interrupt Enable Register
    //        bit 0  - Global Interrupt Enable (Read/Write)
    //        others - reserved
    // 0x08 : IP Interrupt Enable Register (Read/Write)
    //        bit 0  - Channel 0 (ap_done)
    //        bit 1  - Channel 1 (ap_ready)
    //        others - reserved
    // 0x0c : IP Interrupt Status Register (Read/TOW)
    //        bit 0  - Channel 0 (ap_done)
    //        bit 1  - Channel 1 (ap_ready)
    //        others - reserved
    // 0x10 : Low 32-bit Data signal of DEV_CAPS
    // 0x14 : High 32-bit Data signal of DEV_CAPS
    // 0x18 : Control signal of DEV_CAPS
    // 0x1C : Low 32-bit Data signal of ISA_CAPS
    // 0x20 : High 32-bit Data signal of ISA_CAPS
    // 0x24 : Control signal of ISA_CAPS
    // 0x28 : Low 32-bit Data signal of DCR
    // 0x2C : High 32-bit Data signal of DCR
    // 0x30 : Control signal of DCR
    // 0x34 : Low 32-bit Data signal of SCP
    // 0x38 : High 32-bit Data signal of SCP
    // 0x3C : Control signal of SCP
    // 0x40 : Low 32-bit Data signal of MEM
    // 0x44 : High 32-bit Data signal of MEM
    // 0x48 : Control signal of MEM
    //        (bank i uses the same three-word layout at 0x40 + i * 12)
    // (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake)

    // Parameters
    localparam
        ADDR_AP_CTRL    = 8'h00,
        ADDR_GIE        = 8'h04,
        ADDR_IER        = 8'h08,
        ADDR_ISR        = 8'h0C,

        ADDR_DEV_0      = 8'h10,
        ADDR_DEV_1      = 8'h14,
        ADDR_DEV_CTRL   = 8'h18,

        ADDR_ISA_0      = 8'h1C,
        ADDR_ISA_1      = 8'h20,
        ADDR_ISA_CTRL   = 8'h24,

        ADDR_DCR_0      = 8'h28,
        ADDR_DCR_1      = 8'h2C,
        ADDR_DCR_CTRL   = 8'h30,

        ADDR_SCP_0      = 8'h34,
        ADDR_SCP_1      = 8'h38,
        ADDR_SCP_CTRL   = 8'h3C,

        ADDR_MEM_0      = 8'h40,
        ADDR_MEM_1      = 8'h44,
        ADDR_MEM_CTRL   = 8'h48,

        ADDR_BITS       = 8;

    localparam
        WSTATE_IDLE     = 2'd0,
        WSTATE_DATA     = 2'd1,
        WSTATE_RESP     = 2'd2;

    localparam
        RSTATE_IDLE     = 2'd0,
        RSTATE_DATA     = 2'd1;

    // device caps
    wire [63:0] dev_caps = {16'b0,
                            8'(`SM_ENABLED ? `SMEM_LOG_SIZE : 0),
                            16'(`NUM_CORES * `NUM_CLUSTERS),
                            8'(`NUM_WARPS),
                            8'(`NUM_THREADS),
                            8'(`IMPLEMENTATION_ID)};

    wire [63:0] isa_caps = {32'(`MISA_EXT),
                            2'(`CLOG2(`XLEN)-4),
                            30'(`MISA_STD)};

    reg [1:0]   wstate;
    reg [ADDR_BITS-1:0] waddr;
    wire [31:0] wmask;
    wire        s_axi_aw_fire;
    wire        s_axi_w_fire;

    reg [1:0]   rstate;
    reg [31:0]  rdata;
    wire [ADDR_BITS-1:0] raddr;
    wire        s_axi_ar_fire;

    reg         ap_reset_r;
    reg         ap_start_r;
    reg         auto_restart_r;
    reg         gie_r;
    reg [1:0]   ier_r;
    reg [1:0]   isr_r;
    reg [63:0]  mem_r [AXI_NUM_BANKS];
    reg [31:0]  dcra_r;
    reg [31:0]  dcrv_r;
    reg         dcr_wr_valid_r;

`ifdef SCOPE

    // Bit-serial scope bridge.
    //   Write: storing the SCP high word raises scope_bus_out for one cycle and
    //          then shifts scope_bus_wdata out, bit 63 first.
    //   Read : a high scope_bus_in starts a capture; incoming bits are then
    //          shifted into scope_bus_rdata from the LSB side.
    // A single counter (scope_bus_ctr) is shared by both directions.
    // scope_bus_wdata / scope_bus_rdata are not cleared by reset.

    reg [63:0] scope_bus_wdata;
    reg [63:0] scope_bus_rdata;
    reg [5:0]  scope_bus_ctr;

    reg cmd_scope_reading;
    reg cmd_scope_writing;
    reg scope_bus_out_r;

    always @(posedge clk) begin
        if (reset) begin
            cmd_scope_reading <= 0;
            cmd_scope_writing <= 0;
            scope_bus_ctr <= '0;
            scope_bus_out_r <= 0;
        end else if (clk_en) begin
            if (s_axi_w_fire && waddr == ADDR_SCP_0) begin
                scope_bus_wdata[31:0] <= (s_axi_wdata & wmask) | (scope_bus_wdata[31:0] & ~wmask);
            end
            if (s_axi_w_fire && waddr == ADDR_SCP_1) begin
                scope_bus_wdata[63:32] <= (s_axi_wdata & wmask) | (scope_bus_wdata[63:32] & ~wmask);
                cmd_scope_writing <= 1;
                scope_bus_out_r <= 1;
                scope_bus_ctr <= 63;
            end
            if (scope_bus_in) begin
                cmd_scope_reading <= 1;
                scope_bus_ctr <= 63;
            end
            // The assignments below come later in the block, so while a
            // transfer is active they take precedence over the ones above.
            if (cmd_scope_reading) begin
                scope_bus_rdata <= {scope_bus_rdata[62:0], scope_bus_in};
                scope_bus_ctr <= scope_bus_ctr - 1;
                if (scope_bus_ctr == 0) begin
                    cmd_scope_reading <= 0;
                end
            end
            if (cmd_scope_writing) begin
                scope_bus_out_r <= 1'(scope_bus_wdata >> scope_bus_ctr);
                scope_bus_ctr <= scope_bus_ctr - 1;
                if (scope_bus_ctr == 0) begin
                    cmd_scope_writing <= 0;
                end
            end
        end
    end

    assign scope_bus_out = scope_bus_out_r;

`endif

    // AXI Write

    assign s_axi_awready = (wstate == WSTATE_IDLE);
    assign s_axi_wready  = (wstate == WSTATE_DATA);
    assign s_axi_bvalid  = (wstate == WSTATE_RESP);
    assign s_axi_bresp   = 2'b00;  // OKAY

    assign s_axi_aw_fire = s_axi_awvalid && s_axi_awready;
    assign s_axi_w_fire  = s_axi_wvalid && s_axi_wready;

    // Expand the four byte strobes into a 32-bit bit mask.
    for (genvar i = 0; i < 4; ++i) begin
        assign wmask[8 * i +: 8] = {8{s_axi_wstrb[i]}};
    end

    // wstate
    always @(posedge clk) begin
        if (reset) begin
            wstate <= WSTATE_IDLE;
        end else if (clk_en) begin
            case (wstate)
            WSTATE_IDLE: wstate <= s_axi_awvalid ? WSTATE_DATA : WSTATE_IDLE;
            WSTATE_DATA: wstate <= s_axi_wvalid ? WSTATE_RESP : WSTATE_DATA;
            WSTATE_RESP: wstate <= s_axi_bready ? WSTATE_IDLE : WSTATE_RESP;
            default:     wstate <= WSTATE_IDLE;
            endcase
        end
    end

    // waddr
    always @(posedge clk) begin
        if (clk_en) begin
            if (s_axi_aw_fire)
                waddr <= s_axi_awaddr[ADDR_BITS-1:0];
        end
    end

    // wdata
    always @(posedge clk) begin
        if (reset) begin
            ap_start_r <= 0;
            ap_reset_r <= 0;
            auto_restart_r <= 0;

            gie_r <= 0;
            ier_r <= '0;
            isr_r <= '0;

            dcra_r <= '0;
            dcrv_r <= '0;
            dcr_wr_valid_r <= 0;

            for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
                mem_r[i] <= '0;
            end
        end else if (clk_en) begin
            // ap_start clears on the ap_ready handshake unless auto-restart is
            // armed; a simultaneous write of ap_start below takes precedence.
            if (ap_ready)
                ap_start_r <= auto_restart_r;

            // Default: dcr_wr_valid is a single-cycle pulse.
            dcr_wr_valid_r <= 0;

            if (s_axi_w_fire) begin
                case (waddr)
                ADDR_AP_CTRL: begin
                    if (s_axi_wstrb[0]) begin
                        if (s_axi_wdata[0])
                            ap_start_r <= 1;
                        if (s_axi_wdata[4])
                            ap_reset_r <= 1;
                        if (s_axi_wdata[7])
                            auto_restart_r <= 1;
                    end
                end
                ADDR_GIE: begin
                    if (s_axi_wstrb[0])
                        gie_r <= s_axi_wdata[0];
                end
                ADDR_IER: begin
                    if (s_axi_wstrb[0])
                        ier_r <= s_axi_wdata[1:0];
                end
                ADDR_ISR: begin
                    // Toggle-on-write: writing 1 to a set bit clears it.
                    if (s_axi_wstrb[0])
                        isr_r <= isr_r ^ s_axi_wdata[1:0];
                end
                ADDR_DCR_0: begin
                    dcra_r <= (s_axi_wdata & wmask) | (dcra_r & ~wmask);
                end
                ADDR_DCR_1: begin
                    // Writing the value word commits the DCR write.
                    dcrv_r <= (s_axi_wdata & wmask) | (dcrv_r & ~wmask);
                    dcr_wr_valid_r <= 1;
                end
                default: begin
                    // Per-bank base address registers, 12 bytes apart.
                    for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
                        if (waddr == (ADDR_MEM_0 + i * 12)) begin
                            mem_r[i][31:0] <= (s_axi_wdata & wmask) | (mem_r[i][31:0] & ~wmask);
                        end
                        if (waddr == (ADDR_MEM_1 + i * 12)) begin
                            mem_r[i][63:32] <= (s_axi_wdata & wmask) | (mem_r[i][63:32] & ~wmask);
                        end
                    end
                end
                endcase
            end

            // Interrupt status latch.
            // Fix: this used to sit inside the `s_axi_w_fire` block, so a
            // done/ready event was only recorded in a cycle where the host
            // completed an AXI write. It is now evaluated on every enabled
            // cycle. It remains after the write case so that, as before, a
            // set wins over a simultaneous toggle of the same bit.
            if (ier_r[0] & ap_done)
                isr_r[0] <= 1'b1;
            if (ier_r[1] & ap_ready)
                isr_r[1] <= 1'b1;
        end
    end

    // AXI Read

    assign s_axi_arready = (rstate == RSTATE_IDLE);
    assign s_axi_rvalid  = (rstate == RSTATE_DATA);
    assign s_axi_rdata   = rdata;
    assign s_axi_rresp   = 2'b00;  // OKAY

    assign s_axi_ar_fire = s_axi_arvalid && s_axi_arready;
    assign raddr         = s_axi_araddr[ADDR_BITS-1:0];

    // rstate
    always @(posedge clk) begin
        if (reset) begin
            rstate <= RSTATE_IDLE;
        end else if (clk_en) begin
            case (rstate)
            RSTATE_IDLE: rstate <= s_axi_arvalid ? RSTATE_DATA : RSTATE_IDLE;
            RSTATE_DATA: rstate <= (s_axi_rready & s_axi_rvalid) ? RSTATE_IDLE : RSTATE_DATA;
            default:     rstate <= RSTATE_IDLE;
            endcase
        end
    end

    // rdata
    always @(posedge clk) begin
        if (clk_en) begin
            if (s_axi_ar_fire) begin
                // Unlisted addresses and unassigned bits read as zero.
                rdata <= '0;
                case (raddr)
                ADDR_AP_CTRL: begin
                    rdata[0] <= ap_start_r;
                    rdata[1] <= ap_done;
                    rdata[2] <= ap_idle;
                    rdata[3] <= ap_ready;
                    rdata[7] <= auto_restart_r;
                end
                ADDR_GIE: begin
                    rdata <= 32'(gie_r);
                end
                ADDR_IER: begin
                    rdata <= 32'(ier_r);
                end
                ADDR_ISR: begin
                    rdata <= 32'(isr_r);
                end
                ADDR_DEV_0: begin
                    rdata <= dev_caps[31:0];
                end
                ADDR_DEV_1: begin
                    rdata <= dev_caps[63:32];
                end
                ADDR_ISA_0: begin
                    rdata <= isa_caps[31:0];
                end
                ADDR_ISA_1: begin
                    rdata <= isa_caps[63:32];
                end
            `ifdef SCOPE
                ADDR_SCP_0: begin
                    rdata <= scope_bus_rdata[31:0];
                end
                ADDR_SCP_1: begin
                    rdata <= scope_bus_rdata[63:32];
                end
            `endif
                default:;
                endcase
            end
        end
    end

    assign ap_reset  = ap_reset_r;
    assign ap_start  = ap_start_r;
    assign interrupt = gie_r & (| isr_r);

    assign mem_base  = mem_r;

    assign dcr_wr_valid = dcr_wr_valid_r;
    assign dcr_wr_addr  = `VX_DCR_ADDR_WIDTH'(dcra_r);
    assign dcr_wr_data  = `VX_DCR_DATA_WIDTH'(dcrv_r);

endmodule

View File

@@ -0,0 +1,412 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "vortex_afu.vh"
module VX_afu_wrap #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = 16,
parameter C_M_AXI_MEM_ADDR_WIDTH = 32,
parameter C_M_AXI_MEM_DATA_WIDTH = 512
) (
// System signals
input wire ap_clk,
input wire ap_rst_n,
// AXI4 master interface
`REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
// AXI4-Lite slave interface
input wire s_axi_ctrl_awvalid,
output wire s_axi_ctrl_awready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
input wire s_axi_ctrl_wvalid,
output wire s_axi_ctrl_wready,
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
input wire s_axi_ctrl_arvalid,
output wire s_axi_ctrl_arready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
output wire s_axi_ctrl_rvalid,
input wire s_axi_ctrl_rready,
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
output wire [1:0] s_axi_ctrl_rresp,
output wire s_axi_ctrl_bvalid,
input wire s_axi_ctrl_bready,
output wire [1:0] s_axi_ctrl_bresp,
output wire interrupt
);
localparam C_M_AXI_MEM_NUM_BANKS = `M_AXI_MEM_NUM_BANKS;
localparam STATE_IDLE = 0;
localparam STATE_RUN = 1;
wire m_axi_mem_awvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_awready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_awid_a [C_M_AXI_MEM_NUM_BANKS];
wire [7:0] m_axi_mem_awlen_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_wdata_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH/8-1:0] m_axi_mem_wstrb_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wlast_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_bvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_bready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_bid_a [C_M_AXI_MEM_NUM_BANKS];
wire [1:0] m_axi_mem_bresp_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_arvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_arready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_arid_a [C_M_AXI_MEM_NUM_BANKS];
wire [7:0] m_axi_mem_arlen_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_rvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_rready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_rdata_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_rlast_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_rid_a [C_M_AXI_MEM_NUM_BANKS];
wire [1:0] m_axi_mem_rresp_a [C_M_AXI_MEM_NUM_BANKS];
// convert memory interface to array
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
wire clk = ap_clk;
wire reset = ~ap_rst_n;
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
reg [15:0] vx_pending_writes;
reg vx_busy_wait;
reg vx_running;
wire vx_busy;
wire [63:0] mem_base [C_M_AXI_MEM_NUM_BANKS];
wire dcr_wr_valid;
wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr;
wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data;
reg state;
wire ap_reset;
wire ap_start;
wire ap_idle = ~vx_running;
wire ap_done = ~(state == STATE_RUN || vx_pending_writes != 0);
wire ap_ready = 1'b1;
`ifdef SCOPE
wire scope_bus_in;
wire scope_bus_out;
wire scope_reset = reset;
`endif
always @(posedge ap_clk) begin
if (reset || ap_reset) begin
state <= STATE_IDLE;
vx_busy_wait <= 0;
vx_running <= 0;
end else begin
case (state)
STATE_IDLE: begin
if (ap_start) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE RUN\n", $time));
`endif
state <= STATE_RUN;
vx_running <= 0;
end
end
STATE_RUN: begin
if (vx_running) begin
if (vx_busy_wait) begin
// wait until processor goes busy
if (vx_busy) begin
vx_busy_wait <= 0;
end
end else begin
// wait until the processor is not busy
if (~vx_busy) begin
state <= STATE_IDLE;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: End execution\n", $time));
`TRACE(2, ("%d: STATE IDLE\n", $time));
`endif
end
end
end else begin
// wait until the reset sequence is complete
if (vx_reset_ctr == (`RESET_DELAY-1)) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: Begin execution\n", $time));
`endif
vx_running <= 1;
vx_busy_wait <= 1;
end
end
end
endcase
end
end
reg m_axi_mem_wfire;
reg m_axi_mem_bfire;
always @(*) begin
m_axi_mem_wfire = 0;
m_axi_mem_bfire = 0;
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
m_axi_mem_wfire |= m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i];
m_axi_mem_bfire |= m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i];
end
end
always @(posedge ap_clk) begin
if (reset || ap_reset) begin
vx_pending_writes <= '0;
end else begin
if (m_axi_mem_wfire && ~m_axi_mem_bfire)
vx_pending_writes <= vx_pending_writes + 1;
if (~m_axi_mem_wfire && m_axi_mem_bfire)
vx_pending_writes <= vx_pending_writes - 1;
end
end
always @(posedge ap_clk) begin
if (state == STATE_RUN) begin
vx_reset_ctr <= vx_reset_ctr + 1;
end else begin
vx_reset_ctr <= '0;
end
end
VX_afu_ctrl #(
.AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
.AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
) afu_ctrl (
.clk (ap_clk),
.reset (reset || ap_reset),
.clk_en (1'b1),
.s_axi_awvalid (s_axi_ctrl_awvalid),
.s_axi_awready (s_axi_ctrl_awready),
.s_axi_awaddr (s_axi_ctrl_awaddr),
.s_axi_wvalid (s_axi_ctrl_wvalid),
.s_axi_wready (s_axi_ctrl_wready),
.s_axi_wdata (s_axi_ctrl_wdata),
.s_axi_wstrb (s_axi_ctrl_wstrb),
.s_axi_arvalid (s_axi_ctrl_arvalid),
.s_axi_arready (s_axi_ctrl_arready),
.s_axi_araddr (s_axi_ctrl_araddr),
.s_axi_rvalid (s_axi_ctrl_rvalid),
.s_axi_rready (s_axi_ctrl_rready),
.s_axi_rdata (s_axi_ctrl_rdata),
.s_axi_rresp (s_axi_ctrl_rresp),
.s_axi_bvalid (s_axi_ctrl_bvalid),
.s_axi_bready (s_axi_ctrl_bready),
.s_axi_bresp (s_axi_ctrl_bresp),
.ap_reset (ap_reset),
.ap_start (ap_start),
.ap_done (ap_done),
.ap_ready (ap_ready),
.ap_idle (ap_idle),
.interrupt (interrupt),
`ifdef SCOPE
.scope_bus_in (scope_bus_out),
.scope_bus_out (scope_bus_in),
`endif
.mem_base (mem_base),
.dcr_wr_valid (dcr_wr_valid),
.dcr_wr_addr (dcr_wr_addr),
.dcr_wr_data (dcr_wr_data)
);
wire [`XLEN-1:0] m_axi_mem_awaddr_w [C_M_AXI_MEM_NUM_BANKS];
wire [`XLEN-1:0] m_axi_mem_araddr_w [C_M_AXI_MEM_NUM_BANKS];
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
end
`SCOPE_IO_SWITCH (2)
Vortex_axi #(
.AXI_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
.AXI_ADDR_WIDTH (`XLEN),
.AXI_TID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
) vortex_axi (
`SCOPE_IO_BIND (1)
.clk (ap_clk),
.reset (reset || ap_reset || ~vx_running),
.m_axi_awvalid (m_axi_mem_awvalid_a),
.m_axi_awready (m_axi_mem_awready_a),
.m_axi_awaddr (m_axi_mem_awaddr_w),
.m_axi_awid (m_axi_mem_awid_a),
`UNUSED_PIN (m_axi_awlen),
`UNUSED_PIN (m_axi_awsize),
`UNUSED_PIN (m_axi_awburst),
`UNUSED_PIN (m_axi_awlock),
`UNUSED_PIN (m_axi_awcache),
`UNUSED_PIN (m_axi_awprot),
`UNUSED_PIN (m_axi_awqos),
`UNUSED_PIN (m_axi_awregion),
.m_axi_wvalid (m_axi_mem_wvalid_a),
.m_axi_wready (m_axi_mem_wready_a),
.m_axi_wdata (m_axi_mem_wdata_a),
.m_axi_wstrb (m_axi_mem_wstrb_a),
.m_axi_wlast (m_axi_mem_wlast_a),
.m_axi_bvalid (m_axi_mem_bvalid_a),
.m_axi_bready (m_axi_mem_bready_a),
.m_axi_bid (m_axi_mem_bid_a),
.m_axi_bresp (m_axi_mem_bresp_a),
.m_axi_arvalid (m_axi_mem_arvalid_a),
.m_axi_arready (m_axi_mem_arready_a),
.m_axi_araddr (m_axi_mem_araddr_w),
.m_axi_arid (m_axi_mem_arid_a),
.m_axi_arlen (m_axi_mem_arlen_a),
`UNUSED_PIN (m_axi_arsize),
`UNUSED_PIN (m_axi_arburst),
`UNUSED_PIN (m_axi_arlock),
`UNUSED_PIN (m_axi_arcache),
`UNUSED_PIN (m_axi_arprot),
`UNUSED_PIN (m_axi_arqos),
`UNUSED_PIN (m_axi_arregion),
.m_axi_rvalid (m_axi_mem_rvalid_a),
.m_axi_rready (m_axi_mem_rready_a),
.m_axi_rdata (m_axi_mem_rdata_a),
.m_axi_rlast (m_axi_mem_rlast_a),
.m_axi_rid (m_axi_mem_rid_a),
.m_axi_rresp (m_axi_mem_rresp_a),
.dcr_wr_valid (dcr_wr_valid),
.dcr_wr_addr (dcr_wr_addr),
.dcr_wr_data (dcr_wr_data),
.busy (vx_busy)
);
// SCOPE //////////////////////////////////////////////////////////////////////
`ifdef DBG_SCOPE_AFU
`ifdef SCOPE
`define TRIGGERS { \
reset, \
ap_start, \
ap_done, \
ap_idle, \
interrupt, \
vx_busy_wait, \
vx_busy, \
vx_running \
}
`define PROBES { \
vx_pending_writes \
}
VX_scope_tap #(
.SCOPE_ID (0),
.TRIGGERW ($bits(`TRIGGERS)),
.PROBEW ($bits(`PROBES))
) scope_tap (
.clk(clk),
.reset(scope_reset_w[0]),
.start(1'b0),
.stop(1'b0),
.triggers(`TRIGGERS),
.probes(`PROBES),
.bus_in(scope_bus_in_w[0]),
.bus_out(scope_bus_out_w[0])
);
`endif
`ifdef CHIPSCOPE
ila_afu ila_afu_inst (
.clk (ap_clk),
.probe0 ({
ap_start,
ap_done,
ap_idle,
interrupt
}),
.probe1 ({
vx_pending_writes,
vx_busy_wait,
vx_busy,
vx_running
})
);
`endif
`else
`SCOPE_IO_UNUSED_W(0)
`endif
`ifdef SIMULATION
`ifndef VERILATOR
// disable assertions until full reset
reg [`CLOG2(`RESET_DELAY+1)-1:0] assert_delay_ctr;
reg assert_enabled;
initial begin
$assertoff(0, vortex_axi);
end
always @(posedge ap_clk) begin
if (reset) begin
assert_delay_ctr <= '0;
assert_enabled <= 0;
end else begin
if (~assert_enabled) begin
if (assert_delay_ctr == (`RESET_DELAY-1)) begin
assert_enabled <= 1;
$asserton(0, vortex_axi); // enable assertions
end else begin
assert_delay_ctr <= assert_delay_ctr + 1;
end
end
end
end
`endif
`endif
`ifdef DBG_TRACE_AFU
always @(posedge ap_clk) begin
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
if (m_axi_mem_awvalid_a[i] && m_axi_mem_awready_a[i]) begin
`TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]));
end
if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin
`TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%0h\n", $time, i, m_axi_mem_wdata_a[i]));
end
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
`TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]));
end
if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]));
end
end
end
`endif
endmodule

View File

@@ -0,0 +1,85 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "vortex_afu.vh"
// Top-level AFU shell.
// This module contains no logic of its own: every parameter and every port is
// forwarded unchanged to a single VX_afu_wrap instance.
// The per-bank AXI4 master ports are produced by the `REPEAT macro together
// with GEN_AXI_MEM / AXI_MEM_ARGS (presumably one expansion per bank index in
// [0, `M_AXI_MEM_NUM_BANKS) -- `REPEAT is defined outside this file).
module vortex_afu #(
    parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
    parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
    parameter C_M_AXI_MEM_ID_WIDTH    = `M_AXI_MEM_ID_WIDTH,
    parameter C_M_AXI_MEM_ADDR_WIDTH  = 64,
    parameter C_M_AXI_MEM_DATA_WIDTH  = `VX_MEM_DATA_WIDTH
) (
    // System clock and reset
    input  wire                                 ap_clk,
    input  wire                                 ap_rst_n,

    // AXI4 master memory interface(s), one port group per bank
    `REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),

    // AXI4-Lite slave control interface: write address channel
    input  wire                                 s_axi_ctrl_awvalid,
    output wire                                 s_axi_ctrl_awready,
    input  wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0]   s_axi_ctrl_awaddr,

    // AXI4-Lite slave control interface: write data channel
    input  wire                                 s_axi_ctrl_wvalid,
    output wire                                 s_axi_ctrl_wready,
    input  wire [C_S_AXI_CTRL_DATA_WIDTH-1:0]   s_axi_ctrl_wdata,
    input  wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,

    // AXI4-Lite slave control interface: read address channel
    input  wire                                 s_axi_ctrl_arvalid,
    output wire                                 s_axi_ctrl_arready,
    input  wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0]   s_axi_ctrl_araddr,

    // AXI4-Lite slave control interface: read data channel
    output wire                                 s_axi_ctrl_rvalid,
    input  wire                                 s_axi_ctrl_rready,
    output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0]   s_axi_ctrl_rdata,
    output wire [1:0]                           s_axi_ctrl_rresp,

    // AXI4-Lite slave control interface: write response channel
    output wire                                 s_axi_ctrl_bvalid,
    input  wire                                 s_axi_ctrl_bready,
    output wire [1:0]                           s_axi_ctrl_bresp,

    output wire                                 interrupt
);

    VX_afu_wrap #(
        .C_S_AXI_CTRL_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
        .C_S_AXI_CTRL_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
        .C_M_AXI_MEM_ID_WIDTH    (C_M_AXI_MEM_ID_WIDTH),
        .C_M_AXI_MEM_ADDR_WIDTH  (C_M_AXI_MEM_ADDR_WIDTH),
        .C_M_AXI_MEM_DATA_WIDTH  (C_M_AXI_MEM_DATA_WIDTH)
    ) afu_wrap (
        .ap_clk             (ap_clk),
        .ap_rst_n           (ap_rst_n),

        // memory banks
        `REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),

        // control: write address
        .s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
        .s_axi_ctrl_awready (s_axi_ctrl_awready),
        .s_axi_ctrl_awaddr  (s_axi_ctrl_awaddr),

        // control: write data
        .s_axi_ctrl_wvalid  (s_axi_ctrl_wvalid),
        .s_axi_ctrl_wready  (s_axi_ctrl_wready),
        .s_axi_ctrl_wdata   (s_axi_ctrl_wdata),
        .s_axi_ctrl_wstrb   (s_axi_ctrl_wstrb),

        // control: write response
        .s_axi_ctrl_bvalid  (s_axi_ctrl_bvalid),
        .s_axi_ctrl_bready  (s_axi_ctrl_bready),
        .s_axi_ctrl_bresp   (s_axi_ctrl_bresp),

        // control: read address
        .s_axi_ctrl_arvalid (s_axi_ctrl_arvalid),
        .s_axi_ctrl_arready (s_axi_ctrl_arready),
        .s_axi_ctrl_araddr  (s_axi_ctrl_araddr),

        // control: read data
        .s_axi_ctrl_rvalid  (s_axi_ctrl_rvalid),
        .s_axi_ctrl_rready  (s_axi_ctrl_rready),
        .s_axi_ctrl_rdata   (s_axi_ctrl_rdata),
        .s_axi_ctrl_rresp   (s_axi_ctrl_rresp),

        .interrupt          (interrupt)
    );

endmodule

View File

@@ -0,0 +1,108 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VORTEX_AFU_VH
`define VORTEX_AFU_VH
// Default configuration, used only when the build does not already define
// these macros.
// M_AXI_MEM_NUM_BANKS: number of AXI4 master memory banks (default 1).
`ifndef M_AXI_MEM_NUM_BANKS
`define M_AXI_MEM_NUM_BANKS 1
`endif
// M_AXI_MEM_ID_WIDTH: bit width of the AXI ID signals (awid/arid/rid/bid).
`ifndef M_AXI_MEM_ID_WIDTH
`define M_AXI_MEM_ID_WIDTH 32
`endif
// GEN_AXI_MEM(i): declares the module ports of AXI4 master memory bank `i`.
// Each port is named m_axi_mem_<i>_<signal>; the AW, W, AR, R and B channel
// signals listed below are declared.
// The expansion site must have the parameters C_M_AXI_MEM_ADDR_WIDTH,
// C_M_AXI_MEM_ID_WIDTH and C_M_AXI_MEM_DATA_WIDTH in scope.
// The last declaration carries no trailing comma, so the caller supplies the
// separator between banks and after the final bank.
`define GEN_AXI_MEM(i) \
output wire m_axi_mem_``i``_awvalid, \
input wire m_axi_mem_``i``_awready, \
output wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_``i``_awaddr, \
output wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_``i``_awid, \
output wire [7:0] m_axi_mem_``i``_awlen, \
output wire m_axi_mem_``i``_wvalid, \
input wire m_axi_mem_``i``_wready, \
output wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_``i``_wdata, \
output wire [C_M_AXI_MEM_DATA_WIDTH/8-1:0] m_axi_mem_``i``_wstrb, \
output wire m_axi_mem_``i``_wlast, \
output wire m_axi_mem_``i``_arvalid, \
input wire m_axi_mem_``i``_arready, \
output wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_``i``_araddr, \
output wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_``i``_arid, \
output wire [7:0] m_axi_mem_``i``_arlen, \
input wire m_axi_mem_``i``_rvalid, \
output wire m_axi_mem_``i``_rready, \
input wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_``i``_rdata, \
input wire m_axi_mem_``i``_rlast, \
input wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_``i``_rid, \
input wire [1:0] m_axi_mem_``i``_rresp, \
input wire m_axi_mem_``i``_bvalid, \
output wire m_axi_mem_``i``_bready, \
input wire [1:0] m_axi_mem_``i``_bresp, \
input wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_``i``_bid
// AXI_MEM_ARGS(i): named port connections for AXI4 master memory bank `i`.
// Every port m_axi_mem_<i>_<signal> of the instantiated module is connected to
// the identically named signal in the enclosing scope; the set of signals
// matches the ports declared by GEN_AXI_MEM(i).
// The last connection carries no trailing comma, so the caller supplies the
// separator between banks and after the final bank.
`define AXI_MEM_ARGS(i) \
.m_axi_mem_``i``_awvalid(m_axi_mem_``i``_awvalid), \
.m_axi_mem_``i``_awready(m_axi_mem_``i``_awready), \
.m_axi_mem_``i``_awaddr(m_axi_mem_``i``_awaddr), \
.m_axi_mem_``i``_awid(m_axi_mem_``i``_awid), \
.m_axi_mem_``i``_awlen(m_axi_mem_``i``_awlen), \
.m_axi_mem_``i``_wvalid(m_axi_mem_``i``_wvalid), \
.m_axi_mem_``i``_wready(m_axi_mem_``i``_wready), \
.m_axi_mem_``i``_wdata(m_axi_mem_``i``_wdata), \
.m_axi_mem_``i``_wstrb(m_axi_mem_``i``_wstrb), \
.m_axi_mem_``i``_wlast(m_axi_mem_``i``_wlast), \
.m_axi_mem_``i``_arvalid(m_axi_mem_``i``_arvalid), \
.m_axi_mem_``i``_arready(m_axi_mem_``i``_arready), \
.m_axi_mem_``i``_araddr(m_axi_mem_``i``_araddr), \
.m_axi_mem_``i``_arid(m_axi_mem_``i``_arid), \
.m_axi_mem_``i``_arlen(m_axi_mem_``i``_arlen), \
.m_axi_mem_``i``_rvalid(m_axi_mem_``i``_rvalid), \
.m_axi_mem_``i``_rready(m_axi_mem_``i``_rready), \
.m_axi_mem_``i``_rdata(m_axi_mem_``i``_rdata), \
.m_axi_mem_``i``_rlast(m_axi_mem_``i``_rlast), \
.m_axi_mem_``i``_rid(m_axi_mem_``i``_rid), \
.m_axi_mem_``i``_rresp(m_axi_mem_``i``_rresp), \
.m_axi_mem_``i``_bvalid(m_axi_mem_``i``_bvalid), \
.m_axi_mem_``i``_bready(m_axi_mem_``i``_bready), \
.m_axi_mem_``i``_bresp(m_axi_mem_``i``_bresp), \
.m_axi_mem_``i``_bid(m_axi_mem_``i``_bid)
// AXI_MEM_TO_ARRAY(i): continuous assignments that bridge the flat per-bank
// ports m_axi_mem_<i>_<signal> and the array-style signals
// m_axi_mem_<signal>_a[i].
//   - master-driven signals (valid/addr/id/len/data/strb/last and the
//     rready/bready handshakes) are driven from the arrays onto the ports;
//   - slave-driven signals (ready handshakes and the R/B response fields) are
//     copied from the ports into the arrays.
// The m_axi_mem_*_a arrays are not declared here; they must exist at the
// expansion site.
// The last assignment carries no trailing semicolon, so the caller supplies
// the statement terminator.
`define AXI_MEM_TO_ARRAY(i) \
assign m_axi_mem_``i``_awvalid = m_axi_mem_awvalid_a[i]; \
assign m_axi_mem_awready_a[i] = m_axi_mem_``i``_awready; \
assign m_axi_mem_``i``_awaddr = m_axi_mem_awaddr_a[i]; \
assign m_axi_mem_``i``_awid = m_axi_mem_awid_a[i]; \
assign m_axi_mem_``i``_awlen = m_axi_mem_awlen_a[i]; \
assign m_axi_mem_``i``_wvalid = m_axi_mem_wvalid_a[i]; \
assign m_axi_mem_wready_a[i] = m_axi_mem_``i``_wready; \
assign m_axi_mem_``i``_wdata = m_axi_mem_wdata_a[i]; \
assign m_axi_mem_``i``_wstrb = m_axi_mem_wstrb_a[i]; \
assign m_axi_mem_``i``_wlast = m_axi_mem_wlast_a[i]; \
assign m_axi_mem_``i``_arvalid = m_axi_mem_arvalid_a[i]; \
assign m_axi_mem_arready_a[i] = m_axi_mem_``i``_arready; \
assign m_axi_mem_``i``_araddr = m_axi_mem_araddr_a[i]; \
assign m_axi_mem_``i``_arid = m_axi_mem_arid_a[i]; \
assign m_axi_mem_``i``_arlen = m_axi_mem_arlen_a[i]; \
assign m_axi_mem_rvalid_a[i] = m_axi_mem_``i``_rvalid; \
assign m_axi_mem_``i``_rready = m_axi_mem_rready_a[i]; \
assign m_axi_mem_rdata_a[i] = m_axi_mem_``i``_rdata; \
assign m_axi_mem_rlast_a[i] = m_axi_mem_``i``_rlast; \
assign m_axi_mem_rid_a[i] = m_axi_mem_``i``_rid; \
assign m_axi_mem_rresp_a[i] = m_axi_mem_``i``_rresp; \
assign m_axi_mem_bvalid_a[i] = m_axi_mem_``i``_bvalid; \
assign m_axi_mem_``i``_bready = m_axi_mem_bready_a[i]; \
assign m_axi_mem_bresp_a[i] = m_axi_mem_``i``_bresp; \
assign m_axi_mem_bid_a[i] = m_axi_mem_``i``_bid
`include "VX_define.vh"
`endif // VORTEX_AFU_VH

View File

@@ -1,511 +0,0 @@
`include "VX_cache_define.vh"
// VX_bank: one bank of the cache.
//
// Four request sources are arbitrated each cycle, in fixed priority order:
// flush > MSHR replay > memory fill response > core request. The selected
// request flows through a two-stage pipeline:
//   st0 (pipe_reg0 output): tag access (lookup / fill / flush) and MSHR
//                           allocate / lookup;
//   st1 (pipe_reg1 output): data access (read / fill / write), core response
//                           enqueue and memory request enqueue.
// Both pipeline registers hold their contents while the core response queue
// cannot accept a response (crsq_stall).
module VX_bank #(
parameter CACHE_ID = 0,
parameter BANK_ID = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 1,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 1,
// Number of banks
parameter NUM_BANKS = 1,
// Number of ports per banks
parameter NUM_PORTS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Core Request Queue Size
parameter CREQ_SIZE = 1,
// Core Response Queue Size
parameter CRSQ_SIZE = 1,
// Miss reservation (MSHR) queue size
parameter MSHR_SIZE = 1,
// Memory Request Queue Size
parameter MREQ_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// core request tag size
parameter CORE_TAG_WIDTH = 1,
// bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = 0,
// derived widths (not meant to be overridden by the instantiating module)
parameter MSHR_ADDR_WIDTH = $clog2(MSHR_SIZE),
parameter WORD_SELECT_BITS = `UP(`WORD_SELECT_BITS)
) (
`SCOPE_IO_VX_bank
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
// performance counter event strobes
output wire perf_read_misses,
output wire perf_write_misses,
output wire perf_mshr_stalls,
`endif
// Core Request
input wire core_req_valid,
input wire [NUM_PORTS-1:0] core_req_pmask,
input wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] core_req_wsel,
input wire [NUM_PORTS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_req_data,
input wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_req_tid,
input wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag,
input wire core_req_rw,
input wire [`LINE_ADDR_WIDTH-1:0] core_req_addr,
output wire core_req_ready,
// Core Response
output wire core_rsp_valid,
output wire [NUM_PORTS-1:0] core_rsp_pmask,
output wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_rsp_tid,
output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
output wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag,
input wire core_rsp_ready,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [NUM_PORTS-1:0] mem_req_pmask,
output wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen,
output wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel,
output wire [`LINE_ADDR_WIDTH-1:0] mem_req_addr,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id,
output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
input wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data,
output wire mem_rsp_ready,
// flush
input wire flush_enable,
input wire [`LINE_SELECT_BITS-1:0] flush_addr
);
// debug request identifiers extracted from port 0's tag at each stage
`IGNORE_UNUSED_BEGIN
wire [`DBG_CACHE_REQ_IDW-1:0] req_id_sel, req_id_st0, req_id_st1;
`IGNORE_UNUSED_END
// core request as seen at the output of the input queue
wire [NUM_PORTS-1:0] creq_pmask;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] creq_wsel;
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] creq_byteen;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] creq_data;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] creq_tid;
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] creq_tag;
wire creq_rw;
wire [`LINE_ADDR_WIDTH-1:0] creq_addr;
wire creq_valid, creq_ready;
// input queue buffering incoming core requests (depth CREQ_SIZE)
VX_elastic_buffer #(
.DATAW (1 + `LINE_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SELECT_BITS + WORD_SIZE + `WORD_WIDTH + `REQS_BITS + CORE_TAG_WIDTH)),
.SIZE (CREQ_SIZE)
) core_req_queue (
.clk (clk),
.reset (reset),
.ready_in (core_req_ready),
.valid_in (core_req_valid),
.data_in ({core_req_rw, core_req_addr, core_req_pmask, core_req_wsel, core_req_byteen, core_req_data, core_req_tid, core_req_tag}),
.data_out ({creq_rw, creq_addr, creq_pmask, creq_wsel, creq_byteen, creq_data, creq_tid, creq_tag}),
.ready_out (creq_ready),
.valid_out (creq_valid)
);
wire mreq_alm_full;
// line address of the incoming fill; connected to miss_resrv.fill_addr below
// (presumably looked up from mem_rsp_id -- confirm in VX_miss_resrv)
wire [`LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
wire crsq_valid, crsq_ready;
wire crsq_stall;
// MSHR dequeue (replay) interface
wire mshr_valid;
wire mshr_ready;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id;
wire mshr_alm_full;
wire [MSHR_ADDR_WIDTH-1:0] mshr_dequeue_id;
wire [`LINE_ADDR_WIDTH-1:0] mshr_addr;
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] mshr_tag;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mshr_wsel;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] mshr_tid;
wire [NUM_PORTS-1:0] mshr_pmask;
// pipeline state: *_st0 is the output of pipe_reg0, *_st1 the output of pipe_reg1
wire [`LINE_ADDR_WIDTH-1:0] addr_st0, addr_st1;
wire is_read_st0, is_read_st1;
wire is_write_st0, is_write_st1;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] wsel_st0, wsel_st1;
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] byteen_st0, byteen_st1;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] req_tid_st0, req_tid_st1;
wire [NUM_PORTS-1:0] pmask_st0, pmask_st1;
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] tag_st0, tag_st1;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] rdata_st1;
wire [`CACHE_LINE_WIDTH-1:0] wdata_st0, wdata_st1;
wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1;
wire valid_st0, valid_st1;
wire is_fill_st0, is_fill_st1;
wire is_mshr_st0, is_mshr_st1;
wire miss_st0, miss_st1;
wire is_flush_st0;
wire mshr_pending_st0, mshr_pending_st1;
// prevent read-during-write hazard when accessing tags/data block RAMs
wire rdw_fill_hazard = valid_st0 && is_fill_st0;
wire rdw_write_hazard = valid_st0 && is_write_st0 && ~creq_rw;
// determine which queue to pop next in priority order
// (flush first, then MSHR replay, then memory response, then core request)
wire mshr_grant = !flush_enable;
wire mshr_enable = mshr_grant && mshr_valid;
wire mrsq_grant = !flush_enable && !mshr_enable;
wire mrsq_enable = mrsq_grant && mem_rsp_valid;
wire creq_grant = !flush_enable && !mshr_enable && !mrsq_enable;
wire creq_enable = creq_grant && creq_valid;
assign mshr_ready = mshr_grant
&& !rdw_fill_hazard // prevent read-during-write hazard
&& !crsq_stall; // ensure core_rsp_queue not full
assign mem_rsp_ready = mrsq_grant
&& !crsq_stall; // ensure core_rsp_queue not full
assign creq_ready = creq_grant
&& !rdw_write_hazard // prevent read-during-write hazard
&& !mreq_alm_full // ensure mem_req_queue not full
&& !mshr_alm_full // ensure mshr not full
&& !crsq_stall; // ensure core_rsp_queue not full
// a source "fires" when its request actually enters the pipeline this cycle;
// a flush enters unconditionally whenever flush_enable is asserted
wire flush_fire = flush_enable;
wire mshr_fire = mshr_valid && mshr_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire creq_fire = creq_valid && creq_ready;
assign req_id_sel = mshr_enable ? mshr_tag[0][`CACHE_REQ_ID_RNG] : creq_tag[0][`CACHE_REQ_ID_RNG];
// data entering the pipeline: the low NUM_PORTS words carry core write data
// unless a memory response is valid or the cache is read-only; the remaining
// bits of the line always come from the memory response
wire [`CACHE_LINE_WIDTH-1:0] wdata_sel;
assign wdata_sel[(NUM_PORTS * `WORD_WIDTH)-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[(NUM_PORTS * `WORD_WIDTH)-1:0] : creq_data;
for (genvar i = NUM_PORTS * `WORD_WIDTH; i < `CACHE_LINE_WIDTH; ++i) begin
assign wdata_sel[i] = mem_rsp_data[i];
end
// stage 0 register: captures the arbitration winner; only the valid bit is
// reset (RESETW = 1); held while crsq_stall is asserted
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (!crsq_stall),
.data_in ({
flush_fire || mshr_fire || mem_rsp_fire || creq_fire,
flush_enable,
mshr_enable,
mrsq_enable,
creq_enable && ~creq_rw,
creq_enable && creq_rw,
flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : (mshr_valid ? mshr_addr : (mem_rsp_valid ? mem_rsp_addr : creq_addr)),
wdata_sel,
mshr_valid ? mshr_wsel : creq_wsel,
creq_byteen,
mshr_valid ? mshr_tid : creq_tid,
mshr_valid ? mshr_pmask : creq_pmask,
mshr_valid ? mshr_tag : creq_tag,
mshr_valid ? mshr_dequeue_id : mem_rsp_id
}),
.data_out ({valid_st0, is_flush_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0})
);
assign req_id_st0 = tag_st0[0][`CACHE_REQ_ID_RNG];
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_flush_st0 = valid_st0 && is_flush_st0;
// everything that is neither a fill nor a flush performs a tag lookup
wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_flush_st0);
wire tag_match_st0;
VX_tag_access #(
.BANK_ID (BANK_ID),
.CACHE_ID (CACHE_ID),
.CACHE_SIZE (CACHE_SIZE),
.CACHE_LINE_SIZE (CACHE_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE (WORD_SIZE),
.BANK_ADDR_OFFSET (BANK_ADDR_OFFSET)
) tag_access (
.clk (clk),
.reset (reset),
.req_id (req_id_st0),
.stall (crsq_stall),
// read/Fill
.lookup (do_lookup_st0),
.addr (addr_st0),
.fill (do_fill_st0),
.flush (do_flush_st0),
.tag_match (tag_match_st0)
);
// a core request (read or write) missed: its tag lookup did not match
assign miss_st0 = (is_read_st0 || is_write_st0) && ~tag_match_st0;
// core requests carry the MSHR id allocated at this stage; replays and fills
// keep the id that was selected together with them
wire [MSHR_ADDR_WIDTH-1:0] mshr_id_a_st0 = (is_read_st0 || is_write_st0) ? mshr_alloc_id : mshr_id_st0;
// stage 1 register: same hold condition as stage 0; the flush flag is not
// carried forward
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH + 1),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (!crsq_stall),
.data_in ({valid_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, miss_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_a_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_mshr_st1, is_fill_st1, is_read_st1, is_write_st1, miss_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1})
);
assign req_id_st1 = tag_st1[0][`CACHE_REQ_ID_RNG];
wire do_read_st0 = valid_st0 && is_read_st0;
wire do_read_st1 = valid_st1 && is_read_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_write_st1 = valid_st1 && is_write_st1;
wire do_mshr_st1 = valid_st1 && is_mshr_st1;
// core write data occupies the low NUM_PORTS words of the pipelined line data
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] creq_data_st1 = wdata_st1[0 +: NUM_PORTS * `WORD_WIDTH];
`UNUSED_VAR (wdata_st1)
// data store: read for core reads and MSHR replays, full-line write for
// fills, word write for core writes that hit
VX_data_access #(
.BANK_ID (BANK_ID),
.CACHE_ID (CACHE_ID),
.CACHE_SIZE (CACHE_SIZE),
.CACHE_LINE_SIZE(CACHE_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_PORTS (NUM_PORTS),
.WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE)
) data_access (
.clk (clk),
.reset (reset),
.req_id (req_id_st1),
.stall (crsq_stall),
.read (do_read_st1 || do_mshr_st1),
.fill (do_fill_st1),
.write (do_write_st1 && !miss_st1),
.addr (addr_st1),
.wsel (wsel_st1),
.pmask (pmask_st1),
.byteen (byteen_st1),
.fill_data (wdata_st1),
.write_data (creq_data_st1),
.read_data (rdata_st1)
);
// MSHR control: every core read allocates an entry at st0 and, if it turns
// out to be a hit, releases it again at st1; a fill at st0 triggers replay of
// the entries waiting on that line
wire mshr_allocate = do_read_st0 && !crsq_stall;
wire mshr_replay = do_fill_st0 && !crsq_stall;
wire mshr_lookup = mshr_allocate;
wire mshr_release = do_read_st1 && !miss_st1 && !crsq_stall;
// tracks outstanding MSHR reservations: incremented when a core read is
// accepted, decremented on replay dequeue or hit release
// NOTE(review): decr is a single bit, so a cycle in which mshr_fire and
// mshr_release are both asserted decrements only once -- confirm whether
// the two can coincide and whether VX_pending_size tolerates it
VX_pending_size #(
.SIZE (MSHR_SIZE)
) mshr_pending_size (
.clk (clk),
.reset (reset),
.incr (creq_fire && ~creq_rw),
.decr (mshr_fire || mshr_release),
.full (mshr_alm_full),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
);
VX_miss_resrv #(
.BANK_ID (BANK_ID),
.CACHE_ID (CACHE_ID),
.CACHE_LINE_SIZE (CACHE_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_PORTS (NUM_PORTS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MSHR_SIZE (MSHR_SIZE),
.CORE_TAG_WIDTH (CORE_TAG_WIDTH)
) miss_resrv (
.clk (clk),
.reset (reset),
.deq_req_id (req_id_sel),
.lkp_req_id (req_id_st0),
.rel_req_id (req_id_st1),
// allocate
.allocate_valid (mshr_allocate),
.allocate_addr (addr_st0),
.allocate_data ({wsel_st0, tag_st0, req_tid_st0, pmask_st0}),
.allocate_id (mshr_alloc_id),
`UNUSED_PIN (allocate_ready),
// lookup
.lookup_valid (mshr_lookup),
.lookup_replay (mshr_replay),
.lookup_id (mshr_alloc_id),
.lookup_addr (addr_st0),
.lookup_match (mshr_pending_st0),
// fill
.fill_valid (mem_rsp_fire),
.fill_id (mem_rsp_id),
.fill_addr (mem_rsp_addr),
// dequeue
.dequeue_valid (mshr_valid),
.dequeue_id (mshr_dequeue_id),
.dequeue_addr (mshr_addr),
.dequeue_data ({mshr_wsel, mshr_tag, mshr_tid, mshr_pmask}),
.dequeue_ready (mshr_ready),
// release
.release_valid (mshr_release),
.release_id (mshr_id_st1)
);
// Enqueue core response
wire [NUM_PORTS-1:0] crsq_pmask;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] crsq_data;
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] crsq_tid;
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] crsq_tag;
// a response is produced for read hits and for MSHR replays
assign crsq_valid = (do_read_st1 && !miss_st1)
|| do_mshr_st1;
// stalls the whole pipeline while a pending response cannot be enqueued
assign crsq_stall = crsq_valid && !crsq_ready;
assign crsq_pmask = pmask_st1;
assign crsq_tid = req_tid_st1;
assign crsq_data = rdata_st1;
assign crsq_tag = tag_st1;
VX_elastic_buffer #(
.DATAW (NUM_PORTS * (CORE_TAG_WIDTH + 1 + `WORD_WIDTH + `REQS_BITS)),
.SIZE (CRSQ_SIZE),
.OUT_REG (1)
) core_rsp_req (
.clk (clk),
.reset (reset),
.valid_in (crsq_valid),
.data_in ({crsq_tag, crsq_pmask, crsq_data, crsq_tid}),
.ready_in (crsq_ready),
.valid_out (core_rsp_valid),
.data_out ({core_rsp_tag, core_rsp_pmask, core_rsp_data, core_rsp_tid}),
.ready_out (core_rsp_ready)
);
// Enqueue memory request
wire mreq_push, mreq_pop, mreq_empty;
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mreq_data;
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mreq_byteen;
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mreq_wsel;
wire [NUM_PORTS-1:0] mreq_pmask;
wire [`LINE_ADDR_WIDTH-1:0] mreq_addr;
wire [MSHR_ADDR_WIDTH-1:0] mreq_id;
wire mreq_rw;
// a memory request is issued for a read miss with no matching MSHR entry
// already pending, and for every core write (hit or miss)
assign mreq_push = (do_read_st1 && miss_st1 && !mshr_pending_st1)
|| do_write_st1;
assign mreq_pop = mem_req_valid && mem_req_ready;
assign mreq_rw = WRITE_ENABLE && is_write_st1;
assign mreq_addr = addr_st1;
assign mreq_id = mshr_id_st1;
assign mreq_pmask= pmask_st1;
assign mreq_wsel = wsel_st1;
assign mreq_byteen = byteen_st1;
assign mreq_data = creq_data_st1;
// the almost-full flag (threshold MREQ_SIZE-2) back-pressures core requests
// through creq_ready above
VX_fifo_queue #(
.DATAW (1 + `LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)),
.SIZE (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-2),
.OUT_REG (1 == NUM_BANKS)
) mem_req_queue (
.clk (clk),
.reset (reset),
.push (mreq_push),
.pop (mreq_pop),
.data_in ({mreq_rw, mreq_addr, mreq_id, mreq_pmask, mreq_byteen, mreq_wsel, mreq_data}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_pmask, mem_req_byteen, mem_req_wsel, mem_req_data}),
.empty (mreq_empty),
.alm_full (mreq_alm_full),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
assign mem_req_valid = !mreq_empty;
///////////////////////////////////////////////////////////////////////////////
// signals exported to the debug scope
`SCOPE_ASSIGN (valid_st0, valid_st0);
`SCOPE_ASSIGN (valid_st1, valid_st1);
`SCOPE_ASSIGN (is_fill_st0, is_fill_st0);
`SCOPE_ASSIGN (is_mshr_st0, is_mshr_st0);
`SCOPE_ASSIGN (miss_st0, miss_st0);
`SCOPE_ASSIGN (crsq_stall, crsq_stall);
`SCOPE_ASSIGN (mreq_alm_full, mreq_alm_full);
`SCOPE_ASSIGN (mshr_alm_full, mshr_alm_full);
`SCOPE_ASSIGN (addr_st0, `LINE_TO_BYTE_ADDR(addr_st0, BANK_ID));
`SCOPE_ASSIGN (addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
`ifdef PERF_ENABLE
// performance events, sampled at stage 1
assign perf_read_misses = do_read_st1 && miss_st1;
assign perf_write_misses = do_write_st1 && miss_st1;
assign perf_mshr_stalls = mshr_alm_full;
`endif
`ifdef DBG_TRACE_CACHE_BANK
// simulation-only tracing of pipeline events
wire crsq_fire = crsq_valid && crsq_ready;
// some source has a request pending but none was accepted this cycle
wire pipeline_stall = (mshr_valid || mem_rsp_valid || creq_valid)
&& ~(mshr_fire || mem_rsp_fire || creq_fire);
always @(posedge clk) begin
if (pipeline_stall) begin
dpi_trace("%d: *** cache%0d:%0d stall: crsq=%b, mreq=%b, mshr=%b\n", $time, CACHE_ID, BANK_ID, crsq_stall, mreq_alm_full, mshr_alm_full);
end
if (flush_enable) begin
dpi_trace("%d: cache%0d:%0d flush: addr=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(flush_addr, BANK_ID));
end
if (mem_rsp_fire) begin
dpi_trace("%d: cache%0d:%0d fill-rsp: addr=%0h, id=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data);
end
if (mshr_fire) begin
dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, req_id_sel);
end
if (creq_fire) begin
if (creq_rw)
dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, req_id_sel);
else
dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, req_id_sel);
end
if (crsq_fire) begin
dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, req_id_st1);
end
if (mreq_push) begin
if (is_write_st1)
dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, req_id_st1);
else
dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, req_id_st1);
end
end
`endif
endmodule

1047
hw/rtl/cache/VX_cache.sv vendored

File diff suppressed because it is too large Load Diff

549
hw/rtl/cache/VX_cache_bank.sv vendored Normal file
View File

@@ -0,0 +1,549 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
module VX_cache_bank #(
parameter `STRING INSTANCE_ID= "",
parameter BANK_ID = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 1,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 1,
// Memory Request Queue Size
parameter MREQ_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// Core response output register
parameter CORE_OUT_REG = 0,
// Memory request output register
parameter MEM_OUT_REG = 0,
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE),
parameter REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS),
parameter WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS)
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output wire perf_read_misses,
output wire perf_write_misses,
output wire perf_mshr_stalls,
`endif
// Core Request
input wire core_req_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] core_req_addr,
input wire core_req_rw,
input wire [WORD_SEL_WIDTH-1:0] core_req_wsel,
input wire [WORD_SIZE-1:0] core_req_byteen,
input wire [`CS_WORD_WIDTH-1:0] core_req_data,
input wire [TAG_WIDTH-1:0] core_req_tag,
input wire [REQ_SEL_WIDTH-1:0] core_req_idx,
output wire core_req_ready,
// Core Response
output wire core_rsp_valid,
output wire [`CS_WORD_WIDTH-1:0] core_rsp_data,
output wire [TAG_WIDTH-1:0] core_rsp_tag,
output wire [REQ_SEL_WIDTH-1:0] core_rsp_idx,
input wire core_rsp_ready,
// Memory request
output wire mem_req_valid,
output wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr,
output wire mem_req_rw,
output wire [WORD_SEL_WIDTH-1:0] mem_req_wsel,
output wire [WORD_SIZE-1:0] mem_req_byteen,
output wire [`CS_WORD_WIDTH-1:0] mem_req_data,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
output wire mem_rsp_ready,
// initialization
input wire init_enable,
input wire [`CS_LINE_SEL_BITS-1:0] init_line_sel
);
`IGNORE_UNUSED_BEGIN
wire [`UP(UUID_WIDTH)-1:0] req_uuid_sel, req_uuid_st0, req_uuid_st1;
`IGNORE_UNUSED_END
wire crsq_stall;
wire mshr_alm_full;
wire mreq_alm_full;
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
wire replay_valid;
wire [`CS_LINE_ADDR_WIDTH-1:0] replay_addr;
wire replay_rw;
wire [WORD_SEL_WIDTH-1:0] replay_wsel;
wire [WORD_SIZE-1:0] replay_byteen;
wire [`CS_WORD_WIDTH-1:0] replay_data;
wire [TAG_WIDTH-1:0] replay_tag;
wire [REQ_SEL_WIDTH-1:0] replay_idx;
wire [MSHR_ADDR_WIDTH-1:0] replay_id;
wire replay_ready;
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1;
wire rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_st0, wsel_st1;
wire [WORD_SIZE-1:0] byteen_st0, byteen_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_st0, tag_st1;
wire [`CS_WORD_WIDTH-1:0] read_data_st1;
wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1;
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1;
wire valid_sel, valid_st0, valid_st1;
wire is_init_st0;
wire is_creq_st0, is_creq_st1;
wire is_fill_st0, is_fill_st1;
wire is_replay_st0, is_replay_st1;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_tail_st0, mshr_tail_st1;
wire mshr_pending_st0, mshr_pending_st1;
wire rdw_hazard_st0;
reg rdw_hazard_st1;
wire pipe_stall = crsq_stall || rdw_hazard_st1;
// inputs arbitration:
// mshr replay has highest priority to maximize utilization since there is no miss.
// handle memory responses next to prevent deadlock with potential memory request from a miss.
wire replay_grant = ~init_enable;
wire replay_enable = replay_grant && replay_valid;
wire fill_grant = ~init_enable && ~replay_enable;
wire fill_enable = fill_grant && mem_rsp_valid;
wire creq_grant = ~init_enable && ~replay_enable && ~fill_enable;
wire creq_enable = creq_grant && core_req_valid;
assign replay_ready = replay_grant
&& ~rdw_hazard_st0
&& ~pipe_stall;
assign mem_rsp_ready = fill_grant
&& ~pipe_stall;
assign core_req_ready = creq_grant
&& ~mreq_alm_full
&& ~mshr_alm_full
&& ~pipe_stall;
wire init_fire = init_enable;
wire replay_fire = replay_valid && replay_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire core_req_fire = core_req_valid && core_req_ready;
wire [TAG_WIDTH-1:0] mshr_creq_tag = replay_enable ? replay_tag : core_req_tag;
if (UUID_WIDTH != 0) begin
assign req_uuid_sel = mshr_creq_tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_sel = 0;
end
`UNUSED_VAR (mshr_creq_tag)
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || core_req_fire;
assign addr_sel = init_enable ? `CS_LINE_ADDR_WIDTH'(init_line_sel) :
(replay_valid ? replay_addr :
(mem_rsp_valid ? mem_rsp_addr : core_req_addr));
assign data_sel[`CS_WORD_WIDTH-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : (replay_valid ? replay_data : core_req_data);
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
assign data_sel[i] = mem_rsp_data[i];
end
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({
valid_sel,
init_enable,
replay_enable,
fill_enable,
creq_enable,
addr_sel,
data_sel,
replay_valid ? replay_rw : core_req_rw,
replay_valid ? replay_byteen : core_req_byteen,
replay_valid ? replay_wsel : core_req_wsel,
replay_valid ? replay_idx : core_req_idx,
replay_valid ? replay_tag : core_req_tag,
replay_id
}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_creq_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
);
if (UUID_WIDTH != 0) begin
assign req_uuid_st0 = tag_st0[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_st0 = 0;
end
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_init_st0 = valid_st0 && is_init_st0;
wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_init_st0);
wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
wire [NUM_WAYS-1:0] tag_matches_st0, tag_matches_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
`RESET_RELAY (tag_reset, reset);
VX_cache_tags #(
.INSTANCE_ID(INSTANCE_ID),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.UUID_WIDTH (UUID_WIDTH)
) cache_tags (
.clk (clk),
.reset (tag_reset),
.req_uuid (req_uuid_st0),
.stall (pipe_stall),
// read/Fill
.lookup (do_lookup_st0),
.line_addr (addr_st0),
.fill (do_fill_st0),
.init (do_init_st0),
.way_sel (way_sel_st0),
.tag_matches(tag_matches_st0)
);
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + NUM_WAYS + 1),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({valid_st0, is_replay_st0, is_fill_st0, is_creq_st0, rw_st0, addr_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_tail_st0, tag_matches_st0, way_sel_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_replay_st1, is_fill_st1, is_creq_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_tail_st1, tag_matches_st1, way_sel_st1, mshr_pending_st1})
);
// we have a tag hit
wire is_hit_st1 = (| tag_matches_st1);
if (UUID_WIDTH != 0) begin
assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_st1 = 0;
end
wire do_creq_rd_st1 = valid_st1 && is_creq_st1 && ~rw_st1;
wire do_creq_wr_st1 = valid_st1 && is_creq_st1 && rw_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1;
wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1;
wire do_read_hit_st1 = do_creq_rd_st1 && is_hit_st1;
wire do_read_miss_st1 = do_creq_rd_st1 && ~is_hit_st1;
wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1;
wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1;
`UNUSED_VAR (do_write_miss_st1)
// ensure mshr replay always get a hit
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: invalid mshr replay"));
// detect BRAM's read-during-write hazard
assign rdw_hazard_st0 = do_fill_st0; // after a fill
always @(posedge clk) begin
rdw_hazard_st1 <= (do_creq_rd_st0 && do_write_hit_st1 && (addr_st0 == addr_st1))
&& ~rdw_hazard_st1; // after a write to same address
end
wire [`CS_WORD_WIDTH-1:0] write_data_st1 = data_st1[`CS_WORD_WIDTH-1:0];
wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1;
`RESET_RELAY (data_reset, reset);
VX_cache_data #(
.INSTANCE_ID (INSTANCE_ID),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.UUID_WIDTH (UUID_WIDTH)
) cache_data (
.clk (clk),
.reset (data_reset),
.req_uuid (req_uuid_st1),
.stall (pipe_stall),
.read (do_read_hit_st1 || do_replay_rd_st1),
.fill (do_fill_st1),
.write (do_write_hit_st1 || do_replay_wr_st1),
.way_sel (way_sel_st1 | tag_matches_st1),
.line_addr (addr_st1),
.wsel (wsel_st1),
.byteen (byteen_st1),
.fill_data (fill_data_st1),
.write_data (write_data_st1),
.read_data (read_data_st1)
);
wire [MSHR_SIZE-1:0] mshr_matches_st0;
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall;
wire mshr_lookup_st0 = mshr_allocate_st0;
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall;
wire mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
VX_pending_size #(
.SIZE (MSHR_SIZE)
) mshr_pending_size (
.clk (clk),
.reset (reset),
.incr (core_req_fire),
.decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)),
.full (mshr_alm_full),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
);
`RESET_RELAY (mshr_reset, reset);
VX_cache_mshr #(
.INSTANCE_ID (INSTANCE_ID),
.BANK_ID (BANK_ID),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.MSHR_SIZE (MSHR_SIZE),
.UUID_WIDTH (UUID_WIDTH),
.DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH)
) cache_mshr (
.clk (clk),
.reset (mshr_reset),
.deq_req_uuid (req_uuid_sel),
.lkp_req_uuid (req_uuid_st0),
.fin_req_uuid (req_uuid_st1),
// memory fill
.fill_valid (mem_rsp_fire),
.fill_id (mem_rsp_id),
.fill_addr (mem_rsp_addr),
// dequeue
.dequeue_valid (replay_valid),
.dequeue_addr (replay_addr),
.dequeue_rw (replay_rw),
.dequeue_data ({replay_wsel, replay_byteen, replay_data, replay_tag, replay_idx}),
.dequeue_id (replay_id),
.dequeue_ready (replay_ready),
// allocate
.allocate_valid (mshr_allocate_st0),
.allocate_addr (addr_st0),
.allocate_rw (rw_st0),
.allocate_data ({wsel_st0, byteen_st0, write_data_st0, tag_st0, req_idx_st0}),
.allocate_id (mshr_alloc_id_st0),
.allocate_tail (mshr_tail_st0),
`UNUSED_PIN (allocate_ready),
// lookup
.lookup_valid (mshr_lookup_st0),
.lookup_addr (addr_st0),
.lookup_matches (mshr_matches_st0),
// finalize
.finalize_valid (mshr_finalize_st1),
.finalize_release(mshr_release_st1),
.finalize_pending(mshr_pending_st1),
.finalize_id (mshr_id_st1),
.finalize_tail (mshr_tail_st1)
);
// ignore allocated id from mshr matches
wire [MSHR_SIZE-1:0] lookup_matches;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign lookup_matches[i] = (i != mshr_alloc_id_st0) && mshr_matches_st0[i];
end
assign mshr_pending_st0 = (| lookup_matches);
// schedule core response
wire crsq_valid, crsq_ready;
wire [`CS_WORD_WIDTH-1:0] crsq_data;
wire [REQ_SEL_WIDTH-1:0] crsq_idx;
wire [TAG_WIDTH-1:0] crsq_tag;
assign crsq_valid = do_read_hit_st1 || do_replay_rd_st1;
assign crsq_idx = req_idx_st1;
assign crsq_data = read_data_st1;
assign crsq_tag = tag_st1;
`RESET_RELAY (crsp_reset, reset);
VX_elastic_buffer #(
.DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH),
.SIZE (CRSQ_SIZE),
.OUT_REG (CORE_OUT_REG)
) core_rsp_queue (
.clk (clk),
.reset (crsp_reset),
.valid_in (crsq_valid && ~rdw_hazard_st1),
.ready_in (crsq_ready),
.data_in ({crsq_tag, crsq_data, crsq_idx}),
.data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}),
.valid_out (core_rsp_valid),
.ready_out (core_rsp_ready)
);
assign crsq_stall = crsq_valid && ~crsq_ready;
// schedule memory request
wire mreq_push, mreq_pop, mreq_empty;
wire [`CS_WORD_WIDTH-1:0] mreq_data;
wire [WORD_SIZE-1:0] mreq_byteen;
wire [WORD_SEL_WIDTH-1:0] mreq_wsel;
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_addr;
wire [MSHR_ADDR_WIDTH-1:0] mreq_id;
wire mreq_rw;
assign mreq_push = (do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1;
assign mreq_pop = mem_req_valid && mem_req_ready;
assign mreq_rw = WRITE_ENABLE && rw_st1;
assign mreq_addr = addr_st1;
assign mreq_id = mshr_id_st1;
assign mreq_wsel = wsel_st1;
assign mreq_byteen = byteen_st1;
assign mreq_data = write_data_st1;
`RESET_RELAY (mreq_reset, reset);
VX_fifo_queue #(
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH),
.DEPTH (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-2),
.OUT_REG (MEM_OUT_REG)
) mem_req_queue (
.clk (clk),
.reset (mreq_reset),
.push (mreq_push),
.pop (mreq_pop),
.data_in ({mreq_rw, mreq_addr, mreq_id, mreq_byteen, mreq_wsel, mreq_data}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_wsel, mem_req_data}),
.empty (mreq_empty),
.alm_full (mreq_alm_full),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
assign mem_req_valid = ~mreq_empty;
///////////////////////////////////////////////////////////////////////////////
`ifdef PERF_ENABLE
assign perf_read_misses = do_read_miss_st1;
assign perf_write_misses = do_write_miss_st1;
assign perf_mshr_stalls = mshr_alm_full;
`endif
`ifdef DBG_TRACE_CACHE_BANK
wire crsq_fire = crsq_valid && crsq_ready;
wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire);
always @(posedge clk) begin
if (pipeline_stall) begin
`TRACE(3, ("%d: *** %s-bank%0d stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID, BANK_ID, crsq_stall, mreq_alm_full, mshr_alm_full));
end
if (init_enable) begin
`TRACE(2, ("%d: %s-bank%0d init: addr=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(init_line_sel, BANK_ID)));
end
if (mem_rsp_fire) begin
`TRACE(2, ("%d: %s-bank%0d fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
end
if (replay_fire) begin
`TRACE(2, ("%d: %s-bank%0d mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
end
if (core_req_fire) begin
if (core_req_rw)
`TRACE(2, ("%d: %s-bank%0d core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
else
`TRACE(2, ("%d: %s-bank%0d core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
end
if (crsq_fire) begin
`TRACE(2, ("%d: %s-bank%0d core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_idx, crsq_data, req_uuid_st1));
end
if (mreq_push) begin
if (do_creq_wr_st1)
`TRACE(2, ("%d: %s-bank%0d writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_addr, BANK_ID), mreq_byteen, mreq_data, req_uuid_st1));
else
`TRACE(2, ("%d: %s-bank%0d fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_addr, BANK_ID), mreq_id, req_uuid_st1));
end
end
`endif
endmodule

348
hw/rtl/cache/VX_cache_bypass.sv vendored Normal file
View File

@@ -0,0 +1,348 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
// VX_cache_bypass
//
// Splits core requests into two streams based on a flag bit in the request tag:
//  - requests whose tag bit NC_TAG_BIT is set (or every request when PASSTHRU != 0)
//    are arbitrated down to one request per cycle and sent out on the memory
//    request port (mem_req_*_out);
//  - all other requests are forwarded on the core_req_*_out ports
//    (presumably to the cache — confirm at the instantiation site).
// Memory responses are demultiplexed the same way: a response whose tag bit
// NC_TAG_BIT is set (or any response when PASSTHRU != 0) is returned on the
// core response port selected by the request index stored in its tag; other
// responses are forwarded on mem_rsp_*_out.
module VX_cache_bypass #(
// Number of core request/response ports
parameter NUM_REQS = 1,
// Bit position of the bypass ("non-cacheable") flag inside the tags
parameter NC_TAG_BIT = 0,
// Used as a bit count: CORE_TAG_OUT_WIDTH = CORE_TAG_IN_WIDTH - NC_ENABLE,
// and as the S parameter of the core-side VX_bits_remove/VX_bits_insert
parameter NC_ENABLE = 0,
// When non-zero every core request is bypassed and every memory response
// is treated as a bypass response
parameter PASSTHRU = 0,
parameter CORE_ADDR_WIDTH = 1,
// Core data size in bytes (CORE_DATA_WIDTH = CORE_DATA_SIZE * 8)
parameter CORE_DATA_SIZE = 1,
parameter CORE_TAG_IN_WIDTH = 1,
parameter MEM_ADDR_WIDTH = 1,
// Memory data size in bytes (MEM_DATA_WIDTH = MEM_DATA_SIZE * 8)
parameter MEM_DATA_SIZE = 1,
parameter MEM_TAG_IN_WIDTH = 1,
parameter MEM_TAG_OUT_WIDTH = 1,
// Number of upper tag bits holding the request UUID (0 = no UUID)
parameter UUID_WIDTH = 0,
parameter CORE_DATA_WIDTH = CORE_DATA_SIZE * 8,
parameter MEM_DATA_WIDTH = MEM_DATA_SIZE * 8,
parameter CORE_TAG_OUT_WIDTH= CORE_TAG_IN_WIDTH - NC_ENABLE
) (
input wire clk,
input wire reset,
// Core request in
input wire [NUM_REQS-1:0] core_req_valid_in,
input wire [NUM_REQS-1:0] core_req_rw_in,
input wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_in,
input wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_in,
input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_in,
input wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_req_tag_in,
output wire [NUM_REQS-1:0] core_req_ready_in,
// Core request out
output wire [NUM_REQS-1:0] core_req_valid_out,
output wire [NUM_REQS-1:0] core_req_rw_out,
output wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_out,
output wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_out,
output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_out,
output wire [NUM_REQS-1:0][CORE_TAG_OUT_WIDTH-1:0] core_req_tag_out,
input wire [NUM_REQS-1:0] core_req_ready_out,
// Core response in
input wire [NUM_REQS-1:0] core_rsp_valid_in,
input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_in,
input wire [NUM_REQS-1:0][CORE_TAG_OUT_WIDTH-1:0] core_rsp_tag_in,
output wire [NUM_REQS-1:0] core_rsp_ready_in,
// Core response out
output wire [NUM_REQS-1:0] core_rsp_valid_out,
output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_out,
output wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out,
input wire [NUM_REQS-1:0] core_rsp_ready_out,
// Memory request in
input wire mem_req_valid_in,
input wire mem_req_rw_in,
input wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_in,
input wire [MEM_DATA_SIZE-1:0] mem_req_byteen_in,
input wire [MEM_DATA_WIDTH-1:0] mem_req_data_in,
input wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_in,
output wire mem_req_ready_in,
// Memory request out
output wire mem_req_valid_out,
output wire mem_req_rw_out,
output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_out,
output wire [MEM_DATA_SIZE-1:0] mem_req_byteen_out,
output wire [MEM_DATA_WIDTH-1:0] mem_req_data_out,
output wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_out,
input wire mem_req_ready_out,
// Memory response in
input wire mem_rsp_valid_in,
input wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_in,
input wire [MEM_TAG_OUT_WIDTH-1:0] mem_rsp_tag_in,
output wire mem_rsp_ready_in,
// Memory response out
output wire mem_rsp_valid_out,
output wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_out,
output wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_out,
input wire mem_rsp_ready_out
);
// NOTE(review): clk and reset are marked unused here although both are
// connected to req_arb below — these markers look stale; confirm.
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
// Bits needed to index one of the NUM_REQS core ports
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
// Width of the packed per-request bundle {tag, data, byteen, addr, rw}
localparam MUX_DATAW = CORE_TAG_IN_WIDTH + CORE_DATA_WIDTH + CORE_DATA_SIZE + CORE_ADDR_WIDTH + 1;
// Number of core-sized words in one memory data beat, and its index width
localparam WORDS_PER_LINE = MEM_DATA_SIZE / CORE_DATA_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
// Core tag bits below the UUID field
localparam CORE_TAG_ID_BITS = CORE_TAG_IN_WIDTH - UUID_WIDTH;
// ID portion of a bypass memory tag: {request index, word select, core tag id}
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
// Width of the bypass memory tag before mem_req_tag_bypass_nc_insert widens it
// by (NC_ENABLE ? 0 : 1) bits to MEM_TAG_OUT_WIDTH
localparam MEM_TAG_OUT_NC_WIDTH = MEM_TAG_OUT_WIDTH - 1 + NC_ENABLE;
// core request handling
wire [NUM_REQS-1:0] core_req_valid_in_nc;
wire [NUM_REQS-1:0] core_req_nc_idxs;
wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire core_req_nc_valid;
// Per-port bypass flag: constant 1 in pass-through mode, otherwise taken from
// bit NC_TAG_BIT of the incoming request tag.
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU != 0) begin
assign core_req_nc_idxs[i] = 1'b1;
end else begin
assign core_req_nc_idxs[i] = core_req_tag_in[i][NC_TAG_BIT];
end
end
// Valid requests that must take the bypass path
assign core_req_valid_in_nc = core_req_valid_in & core_req_nc_idxs;
// Asserted when any core request (bypass or not) completes its handshake;
// used as the arbiter unlock.
wire core_req_in_fire = | (core_req_valid_in & core_req_ready_in);
// Picks one bypass request per cycle. The arbiter type string is "R" in
// pass-through mode and "P" otherwise (presumably round-robin vs. priority —
// confirm against VX_generic_arbiter).
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.TYPE (PASSTHRU ? "R" : "P"),
.LOCK_ENABLE (1)
) req_arb (
.clk (clk),
.reset (reset),
.unlock (core_req_in_fire),
.requests (core_req_valid_in_nc),
.grant_index (core_req_nc_idx),
.grant_onehot (core_req_nc_sel),
.grant_valid (core_req_nc_valid)
);
// Non-bypass requests are forwarded unchanged except for the tag; only the
// valid bit is masked for bypassed ports.
assign core_req_valid_out = core_req_valid_in & ~core_req_nc_idxs;
assign core_req_rw_out = core_req_rw_in;
assign core_req_addr_out = core_req_addr_in;
assign core_req_byteen_out = core_req_byteen_in;
assign core_req_data_out = core_req_data_in;
// Narrow each forwarded tag from CORE_TAG_IN_WIDTH to CORE_TAG_OUT_WIDTH
// (presumably by dropping NC_ENABLE bits at NC_TAG_BIT — confirm against
// VX_bits_remove).
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_bits_remove #(
.N (CORE_TAG_IN_WIDTH),
.S (NC_ENABLE),
.POS (NC_TAG_BIT)
) core_req_tag_nc_remove (
.data_in (core_req_tag_in[i]),
.data_out (core_req_tag_out[i])
);
end
// A bypass request is accepted only when no memory request is pending on
// mem_req_valid_in, the memory output is ready, and this port holds the
// arbiter grant. A non-bypass request simply sees the forwarded ready.
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_ready_in[i] = core_req_valid_in_nc[i] ? (~mem_req_valid_in && mem_req_ready_out && core_req_nc_sel[i])
: core_req_ready_out[i];
end
// memory request handling
// mem_req_*_in always wins over the bypass request on every mux below.
assign mem_req_valid_out = mem_req_valid_in || core_req_nc_valid;
assign mem_req_ready_in = mem_req_ready_out;
wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel;
wire [CORE_DATA_WIDTH-1:0] core_req_data_in_sel;
wire [CORE_DATA_SIZE-1:0] core_req_byteen_in_sel;
wire [CORE_ADDR_WIDTH-1:0] core_req_addr_in_sel;
wire core_req_rw_in_sel;
// Pack every port's request fields so the granted one can be selected with a
// single indexed read.
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_nc_mux_in[i] = {core_req_tag_in[i], core_req_data_in[i], core_req_byteen_in[i], core_req_addr_in[i], core_req_rw_in[i]};
end
assign {core_req_tag_in_sel, core_req_data_in_sel, core_req_byteen_in_sel, core_req_addr_in_sel, core_req_rw_in_sel} = core_req_nc_mux_in[core_req_nc_idx];
// Low (non-UUID) bits of the granted request's tag
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_tag_in_sel[CORE_TAG_ID_BITS-1:0];
assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel;
// The bypass memory address is the core address with the low WSEL_BITS
// (word-in-line index) dropped.
assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in_sel[WSEL_BITS +: MEM_ADDR_WIDTH];
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
if (WORDS_PER_LINE > 1) begin
// Memory beat is wider than a core word: place the word into its slot.
reg [WORDS_PER_LINE-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in_r;
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
wire [WSEL_BITS-1:0] req_wsel = core_req_addr_in_sel[WSEL_BITS-1:0];
always @(*) begin
// byte enables of all other words are cleared
mem_req_byteen_in_r = '0;
mem_req_byteen_in_r[req_wsel] = core_req_byteen_in_sel;
// data of all other words is left as don't-care
mem_req_data_in_r = 'x;
mem_req_data_in_r[req_wsel] = core_req_data_in_sel;
end
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r;
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : mem_req_data_in_r;
// Tag id carries the word select (and the port index when NUM_REQS > 1) so
// the response can be routed back.
if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
end else begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id});
end
end else begin
// Core word and memory beat have the same size: no word placement needed.
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel;
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : core_req_data_in_sel;
if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
end else begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_in_id});
end
end
// Full bypass tag: the UUID (upper bits of the core tag), when present, stays
// in the upper bits, above the id field.
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_req_tag_bypass;
if (UUID_WIDTH != 0) begin
assign mem_req_tag_bypass = {core_req_tag_in_sel[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
end else begin
assign mem_req_tag_bypass = mem_req_tag_id_bypass;
end
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_bypass_nc;
wire [(MEM_TAG_IN_WIDTH + 1)-1:0] mem_req_tag_in_nc;
// Widens the bypass tag to MEM_TAG_OUT_WIDTH: S is 0 when NC_ENABLE is set
// and 1 otherwise (presumably inserting S bits of value sel_in at NC_TAG_BIT —
// confirm against VX_bits_insert).
VX_bits_insert #(
.N (MEM_TAG_OUT_NC_WIDTH),
.S (NC_ENABLE ? 0 : 1),
.POS (NC_TAG_BIT)
) mem_req_tag_bypass_nc_insert (
.data_in (mem_req_tag_bypass),
.sel_in (1'b0),
.data_out (mem_req_tag_bypass_nc)
);
// Widens the incoming memory-request tag by one bit (MEM_TAG_IN_WIDTH -> +1)
// at NC_TAG_BIT, with sel_in tied to 0.
VX_bits_insert #(
.N (MEM_TAG_IN_WIDTH),
.POS (NC_TAG_BIT)
) mem_req_tag_in_nc_insert (
.data_in (mem_req_tag_in),
.sel_in (1'b0),
.data_out (mem_req_tag_in_nc)
);
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : mem_req_tag_bypass_nc;
// core response handling
wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_in_nc;
// A memory response belongs to the bypass path when its tag bit NC_TAG_BIT is
// set; in pass-through mode every valid response does.
wire is_mem_rsp_nc;
if (PASSTHRU != 0) begin
assign is_mem_rsp_nc = mem_rsp_valid_in;
end else begin
assign is_mem_rsp_nc = mem_rsp_valid_in && mem_rsp_tag_in[NC_TAG_BIT];
end
// Widen each core_rsp_tag_in back to CORE_TAG_IN_WIDTH (mirror of
// core_req_tag_nc_remove), with sel_in tied to 0.
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_bits_insert #(
.N (CORE_TAG_OUT_WIDTH),
.S (NC_ENABLE),
.POS (NC_TAG_BIT)
) core_rsp_tag_in_nc_insert (
.data_in (core_rsp_tag_in[i]),
.sel_in ('0),
.data_out (core_rsp_tag_in_nc[i])
);
end
// Narrow the memory response tag to the bypass tag layout built on the
// request side (mirror of mem_req_tag_bypass_nc_insert).
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_rsp_tag_in_nc;
VX_bits_remove #(
.N (MEM_TAG_OUT_WIDTH),
.S (NC_ENABLE ? 0 : 1),
.POS (NC_TAG_BIT)
) mem_rsp_tag_in_nc_remove (
.data_in (mem_rsp_tag_in),
.data_out (mem_rsp_tag_in_nc)
);
// Destination core port, read from the field above {word select, core tag id}
wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx;
if (NUM_REQS > 1) begin
assign rsp_idx = mem_rsp_tag_in_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
end else begin
assign rsp_idx = 1'b0;
end
// One-hot valid for the destination port of a bypass response
reg [NUM_REQS-1:0] rsp_nc_valid_r;
always @(*) begin
rsp_nc_valid_r = '0;
rsp_nc_valid_r[rsp_idx] = is_mem_rsp_nc;
end
assign core_rsp_valid_out = core_rsp_valid_in | rsp_nc_valid_r;
assign core_rsp_ready_in = core_rsp_ready_out;
// Response data/tag muxes: core_rsp_*_in has priority on each port; the
// bypass response is only selected when core_rsp_valid_in[i] is low.
if (WORDS_PER_LINE > 1) begin
// Extract the requested word from the memory beat using the stored word select
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_in_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_out[i] = core_rsp_valid_in[i] ?
core_rsp_data_in[i] : mem_rsp_data_in[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end
end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_out[i] = core_rsp_valid_in[i] ? core_rsp_data_in[i] : mem_rsp_data_in;
end
end
// Rebuild the original core tag for a bypass response: UUID from the top bits
// of the memory tag, id from its low CORE_TAG_ID_BITS bits (the word-select
// and port-index fields in between are dropped).
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (UUID_WIDTH != 0) begin
assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_in_nc[i] : {mem_rsp_tag_in_nc[MEM_TAG_OUT_NC_WIDTH-1 -: UUID_WIDTH], mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0]};
end else begin
assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_in_nc[i] : mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0];
end
end
// memory response handling
// Only responses with tag bit NC_TAG_BIT clear are forwarded on
// mem_rsp_*_out; none are forwarded in pass-through mode.
if (PASSTHRU != 0) begin
assign mem_rsp_valid_out = 1'b0;
end else begin
assign mem_rsp_valid_out = mem_rsp_valid_in && ~mem_rsp_tag_in[NC_TAG_BIT];
end
assign mem_rsp_data_out = mem_rsp_data_in;
// Narrow the low MEM_TAG_IN_WIDTH + 1 bits of the response tag back to
// MEM_TAG_IN_WIDTH (mirror of mem_req_tag_in_nc_insert).
VX_bits_remove #(
.N (MEM_TAG_IN_WIDTH + 1),
.POS (NC_TAG_BIT)
) mem_rsp_tag_out_remove (
.data_in (mem_rsp_tag_in[(MEM_TAG_IN_WIDTH + 1)-1:0]),
.data_out (mem_rsp_tag_out)
);
// A bypass response is accepted only when its destination port has no
// competing core_rsp_valid_in and that port's consumer is ready; any other
// response follows mem_rsp_ready_out.
assign mem_rsp_ready_in = is_mem_rsp_nc ? (~core_rsp_valid_in[rsp_idx] && core_rsp_ready_out[rsp_idx]) : mem_rsp_ready_out;
endmodule

368
hw/rtl/cache/VX_cache_cluster.sv vendored Normal file
View File

@@ -0,0 +1,368 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// VX_cache_cluster
//
// Shares NUM_CACHES cache instances (VX_cache_wrap) among NUM_INPUTS
// requesters, each with NUM_REQS request lanes:
//  1. for every lane, a VX_mem_arb connects that lane of all NUM_INPUTS
//     requesters to the same lane of the NUM_CACHES caches;
//  2. each VX_cache_wrap serves its NUM_REQS arbitrated lanes;
//  3. a final VX_mem_arb merges the caches' memory ports into mem_bus_if.
module VX_cache_cluster #(
parameter `STRING INSTANCE_ID = "",
// Number of cache units; 0 selects pass-through mode (see PASSTHRU below)
parameter NUM_UNITS = 1,
// Number of requesters sharing the caches
parameter NUM_INPUTS = 1,
// Forwarded to the per-lane core arbiters (VX_mem_arb TAG_SEL_IDX)
parameter TAG_SEL_IDX = 0,
// Number of requests per cycle
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reservation Queue (MSHR) size knob
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// enable bypass for non-cacheable addresses
parameter NC_ENABLE = 0,
// Core response output register
parameter CORE_OUT_REG = 0,
// Memory request output register
parameter MEM_OUT_REG = 0
) (
input wire clk,
input wire reset,
// PERF
`ifdef PERF_ENABLE
VX_cache_perf_if.master cache_perf_if,
`endif
// Flattened as [input * NUM_REQS + lane]
VX_mem_bus_if.slave core_bus_if [NUM_INPUTS * NUM_REQS],
VX_mem_bus_if.master mem_bus_if
);
// Number of VX_cache_wrap instances (presumably `UP clamps 0 up to 1 so the
// pass-through case still instantiates one wrapper — confirm macro definition)
localparam NUM_CACHES = `UP(NUM_UNITS);
// No real cache unit requested: wrappers are instantiated in pass-through mode
localparam PASSTHRU = (NUM_UNITS == 0);
// Core tag width after the core-side arbiter has added its selection bits
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
// Memory-side tag width of one cache unit; the formula depends on whether the
// unit is pass-through and whether the non-cacheable bypass is enabled
localparam MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
// There must be at least as many requesters as cache units
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
`ifdef PERF_ENABLE
// Per-unit perf interfaces, combined into cache_perf_if by the macro below
VX_cache_perf_if perf_cache_unit_if[NUM_CACHES]();
`PERF_CACHE_ADD (cache_perf_if, perf_cache_unit_if, NUM_CACHES);
`endif
// Memory-side bus of each cache unit (line-sized data)
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) cache_mem_bus_if[NUM_CACHES]();
// Core-side buses after arbitration, flattened as [cache * NUM_REQS + lane]
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
// One arbiter per request lane i: gathers lane i of every requester,
// arbitrates NUM_INPUTS -> NUM_CACHES, and scatters the result to lane i of
// every cache unit.
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
) core_bus_tmp_if[NUM_INPUTS]();
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_tmp_if[NUM_CACHES]();
// Lane i of requester j lives at core_bus_if[j * NUM_REQS + i]
for (genvar j = 0; j < NUM_INPUTS; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]);
end
`RESET_RELAY (cache_arb_reset, reset);
// Request/response output registers are set to 2 only when the arbiter
// actually has to share caches (NUM_INPUTS != NUM_CACHES); otherwise 0.
VX_mem_arb #(
.NUM_INPUTS (NUM_INPUTS),
.NUM_OUTPUTS (NUM_CACHES),
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.OUT_REG_REQ ((NUM_INPUTS != NUM_CACHES) ? 2 : 0),
.OUT_REG_RSP ((NUM_INPUTS != NUM_CACHES) ? 2 : 0)
) cache_arb (
.clk (clk),
.reset (cache_arb_reset),
.bus_in_if (core_bus_tmp_if),
.bus_out_if (arb_core_bus_tmp_if)
);
// Lane i of cache k lives at arb_core_bus_if[k * NUM_REQS + i]
for (genvar k = 0; k < NUM_CACHES; ++k) begin
`ASSIGN_VX_MEM_BUS_IF (arb_core_bus_if[k * NUM_REQS + i], arb_core_bus_tmp_if[k]);
end
end
// Cache units. Each gets its own relayed reset and an instance id formed by
// appending the unit index to INSTANCE_ID.
for (genvar i = 0; i < NUM_CACHES; ++i) begin
`RESET_RELAY (cache_reset, reset);
// CORE_OUT_REG is overridden with 2 when the core-side arbiter shares caches,
// and MEM_OUT_REG is overridden with 2 when more than one cache feeds the
// memory arbiter; otherwise the module parameters are passed through.
VX_cache_wrap #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, i)),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (ARB_TAG_WIDTH),
.CORE_OUT_REG ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_REG),
.MEM_OUT_REG ((NUM_CACHES > 1) ? 2 : MEM_OUT_REG),
.NC_ENABLE (NC_ENABLE),
.PASSTHRU (PASSTHRU)
) cache_wrap (
`ifdef PERF_ENABLE
.cache_perf_if (perf_cache_unit_if[i]),
`endif
.clk (clk),
.reset (cache_reset),
.core_bus_if (arb_core_bus_if[i * NUM_REQS +: NUM_REQS]),
.mem_bus_if (cache_mem_bus_if[i])
);
end
`RESET_RELAY (mem_arb_reset, reset);
// Single-element interface array so it can be bound to the arbiter's output
// array port; its tag is widened by the memory arbiter's selection bits.
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
) mem_bus_tmp_if[1]();
// Merges the memory ports of all cache units into one
VX_mem_arb #(
.NUM_INPUTS (NUM_CACHES),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ ((NUM_CACHES > 1) ? 2 : 0),
.OUT_REG_RSP ((NUM_CACHES > 1) ? 2 : 0)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.bus_in_if (cache_mem_bus_if),
.bus_out_if (mem_bus_tmp_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
endmodule
///////////////////////////////////////////////////////////////////////////////
module VX_cache_cluster_top #(
parameter `STRING INSTANCE_ID = "",
parameter NUM_UNITS = 2,
parameter NUM_INPUTS = 4,
parameter TAG_SEL_IDX = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 4,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = 16,
// enable bypass for non-cacheable addresses
parameter NC_ENABLE = 1,
// Core response output register
parameter CORE_OUT_REG = 2,
// Memory request output register
parameter MEM_OUT_REG = 2,
parameter NUM_CACHES = `UP(NUM_UNITS),
parameter PASSTHRU = (NUM_UNITS == 0),
parameter ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES),
parameter MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS))
) (
input wire clk,
input wire reset,
// Core request
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_valid,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_rw,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_ready,
// Core response
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_valid,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_ready,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready
);
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
) core_bus_if[NUM_INPUTS * NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) mem_bus_if();
// Core request
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
for (genvar r = 0; r < NUM_REQS; ++r) begin
assign core_bus_if[i * NUM_REQS + r].req_valid = core_req_valid[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.rw = core_req_rw[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.byteen = core_req_byteen[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.addr = core_req_addr[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.data = core_req_data[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.tag = core_req_tag[i][r];
assign core_req_ready[i][r] = core_bus_if[i * NUM_REQS + r].req_ready;
end
end
// Core response
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
for (genvar r = 0; r < NUM_REQS; ++r) begin
assign core_rsp_valid[i][r] = core_bus_if[i * NUM_REQS + r].rsp_valid;
assign core_rsp_data[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.data;
assign core_rsp_tag[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.tag;
assign core_bus_if[i * NUM_REQS + r].rsp_ready = core_rsp_ready[i][r];
end
end
// Memory request
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen = mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
// Memory response
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
VX_cache_cluster #(
.INSTANCE_ID (INSTANCE_ID),
.NUM_UNITS (NUM_UNITS),
.NUM_INPUTS (NUM_INPUTS),
.TAG_SEL_IDX (TAG_SEL_IDX),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.TAG_WIDTH (TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH),
.WRITE_ENABLE (WRITE_ENABLE),
.CORE_OUT_REG (CORE_OUT_REG),
.MEM_OUT_REG (MEM_OUT_REG)
) cache (
`ifdef PERF_ENABLE
.cache_perf_if (perf_icache_if),
`endif
.clk (clk),
.reset (reset),
.core_bus_if (core_bus_if),
.mem_bus_if (mem_bus_if)
);
endmodule

152
hw/rtl/cache/VX_cache_data.sv vendored Normal file
View File

@@ -0,0 +1,152 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Data array of one cache bank.
//
// All ways of a line live side by side in one VX_sp_ram entry
// (DATAW = `CS_LINE_WIDTH * NUM_WAYS), laid out as [word][way][bits] so that
// the word select (wsel) is applied first and the way select last.
//   - fill : writes the complete line from fill_data into the way(s) flagged in way_sel.
//   - write: writes write_data into word wsel of the way(s) flagged in way_sel,
//            restricted to the bytes enabled in byteen.
//   - read_data is continuously driven from the RAM output: word wsel of the
//     way whose index is the one-hot encoding of way_sel.
// When WRITE_ENABLE == 0 and NUM_WAYS == 1 the byte enables collapse into a
// single write-enable bit that is simply `fill`.
module VX_cache_data #(
    parameter `STRING INSTANCE_ID= "",
    parameter BANK_ID       = 0,
    // Size of cache in bytes
    parameter CACHE_SIZE    = 1024,
    // Size of line inside a bank in bytes
    parameter LINE_SIZE     = 16,
    // Number of banks
    parameter NUM_BANKS     = 1,
    // Number of associative ways
    parameter NUM_WAYS      = 1,
    // Size of a word in bytes
    parameter WORD_SIZE     = 1,
    // Enable cache writeable
    parameter WRITE_ENABLE  = 1,
    // Request debug identifier
    parameter UUID_WIDTH    = 0
) (
    input wire                          clk,
    input wire                          reset,      // not used by this module

`IGNORE_UNUSED_BEGIN
    input wire[`UP(UUID_WIDTH)-1:0]     req_uuid,   // only referenced by the debug trace
`IGNORE_UNUSED_END

    input wire                          stall,      // only referenced by the debug trace
    input wire                          read,       // only referenced by the debug trace
    input wire                          fill,       // write a full line
    input wire                          write,      // write a single word (byte-masked)
    input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr, // low `CS_LINE_SEL_BITS bits index the RAM
    input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel,   // word index inside the line
    input wire [WORD_SIZE-1:0]          byteen,
    input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data,
    input wire [`CS_WORD_WIDTH-1:0]     write_data,
    input wire [NUM_WAYS-1:0]           way_sel,    // one-hot way selection

    output wire [`CS_WORD_WIDTH-1:0]    read_data
);
    `UNUSED_SPARAM (INSTANCE_ID)
    `UNUSED_PARAM (BANK_ID)
    `UNUSED_PARAM (WORD_SIZE)
    `UNUSED_VAR (reset)
    `UNUSED_VAR (line_addr)
    `UNUSED_VAR (read)

    // One write-enable bit per byte of every way, or a single bit in the
    // read-only direct-mapped configuration.
    localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1;

    wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] wdata;
    wire [BYTEENW-1:0] wren;

    if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
        reg [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] wdata_r;
        reg [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] wren_r;

        if (`CS_WORDS_PER_LINE > 1) begin
            always @(*) begin
                // broadcast the word to every slot; only slot wsel gets its bytes enabled
                wdata_r = {`CS_WORDS_PER_LINE{write_data}};
                wren_r  = '0;
                wren_r[wsel] = byteen;
            end
        end else begin
            // Single word per line: wsel is still 1 bit wide (`UP), so indexing
            // wren_r with it could fall out of range and silently drop the byte
            // enables. Drive the only word directly, mirroring the guard already
            // used on the read path below.
            always @(*) begin
                wdata_r = write_data;
                wren_r  = byteen;
            end
        end

        // order the data layout to perform ways multiplexing last
        // this allows performing onehot encoding of the way index in parallel with BRAM read.
        wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
        for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
            assign wdata[i] = fill ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{wdata_r[i]}};
            for (genvar j = 0; j < NUM_WAYS; ++j) begin
                // a fill enables every byte of the word; both fills and writes
                // are restricted to the selected way when NUM_WAYS > 1
                assign wren_w[i][j] = (fill ? {WORD_SIZE{1'b1}} : wren_r[i])
                                    & {WORD_SIZE{((NUM_WAYS == 1) || way_sel[j])}};
            end
        end
        assign wren = wren_w;
    end else begin
        `UNUSED_VAR (write)
        `UNUSED_VAR (byteen)
        `UNUSED_VAR (write_data)
        assign wdata = fill_data;
        assign wren  = fill;
    end

    // binary index of the selected way, used to pick the read data
    wire [`CLOG2(NUM_WAYS)-1:0] way_idx;

    VX_onehot_encoder #(
        .N (NUM_WAYS)
    ) way_enc (
        .data_in  (way_sel),
        .data_out (way_idx),
        `UNUSED_PIN (valid_out)
    );

    wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] rdata;

    wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];

    VX_sp_ram #(
        .DATAW (`CS_LINE_WIDTH * NUM_WAYS),
        .SIZE  (`CS_LINES_PER_BANK),
        .WRENW (BYTEENW),
        .NO_RWCHECK (1)
    ) data_store (
        .clk   (clk),
        .read  (1'b1),
        .write (write || fill),
        .wren  (wren),
        .addr  (line_sel),
        .wdata (wdata),
        .rdata (rdata)
    );

    // word select first ...
    wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata;

    if (`CS_WORDS_PER_LINE > 1) begin
        assign per_way_rdata = rdata[wsel];
    end else begin
        `UNUSED_VAR (wsel)
        assign per_way_rdata = rdata;
    end

    // ... way select last
    assign read_data = per_way_rdata[way_idx];

    `UNUSED_VAR (stall)

`ifdef DBG_TRACE_CACHE_DATA
    always @(posedge clk) begin
        if (fill && ~stall) begin
            `TRACE(3, ("%d: %s-bank%0d data-fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
        end
        if (read && ~stall) begin
            `TRACE(3, ("%d: %s-bank%0d data-read: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, read_data, req_uuid));
        end
        if (write && ~stall) begin
            `TRACE(3, ("%d: %s-bank%0d data-write: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, byteen, write_data, req_uuid));
        end
    end
`endif

endmodule

View File

@@ -1,72 +1,65 @@
`ifndef VX_CACHE_DEFINE
`define VX_CACHE_DEFINE
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`ifndef VX_CACHE_DEFINE_VH
`define VX_CACHE_DEFINE_VH
// cache request identifier
`define DBG_CACHE_REQ_IDW 44
`include "VX_define.vh"
`define REQS_BITS `LOG2UP(NUM_REQS)
`define CS_REQ_SEL_BITS `CLOG2(NUM_REQS)
`define PORTS_BITS `LOG2UP(NUM_PORTS)
`define CS_WORD_WIDTH (8 * WORD_SIZE)
`define CS_LINE_WIDTH (8 * LINE_SIZE)
`define CS_BANK_SIZE (CACHE_SIZE / NUM_BANKS)
`define CS_WAY_SEL_BITS `CLOG2(NUM_WAYS)
// tag valid tid word_sel
`define MSHR_DATA_WIDTH ((CORE_TAG_WIDTH + 1 + `REQS_BITS + `UP(`WORD_SELECT_BITS)) * NUM_PORTS)
`define CS_LINES_PER_BANK (`CS_BANK_SIZE / (LINE_SIZE * NUM_WAYS))
`define CS_WORDS_PER_LINE (LINE_SIZE / WORD_SIZE)
`define WORD_WIDTH (8 * WORD_SIZE)
`define CACHE_LINE_WIDTH (8 * CACHE_LINE_SIZE)
`define BANK_SIZE (CACHE_SIZE / NUM_BANKS)
`define LINES_PER_BANK (`BANK_SIZE / CACHE_LINE_SIZE)
`define WORDS_PER_LINE (CACHE_LINE_SIZE / WORD_SIZE)
`define WORD_ADDR_WIDTH (32-`CLOG2(WORD_SIZE))
`define MEM_ADDR_WIDTH (32-`CLOG2(CACHE_LINE_SIZE))
`define LINE_ADDR_WIDTH (`MEM_ADDR_WIDTH-`CLOG2(NUM_BANKS))
`define CS_WORD_ADDR_WIDTH (`MEM_ADDR_WIDTH-`CLOG2(WORD_SIZE))
`define CS_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH-`CLOG2(LINE_SIZE))
`define CS_LINE_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH-`CLOG2(NUM_BANKS))
// Word select
`define WORD_SELECT_BITS `CLOG2(`WORDS_PER_LINE)
`define WORD_SELECT_ADDR_START 0
`define WORD_SELECT_ADDR_END (`WORD_SELECT_ADDR_START+`WORD_SELECT_BITS-1)
`define CS_WORD_SEL_BITS `CLOG2(`CS_WORDS_PER_LINE)
`define CS_WORD_SEL_ADDR_START 0
`define CS_WORD_SEL_ADDR_END (`CS_WORD_SEL_ADDR_START+`CS_WORD_SEL_BITS-1)
// Bank select
`define BANK_SELECT_BITS `CLOG2(NUM_BANKS)
`define BANK_SELECT_ADDR_START (1+`WORD_SELECT_ADDR_END+BANK_ADDR_OFFSET)
`define BANK_SELECT_ADDR_END (`BANK_SELECT_ADDR_START+`BANK_SELECT_BITS-1)
`define CS_BANK_SEL_BITS `CLOG2(NUM_BANKS)
`define CS_BANK_SEL_ADDR_START (1+`CS_WORD_SEL_ADDR_END)
`define CS_BANK_SEL_ADDR_END (`CS_BANK_SEL_ADDR_START+`CS_BANK_SEL_BITS-1)
// Line select
`define LINE_SELECT_BITS `CLOG2(`LINES_PER_BANK)
`define LINE_SELECT_ADDR_START (1+`BANK_SELECT_ADDR_END)
`define LINE_SELECT_ADDR_END (`LINE_SELECT_ADDR_START-BANK_ADDR_OFFSET+`LINE_SELECT_BITS-1)
`define CS_LINE_SEL_BITS `CLOG2(`CS_LINES_PER_BANK)
`define CS_LINE_SEL_ADDR_START (1+`CS_BANK_SEL_ADDR_END)
`define CS_LINE_SEL_ADDR_END (`CS_LINE_SEL_ADDR_START+`CS_LINE_SEL_BITS-1)
// Tag select
`define TAG_SELECT_BITS (`WORD_ADDR_WIDTH-1-`LINE_SELECT_ADDR_END)
`define TAG_SELECT_ADDR_START (1+`LINE_SELECT_ADDR_END)
`define TAG_SELECT_ADDR_END (`WORD_ADDR_WIDTH-1)
`define CS_TAG_SEL_BITS (`CS_WORD_ADDR_WIDTH-1-`CS_LINE_SEL_ADDR_END)
`define CS_TAG_SEL_ADDR_START (1+`CS_LINE_SEL_ADDR_END)
`define CS_TAG_SEL_ADDR_END (`CS_WORD_ADDR_WIDTH-1)
`define SELECT_BANK_ID(x) x[`BANK_SELECT_ADDR_END : `BANK_SELECT_ADDR_START]
`define SELECT_LINE_ADDR0(x) x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START]
`define SELECT_LINE_ADDRX(x) {x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START], x[`BANK_SELECT_ADDR_START-1 : 1+`WORD_SELECT_ADDR_END]}
`define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS]
`define CACHE_REQ_ID_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_IDW)
`define CS_LINE_TAG_ADDR(x) x[`CS_LINE_ADDR_WIDTH-1 : `CS_LINE_SEL_BITS]
///////////////////////////////////////////////////////////////////////////////
`define CORE_RSP_TAGS ((CORE_TAG_ID_BITS != 0) ? 1 : NUM_REQS)
`define CS_LINE_TO_MEM_ADDR(x, i) {x, `CS_BANK_SEL_BITS'(i)}
`define CS_MEM_ADDR_TO_BANK_ID(x) x[0 +: `CS_BANK_SEL_BITS]
`define CS_MEM_TAG_TO_REQ_ID(x) x[MSHR_ADDR_WIDTH-1:0]
`define CS_MEM_TAG_TO_BANK_ID(x) x[MSHR_ADDR_WIDTH +: `CS_BANK_SEL_BITS]
`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
`define MEM_ADDR_TO_BANK_ID(x) x[0 +: `BANK_SELECT_BITS]
`define MEM_TAG_TO_REQ_ID(x) x[MSHR_ADDR_WIDTH-1:0]
`define MEM_TAG_TO_BANK_ID(x) x[MSHR_ADDR_WIDTH +: `BANK_SELECT_BITS]
`define LINE_TO_BYTE_ADDR(x, i) {x, (32-$bits(x))'(i << (32-$bits(x)-`BANK_SELECT_BITS))}
`define TO_FULL_ADDR(x) {x, (32-$bits(x))'(0)}
`endif
`endif // VX_CACHE_DEFINE_VH

51
hw/rtl/cache/VX_cache_init.sv vendored Normal file
View File

@@ -0,0 +1,51 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Post-reset line sweeper.
// After reset, valid_out stays high while addr_out steps through every line
// index of a bank (0 .. 2**`CS_LINE_SEL_BITS - 1), one index per clock.
// valid_out drops once the last index has been presented.
module VX_cache_init #(
    // Size of cache in bytes
    parameter CACHE_SIZE = 1024,
    // Size of line inside a bank in bytes
    parameter LINE_SIZE  = 16,
    // Number of banks
    parameter NUM_BANKS  = 1,
    // Number of associative ways
    parameter NUM_WAYS   = 1
) (
    input  wire                         clk,
    input  wire                         reset,
    output wire [`CS_LINE_SEL_BITS-1:0] addr_out,
    output wire                         valid_out
);
    reg                         sweep_active;
    reg [`CS_LINE_SEL_BITS-1:0] sweep_idx;

    // high while the final line index is being presented
    wire sweep_last = (sweep_idx == ((2 ** `CS_LINE_SEL_BITS)-1));

    always @(posedge clk) begin
        if (reset) begin
            sweep_idx    <= '0;
            sweep_active <= 1;
        end else if (sweep_active) begin
            // the index keeps counting (and wraps to zero) on the final step
            sweep_idx <= sweep_idx + `CS_LINE_SEL_BITS'(1);
            if (sweep_last) begin
                sweep_active <= 0;
            end
        end
    end

    assign valid_out = sweep_active;
    assign addr_out  = sweep_idx;

endmodule

271
hw/rtl/cache/VX_cache_mshr.sv vendored Normal file
View File

@@ -0,0 +1,271 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// this is an implementation of a pipelined multi-banked cache
// we allocate a free slot from the MSHR before processing a core request
// and release the slot when we get a cache hit.
// during a memory fill response we initiate the replay sequence
// and dequeue all associated pending entries.
// Warning: This MSHR implementation is strongly coupled with the bank pipeline,
// and as such, changes to either module require careful evaluation.
// This implementation makes the following assumptions:
// (1) two-cycle pipeline: st0 and st1.
// (2) core request flow: st0: allocate / lookup, st1: finalize.
// (3) the first dequeue after the fill should happen in st0, when the fill is in st1
// this is enforced inside the bank by "rdw_hazard_st0".
// Miss Status Holding Register (MSHR) table of one cache bank.
// Each of the MSHR_SIZE entries records a line address, a read/write flag,
// a DATA_WIDTH-bit payload (kept in the "entries" RAM) and an optional link
// to another entry (next_table / next_index), which chains entries together.
// Interfaces: allocate (reserve an entry), lookup (match entries by address),
// fill (start draining the chain that begins at fill_id), dequeue (pop the
// entries of that chain one by one) and finalize (release and/or link an entry).
module VX_cache_mshr #(
parameter `STRING INSTANCE_ID= "",
parameter BANK_ID = 0,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 1,
// Miss Reservation Queue size (number of MSHR entries)
parameter MSHR_SIZE = 4,
// Request debug identifier
parameter UUID_WIDTH = 0,
// MSHR parameters
// DATA_WIDTH: width of the per-entry payload stored in the "entries" RAM
parameter DATA_WIDTH = 1,
// MSHR_ADDR_WIDTH: width of an entry index
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE)
) (
input wire clk,
input wire reset,
// request identifiers, only referenced by the assertion and trace messages
`IGNORE_UNUSED_BEGIN
input wire[`UP(UUID_WIDTH)-1:0] deq_req_uuid,
input wire[`UP(UUID_WIDTH)-1:0] lkp_req_uuid,
input wire[`UP(UUID_WIDTH)-1:0] fin_req_uuid,
`IGNORE_UNUSED_END
// allocate
// allocate_id is the slot that will be taken; allocate_tail is the index of the
// entry that matches lookup_addr and has no successor yet (see tail_sel below)
input wire allocate_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] allocate_addr,
input wire allocate_rw,
input wire [DATA_WIDTH-1:0] allocate_data,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_id,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_tail,
output wire allocate_ready,
// lookup
// lookup_matches flags valid non-write entries whose address equals lookup_addr
input wire lookup_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr,
output wire [MSHR_SIZE-1:0] lookup_matches,
// memory fill
// fill_addr returns the line address stored in entry fill_id
input wire fill_valid,
input wire [MSHR_ADDR_WIDTH-1:0] fill_id,
output wire [`CS_LINE_ADDR_WIDTH-1:0] fill_addr,
// dequeue
output wire dequeue_valid,
output wire [`CS_LINE_ADDR_WIDTH-1:0] dequeue_addr,
output wire dequeue_rw,
output wire [DATA_WIDTH-1:0] dequeue_data,
output wire [MSHR_ADDR_WIDTH-1:0] dequeue_id,
input wire dequeue_ready,
// finalize
// finalize_release frees entry finalize_id;
// finalize_pending links entry finalize_tail to finalize_id
input wire finalize_valid,
input wire finalize_release,
input wire finalize_pending,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_id,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_tail
);
`UNUSED_PARAM (BANK_ID)
// per-entry state: line address, successor index, valid bit,
// "has successor" bit and read/write flag
reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [MSHR_SIZE-1:0];
reg [MSHR_ADDR_WIDTH-1:0] next_index [MSHR_SIZE-1:0];
reg [MSHR_SIZE-1:0] valid_table, valid_table_n;
reg [MSHR_SIZE-1:0] next_table, next_table_x, next_table_n;
reg [MSHR_SIZE-1:0] write_table;
// registered allocation / dequeue state (*_n are the next-cycle values)
reg allocate_rdy, allocate_rdy_n;
reg [MSHR_ADDR_WIDTH-1:0] allocate_id_r, allocate_id_n;
reg dequeue_val, dequeue_val_n;
reg [MSHR_ADDR_WIDTH-1:0] dequeue_id_r, dequeue_id_n;
wire [MSHR_ADDR_WIDTH-1:0] tail_idx;
wire allocate_fire = allocate_valid && allocate_ready;
wire dequeue_fire = dequeue_valid && dequeue_ready;
// valid entries holding the looked-up line address
wire [MSHR_SIZE-1:0] addr_matches;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign addr_matches[i] = valid_table[i] && (addr_table[i] == lookup_addr);
end
// Free-slot selection for the next cycle, computed from the next-state valid
// bits so it already accounts for this cycle's allocate/release/dequeue.
// NOTE(review): presumably VX_lzc returns the index of a set bit of data_in
// and valid_out indicates that at least one bit is set - confirm against VX_lzc.
VX_lzc #(
.N (MSHR_SIZE),
.REVERSE (1)
) allocate_sel (
.data_in (~valid_table_n),
.data_out (allocate_id_n),
.valid_out (allocate_rdy_n)
);
// Tail of the chain for lookup_addr: the matching entry without a successor.
// next_table_x is used so that a link set by a finalize in this same cycle
// is already taken into account.
VX_onehot_encoder #(
.N (MSHR_SIZE)
) tail_sel (
.data_in (addr_matches & ~next_table_x),
.data_out (tail_idx),
`UNUSED_PIN (valid_out)
);
// Next-state logic. Statement order matters: later assignments override
// earlier ones when several events occur in the same cycle.
always @(*) begin
valid_table_n = valid_table;
next_table_x = next_table;
dequeue_val_n = dequeue_val;
dequeue_id_n = dequeue_id;
// a fill starts a dequeue sequence at the filled entry
if (fill_valid) begin
dequeue_val_n = 1;
dequeue_id_n = fill_id;
end
// a completed dequeue frees the entry and follows its link, if any;
// without a link the dequeue sequence ends
if (dequeue_fire) begin
valid_table_n[dequeue_id] = 0;
if (next_table[dequeue_id]) begin
dequeue_id_n = next_index[dequeue_id];
end else begin
dequeue_val_n = 0;
end
end
if (finalize_valid) begin
if (finalize_release) begin
valid_table_n[finalize_id] = 0;
end
if (finalize_pending) begin
next_table_x[finalize_tail] = 1;
end
end
// allocation is applied last: the new entry becomes valid with no successor
next_table_n = next_table_x;
if (allocate_fire) begin
valid_table_n[allocate_id] = 1;
next_table_n[allocate_id] = 0;
end
end
// State registers. Only valid_table, allocate_rdy and dequeue_val are reset;
// the tables and index registers are written without a reset value.
always @(posedge clk) begin
if (reset) begin
valid_table <= '0;
allocate_rdy <= 0;
dequeue_val <= 0;
end else begin
valid_table <= valid_table_n;
allocate_rdy <= allocate_rdy_n;
dequeue_val <= dequeue_val_n;
end
if (allocate_fire) begin
addr_table[allocate_id] <= allocate_addr;
write_table[allocate_id] <= allocate_rw;
end
// record the link from the chain tail to the finalized entry
if (finalize_valid && finalize_pending) begin
next_index[finalize_tail] <= finalize_id;
end
dequeue_id_r <= dequeue_id_n;
allocate_id_r <= allocate_id_n;
next_table <= next_table_n;
end
// sanity checks: never allocate a slot in use, never finalize or fill an invalid slot
`RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s-bank%0d inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, lkp_req_uuid))
`RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s-bank%0d invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid))
`RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s-bank%0d invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id))
// Per-entry payload storage, written at the allocation slot and read at the dequeue slot.
// NOTE(review): the write strobe is allocate_valid, not allocate_fire, so the RAM is
// also written while allocate_ready is low - presumably the bank never asserts
// allocate_valid in that situation; confirm against the bank pipeline.
VX_dp_ram #(
.DATAW (DATA_WIDTH),
.SIZE (MSHR_SIZE),
.LUTRAM (1)
) entries (
.clk (clk),
.read (1'b1),
.write (allocate_valid),
`UNUSED_PIN (wren),
.waddr (allocate_id_r),
.wdata (allocate_data),
.raddr (dequeue_id_r),
.rdata (dequeue_data)
);
// output assignments
assign fill_addr = addr_table[fill_id];
assign allocate_ready = allocate_rdy;
assign allocate_id = allocate_id_r;
assign allocate_tail = tail_idx;
assign dequeue_valid = dequeue_val;
assign dequeue_addr = addr_table[dequeue_id_r];
assign dequeue_rw = write_table[dequeue_id_r];
assign dequeue_id = dequeue_id_r;
// entries flagged as writes are excluded from the lookup result
assign lookup_matches = addr_matches & ~write_table;
`UNUSED_VAR (lookup_valid)
// optional debug trace: logs each MSHR event and dumps the table one cycle after any event
`ifdef DBG_TRACE_CACHE_MSHR
reg show_table;
always @(posedge clk) begin
if (reset) begin
show_table <= 0;
end else begin
show_table <= allocate_fire || lookup_valid || finalize_valid || fill_valid || dequeue_fire;
end
if (allocate_fire)
`TRACE(3, ("%d: %s-bank%0d mshr-allocate: addr=0x%0h, tail=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_tail, allocate_id, lkp_req_uuid));
if (lookup_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_matches, lkp_req_uuid));
if (finalize_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-finalize release=%b, pending=%b, tail=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
finalize_release, finalize_pending, finalize_tail, finalize_id, fin_req_uuid));
if (fill_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id));
if (dequeue_fire)
`TRACE(3, ("%d: %s-bank%0d mshr-dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid));
// table dump: one item per valid entry: index=address, (w)/(r), optional ->successor
if (show_table) begin
`TRACE(3, ("%d: %s-bank%0d mshr-table", $time, INSTANCE_ID, BANK_ID));
for (integer i = 0; i < MSHR_SIZE; ++i) begin
if (valid_table[i]) begin
`TRACE(3, (" %0d=0x%0h", i, `CS_LINE_TO_FULL_ADDR(addr_table[i], BANK_ID)));
if (write_table[i])
`TRACE(3, ("(w)"));
else
`TRACE(3, ("(r)"));
if (next_table[i])
`TRACE(3, ("->%0d", next_index[i]));
end
end
`TRACE(3, ("\n"));
end
end
`endif
endmodule

View File

@@ -1,9 +1,19 @@
`ifndef VX_PERF_CACHE_IF
`define VX_PERF_CACHE_IF
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
interface VX_perf_cache_if ();
interface VX_cache_perf_if ();
wire [`PERF_CTR_BITS-1:0] reads;
wire [`PERF_CTR_BITS-1:0] writes;
@@ -37,5 +47,3 @@ interface VX_perf_cache_if ();
);
endinterface
`endif

116
hw/rtl/cache/VX_cache_tags.sv vendored Normal file
View File

@@ -0,0 +1,116 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Tag array of one cache bank.
// One VX_sp_ram per way stores {valid, tag} for every line of the bank.
//   - tag_matches[i] is set when way i holds a valid entry whose tag equals
//     the tag portion of line_addr.
//   - way_sel is the one-hot way written by a fill; with several ways it
//     follows a rotating replacement pointer.
//   - init writes an invalid entry (valid bit = 0) to the addressed line in
//     every way.
module VX_cache_tags #(
    parameter `STRING INSTANCE_ID = "",
    parameter BANK_ID       = 0,
    // Size of cache in bytes
    parameter CACHE_SIZE    = 1024,
    // Size of line inside a bank in bytes
    parameter LINE_SIZE     = 16,
    // Number of banks
    parameter NUM_BANKS     = 1,
    // Number of associative ways
    parameter NUM_WAYS      = 1,
    // Size of a word in bytes
    parameter WORD_SIZE     = 1,
    // Request debug identifier
    parameter UUID_WIDTH    = 0
) (
    input wire                          clk,
    input wire                          reset,

`IGNORE_UNUSED_BEGIN
    input wire [`UP(UUID_WIDTH)-1:0]    req_uuid,
`IGNORE_UNUSED_END

    input wire                          stall,

    // read/fill
    input wire                          lookup,
    input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
    input wire                          fill,
    input wire                          init,
    output wire [NUM_WAYS-1:0]          way_sel,
    output wire [NUM_WAYS-1:0]          tag_matches
);
    `UNUSED_SPARAM (INSTANCE_ID)
    `UNUSED_PARAM (BANK_ID)
    `UNUSED_VAR (reset)
    `UNUSED_VAR (lookup)

    // stored entry = valid bit + tag
    localparam TAG_WIDTH = 1 + `CS_TAG_SEL_BITS;

    // split the line address into RAM index and tag
    wire [`CS_LINE_SEL_BITS-1:0] set_index = line_addr[`CS_LINE_SEL_BITS-1:0];
    wire [`CS_TAG_SEL_BITS-1:0]  addr_tag  = `CS_LINE_TAG_ADDR(line_addr);

    // entry written on fill/init: the valid bit is cleared during init
    wire [TAG_WIDTH-1:0] store_wdata = {~init, addr_tag};

    if (NUM_WAYS > 1) begin
        // one-hot replacement pointer, rotated left by one position each
        // non-stalled cycle
        reg [NUM_WAYS-1:0] repl_way;
        wire [NUM_WAYS-1:0] repl_way_rot = {repl_way[NUM_WAYS-2:0], repl_way[NUM_WAYS-1]};

        always @(posedge clk) begin
            if (reset) begin
                repl_way <= NUM_WAYS'(1);
            end else if (!stall) begin
                // the pointer is held during stalls so that a stalled fill
                // keeps targeting the same way
                repl_way <= repl_way_rot;
            end
        end

        // the pointer only selects a way while a fill is in progress
        assign way_sel = {NUM_WAYS{fill}} & repl_way;
    end else begin
        `UNUSED_VAR (stall)
        assign way_sel = fill;
    end

    for (genvar i = 0; i < NUM_WAYS; ++i) begin
        wire [TAG_WIDTH-1:0] store_rdata;
        wire read_valid = store_rdata[TAG_WIDTH-1];
        wire [`CS_TAG_SEL_BITS-1:0] read_tag = store_rdata[`CS_TAG_SEL_BITS-1:0];

        VX_sp_ram #(
            .DATAW (TAG_WIDTH),
            .SIZE  (`CS_LINES_PER_BANK),
            .NO_RWCHECK (1)
        ) tag_store (
            .clk   (clk),
            .read  (1'b1),
            .write (way_sel[i] || init),
            `UNUSED_PIN (wren),
            .addr  (set_index),
            .wdata (store_wdata),
            .rdata (store_rdata)
        );

        assign tag_matches[i] = (read_tag == addr_tag) && read_valid;
    end

`ifdef DBG_TRACE_CACHE_TAG
    always @(posedge clk) begin
        if (fill && ~stall) begin
            `TRACE(3, ("%d: %s-bank%0d tag-fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, set_index, addr_tag));
        end
        if (init) begin
            `TRACE(3, ("%d: %s-bank%0d tag-init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), set_index));
        end
        if (lookup && ~stall) begin
            if (|tag_matches) begin
                `TRACE(3, ("%d: %s-bank%0d tag-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, set_index, addr_tag, req_uuid));
            end else begin
                `TRACE(3, ("%d: %s-bank%0d tag-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), set_index, addr_tag, req_uuid));
            end
        end
    end
`endif

endmodule

501
hw/rtl/cache/VX_cache_wrap.sv vendored Normal file
View File

@@ -0,0 +1,501 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
module VX_cache_wrap #(
parameter `STRING INSTANCE_ID = "",
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 4096,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// enable bypass for non-cacheable addresses
parameter NC_TAG_BIT = 0,
parameter NC_ENABLE = 0,
// Force bypass for all requests
parameter PASSTHRU = 0,
// Core response output register
parameter CORE_OUT_REG = 0,
// Memory request output register
parameter MEM_OUT_REG = 0
) (
input wire clk,
input wire reset,
// PERF
`ifdef PERF_ENABLE
VX_cache_perf_if.master cache_perf_if,
`endif
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
VX_mem_bus_if.master mem_bus_if
);
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid parameter"))
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam CORE_TAG_X_WIDTH = TAG_WIDTH - NC_ENABLE;
localparam MEM_TAG_X_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH)) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
localparam NC_BYPASS = (NC_ENABLE || PASSTHRU);
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus_if[i].req_valid;
assign core_req_rw[i] = core_bus_if[i].req_data.rw;
assign core_req_addr[i] = core_bus_if[i].req_data.addr;
assign core_req_byteen[i] = core_bus_if[i].req_data.byteen;
assign core_req_data[i] = core_bus_if[i].req_data.data;
assign core_req_tag[i] = core_bus_if[i].req_data.tag;
assign core_bus_if[i].req_ready = core_req_ready[i];
end
///////////////////////////////////////////////////////////////////////////
// Core response buffering
wire [NUM_REQS-1:0] core_rsp_valid_s;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
for (genvar i = 0; i < NUM_REQS; ++i) begin
`RESET_RELAY (core_rsp_reset, reset);
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE ((NC_BYPASS && !DIRECT_PASSTHRU) ? `OUT_REG_TO_EB_SIZE(CORE_OUT_REG) : 0),
.OUT_REG (`OUT_REG_TO_EB_REG(CORE_OUT_REG))
) core_rsp_buf (
.clk (clk),
.reset (core_rsp_reset),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag}),
.valid_out (core_bus_if[i].rsp_valid),
.ready_out (core_bus_if[i].rsp_ready)
);
end
///////////////////////////////////////////////////////////////////////////
// Memory request buffering
wire mem_req_valid_s;
wire mem_req_rw_s;
wire [LINE_SIZE-1:0] mem_req_byteen_s;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_s;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire mem_req_ready_s;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH),
.SIZE ((NC_BYPASS && !DIRECT_PASSTHRU) ? `OUT_REG_TO_EB_SIZE(MEM_OUT_REG) : 0),
.OUT_REG (`OUT_REG_TO_EB_REG(MEM_OUT_REG))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag}),
.valid_out (mem_bus_if.req_valid),
.ready_out (mem_bus_if.req_ready)
);
///////////////////////////////////////////////////////////////////////////
// Core request
wire [NUM_REQS-1:0] core_req_valid_b;
wire [NUM_REQS-1:0] core_req_rw_b;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr_b;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_b;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data_b;
wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_b;
wire [NUM_REQS-1:0] core_req_ready_b;
// Core response
wire [NUM_REQS-1:0] core_rsp_valid_b;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_b;
wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_b;
wire [NUM_REQS-1:0] core_rsp_ready_b;
// Memory request
wire mem_req_valid_b;
wire mem_req_rw_b;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_b;
wire [LINE_SIZE-1:0] mem_req_byteen_b;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_b;
wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag_b;
wire mem_req_ready_b;
// Memory response
wire mem_rsp_valid_b;
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_b;
wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag_b;
wire mem_rsp_ready_b;
if (NC_BYPASS) begin
`RESET_RELAY (nc_bypass_reset, reset);
VX_cache_bypass #(
.NUM_REQS (NUM_REQS),
.NC_TAG_BIT (NC_TAG_BIT),
.NC_ENABLE (NC_ENABLE),
.PASSTHRU (PASSTHRU),
.CORE_ADDR_WIDTH (`CS_WORD_ADDR_WIDTH),
.CORE_DATA_SIZE (WORD_SIZE),
.CORE_TAG_IN_WIDTH (TAG_WIDTH),
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
.MEM_DATA_SIZE (LINE_SIZE),
.MEM_TAG_IN_WIDTH (MEM_TAG_X_WIDTH),
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH)
) cache_bypass (
.clk (clk),
.reset (nc_bypass_reset),
// Core request in
.core_req_valid_in (core_req_valid),
.core_req_rw_in (core_req_rw),
.core_req_byteen_in (core_req_byteen),
.core_req_addr_in (core_req_addr),
.core_req_data_in (core_req_data),
.core_req_tag_in (core_req_tag),
.core_req_ready_in (core_req_ready),
// Core request out
.core_req_valid_out (core_req_valid_b),
.core_req_rw_out (core_req_rw_b),
.core_req_byteen_out(core_req_byteen_b),
.core_req_addr_out (core_req_addr_b),
.core_req_data_out (core_req_data_b),
.core_req_tag_out (core_req_tag_b),
.core_req_ready_out (core_req_ready_b),
// Core response in
.core_rsp_valid_in (core_rsp_valid_b),
.core_rsp_data_in (core_rsp_data_b),
.core_rsp_tag_in (core_rsp_tag_b),
.core_rsp_ready_in (core_rsp_ready_b),
// Core response out
.core_rsp_valid_out (core_rsp_valid_s),
.core_rsp_data_out (core_rsp_data_s),
.core_rsp_tag_out (core_rsp_tag_s),
.core_rsp_ready_out (core_rsp_ready_s),
// Memory request in
.mem_req_valid_in (mem_req_valid_b),
.mem_req_rw_in (mem_req_rw_b),
.mem_req_addr_in (mem_req_addr_b),
.mem_req_byteen_in (mem_req_byteen_b),
.mem_req_data_in (mem_req_data_b),
.mem_req_tag_in (mem_req_tag_b),
.mem_req_ready_in (mem_req_ready_b),
// Memory request out
.mem_req_valid_out (mem_req_valid_s),
.mem_req_addr_out (mem_req_addr_s),
.mem_req_rw_out (mem_req_rw_s),
.mem_req_byteen_out (mem_req_byteen_s),
.mem_req_data_out (mem_req_data_s),
.mem_req_tag_out (mem_req_tag_s),
.mem_req_ready_out (mem_req_ready_s),
// Memory response in
.mem_rsp_valid_in (mem_bus_if.rsp_valid),
.mem_rsp_data_in (mem_bus_if.rsp_data.data),
.mem_rsp_tag_in (mem_bus_if.rsp_data.tag),
.mem_rsp_ready_in (mem_bus_if.rsp_ready),
// Memory response out
.mem_rsp_valid_out (mem_rsp_valid_b),
.mem_rsp_data_out (mem_rsp_data_b),
.mem_rsp_tag_out (mem_rsp_tag_b),
.mem_rsp_ready_out (mem_rsp_ready_b)
);
end else begin
assign core_req_valid_b = core_req_valid;
assign core_req_rw_b = core_req_rw;
assign core_req_addr_b = core_req_addr;
assign core_req_byteen_b= core_req_byteen;
assign core_req_data_b = core_req_data;
assign core_req_tag_b = core_req_tag;
assign core_req_ready = core_req_ready_b;
assign core_rsp_valid_s = core_rsp_valid_b;
assign core_rsp_data_s = core_rsp_data_b;
assign core_rsp_tag_s = core_rsp_tag_b;
assign core_rsp_ready_b = core_rsp_ready_s;
assign mem_req_valid_s = mem_req_valid_b;
assign mem_req_addr_s = mem_req_addr_b;
assign mem_req_rw_s = mem_req_rw_b;
assign mem_req_byteen_s = mem_req_byteen_b;
assign mem_req_data_s = mem_req_data_b;
assign mem_req_ready_b = mem_req_ready_s;
// Add explicit NC=0 flag to the memory request tag
VX_bits_insert #(
.N (MEM_TAG_WIDTH-1),
.POS (NC_TAG_BIT)
) mem_req_tag_insert (
.data_in (mem_req_tag_b),
.sel_in (1'b0),
.data_out (mem_req_tag_s)
);
assign mem_rsp_valid_b = mem_bus_if.rsp_valid;
assign mem_rsp_data_b = mem_bus_if.rsp_data.data;
assign mem_bus_if.rsp_ready = mem_rsp_ready_b;
// Remove NC flag from the memory response tag
VX_bits_remove #(
.N (MEM_TAG_WIDTH),
.POS (NC_TAG_BIT)
) mem_rsp_tag_remove (
.data_in (mem_bus_if.rsp_data.tag),
.data_out (mem_rsp_tag_b)
);
end
if (PASSTHRU != 0) begin
`UNUSED_VAR (core_req_valid_b)
`UNUSED_VAR (core_req_rw_b)
`UNUSED_VAR (core_req_addr_b)
`UNUSED_VAR (core_req_byteen_b)
`UNUSED_VAR (core_req_data_b)
`UNUSED_VAR (core_req_tag_b)
assign core_req_ready_b = '0;
assign core_rsp_valid_b = '0;
assign core_rsp_data_b = '0;
assign core_rsp_tag_b = '0;
`UNUSED_VAR (core_rsp_ready_b)
assign mem_req_valid_b = 0;
assign mem_req_addr_b = '0;
assign mem_req_rw_b = '0;
assign mem_req_byteen_b = '0;
assign mem_req_data_b = '0;
assign mem_req_tag_b = '0;
`UNUSED_VAR (mem_req_ready_b)
`UNUSED_VAR (mem_rsp_valid_b)
`UNUSED_VAR (mem_rsp_data_b)
`UNUSED_VAR (mem_rsp_tag_b)
assign mem_rsp_ready_b = 0;
`ifdef PERF_ENABLE
assign cache_perf_if.reads = '0;
assign cache_perf_if.writes = '0;
assign cache_perf_if.read_misses = '0;
assign cache_perf_if.write_misses = '0;
assign cache_perf_if.bank_stalls = '0;
assign cache_perf_if.mshr_stalls = '0;
assign cache_perf_if.mem_stalls = '0;
assign cache_perf_if.crsp_stalls = '0;
`endif
end else begin
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_X_WIDTH)
) core_bus_wrap_if[NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_X_WIDTH)
) mem_bus_wrap_if();
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_wrap_if[i].req_valid = core_req_valid_b[i];
assign core_bus_wrap_if[i].req_data.rw = core_req_rw_b[i];
assign core_bus_wrap_if[i].req_data.addr = core_req_addr_b[i];
assign core_bus_wrap_if[i].req_data.byteen = core_req_byteen_b[i];
assign core_bus_wrap_if[i].req_data.data = core_req_data_b[i];
assign core_bus_wrap_if[i].req_data.tag = core_req_tag_b[i];
assign core_req_ready_b[i] = core_bus_wrap_if[i].req_ready;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_valid_b[i] = core_bus_wrap_if[i].rsp_valid;
assign core_rsp_data_b[i] = core_bus_wrap_if[i].rsp_data.data;
assign core_rsp_tag_b[i] = core_bus_wrap_if[i].rsp_data.tag;
assign core_bus_wrap_if[i].rsp_ready = core_rsp_ready_b[i];
end
assign mem_req_valid_b = mem_bus_wrap_if.req_valid;
assign mem_req_addr_b = mem_bus_wrap_if.req_data.addr;
assign mem_req_rw_b = mem_bus_wrap_if.req_data.rw;
assign mem_req_byteen_b = mem_bus_wrap_if.req_data.byteen;
assign mem_req_data_b = mem_bus_wrap_if.req_data.data;
assign mem_req_tag_b = mem_bus_wrap_if.req_data.tag;
assign mem_bus_wrap_if.req_ready = mem_req_ready_b;
assign mem_bus_wrap_if.rsp_valid = mem_rsp_valid_b;
assign mem_bus_wrap_if.rsp_data.data = mem_rsp_data_b;
assign mem_bus_wrap_if.rsp_data.tag = mem_rsp_tag_b;
assign mem_rsp_ready_b = mem_bus_wrap_if.rsp_ready;
`RESET_RELAY (cache_reset, reset);
VX_cache #(
.INSTANCE_ID (INSTANCE_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (CORE_TAG_X_WIDTH),
.CORE_OUT_REG (NC_BYPASS ? 1 : CORE_OUT_REG),
.MEM_OUT_REG (NC_BYPASS ? 1 : MEM_OUT_REG)
) cache (
.clk (clk),
.reset (cache_reset),
`ifdef PERF_ENABLE
.cache_perf_if (cache_perf_if),
`endif
.core_bus_if (core_bus_wrap_if),
.mem_bus_if (mem_bus_wrap_if)
);
end
`ifdef DBG_TRACE_CACHE_BANK
for (genvar i = 0; i < NUM_REQS; ++i) begin
wire [`UP(UUID_WIDTH)-1:0] core_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] core_rsp_uuid;
if (UUID_WIDTH != 0) begin
assign core_req_uuid = core_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign core_rsp_uuid = core_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign core_req_uuid = 0;
assign core_rsp_uuid = 0;
end
wire core_req_fire = core_bus_if[i].req_valid && core_bus_if[i].req_ready;
wire core_rsp_fire = core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready;
always @(posedge clk) begin
if (core_req_fire) begin
if (core_bus_if[i].req_data.rw)
`TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid));
else
`TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid));
end
if (core_rsp_fire) begin
`TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid));
end
end
end
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
if ((UUID_WIDTH != 0) && (NC_BYPASS != 0)) begin
assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign mem_req_uuid = 0;
assign mem_rsp_uuid = 0;
end
wire mem_req_fire = mem_bus_if.req_valid && mem_bus_if.req_ready;
wire mem_rsp_fire = mem_bus_if.rsp_valid && mem_bus_if.rsp_ready;
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_bus_if.req_data.rw)
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid));
else
`TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid));
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n",
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid));
end
end
`endif
endmodule

View File

@@ -1,314 +0,0 @@
`include "VX_cache_define.vh"
// VX_core_req_bank_sel
//
// Steers up to NUM_REQS core word requests onto NUM_BANKS per-bank request
// interfaces, each carrying up to NUM_PORTS words of the same cache line.
// The steering logic is purely combinational (clk/reset are only used by the
// optional PERF_ENABLE stall counter at the bottom of the module).
//
// Arbitration is implicit: every selection loop walks the requests from the
// highest index down to 0 using blocking assignments, so when several valid
// requests compete for the same bank (or bank port) the lowest-indexed one wins.
// Outputs of banks/ports that receive no request are left at 'x (valid/pmask are 0).
//
// The address-slicing macros (`SELECT_LINE_ADDR0/X, `SELECT_BANK_ID) and the
// width macros (`WORD_ADDR_WIDTH, `LINE_ADDR_WIDTH, `REQS_BITS, ...) are defined
// outside this file (presumably in VX_cache_define.vh).
module VX_core_req_bank_sel #(
parameter CACHE_ID = 0,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 64,
// Size of a word in bytes
parameter WORD_SIZE = 4,
// Number of banks
parameter NUM_BANKS = 4,
// Number of ports per banks
parameter NUM_PORTS = 1,
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// core request tag size
parameter CORE_TAG_WIDTH = 3,
// bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = 0
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
// Running count of bank-conflict stalls (see the PERF_ENABLE section below)
output wire [`PERF_CTR_BITS-1:0] bank_stalls,
`endif
// Incoming core requests, one lane per requester
input wire [NUM_REQS-1:0] core_req_valid,
input wire [NUM_REQS-1:0] core_req_rw,
input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data,
input wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag,
output wire [NUM_REQS-1:0] core_req_ready,
// Outgoing per-bank requests; pmask flags which ports of a bank carry a word,
// tid records the index of the originating core request lane
output wire [NUM_BANKS-1:0] per_bank_core_req_valid,
output wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_req_pmask,
output wire [NUM_BANKS-1:0] per_bank_core_req_rw,
output wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr,
output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] per_bank_core_req_wsel,
output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen,
output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data,
output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid,
output wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag,
input wire [NUM_BANKS-1:0] per_bank_core_req_ready
);
`UNUSED_PARAM (CACHE_ID)
// Configuration constraints: no more banks than requesters, no more ports than banks
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid value"))
`STATIC_ASSERT(NUM_PORTS <= NUM_BANKS, ("invalid value"))
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
// Per-request address decomposition: line address, word-in-line select, bank id
wire [NUM_REQS-1:0][`LINE_ADDR_WIDTH-1:0] core_req_line_addr;
wire [NUM_REQS-1:0][`UP(`WORD_SELECT_BITS)-1:0] core_req_wsel;
wire [NUM_REQS-1:0][`UP(`BANK_SELECT_BITS)-1:0] core_req_bid;
for (genvar i = 0; i < NUM_REQS; i++) begin
// The line-address slice depends on whether the bank bits are offset within the index
if (BANK_ADDR_OFFSET == 0) begin
assign core_req_line_addr[i] = `SELECT_LINE_ADDR0(core_req_addr[i]);
end else begin
assign core_req_line_addr[i] = `SELECT_LINE_ADDRX(core_req_addr[i]);
end
// Word select is taken from the low bits of the word address
assign core_req_wsel[i] = core_req_addr[i][`UP(`WORD_SELECT_BITS)-1:0];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
// With a single bank every request targets bank 0
if (NUM_BANKS > 1) begin
assign core_req_bid[i] = `SELECT_BANK_ID(core_req_addr[i]);
end else begin
assign core_req_bid[i] = 0;
end
end
// Combinational staging copies of the outputs (declared reg because they are
// written from always @(*) blocks); wired to the ports at the end of the module
reg [NUM_BANKS-1:0] per_bank_core_req_valid_r;
reg [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_req_pmask_r;
reg [NUM_BANKS-1:0][NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] per_bank_core_req_wsel_r;
reg [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen_r;
reg [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data_r;
reg [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid_r;
reg [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag_r;
reg [NUM_BANKS-1:0] per_bank_core_req_rw_r;
reg [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr_r;
reg [NUM_REQS-1:0] core_req_ready_r;
if (NUM_REQS > 1) begin
if (NUM_PORTS > 1) begin
// ---- Multiple requesters, multi-ported banks ----
// Step 1: for every bank pick the line address and rw of the winning
// (lowest-indexed valid) request; only requests agreeing with both can
// share that bank in the same cycle.
reg [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_line_addr_r;
reg [NUM_BANKS-1:0] per_bank_rw_r;
wire [NUM_REQS-1:0] core_req_line_match;
always @(*) begin
per_bank_line_addr_r = 'x;
per_bank_rw_r = 'x;
for (integer i = NUM_REQS-1; i >= 0; --i) begin
if (core_req_valid[i]) begin
per_bank_line_addr_r[core_req_bid[i]] = core_req_line_addr[i];
per_bank_rw_r[core_req_bid[i]] = core_req_rw[i];
end
end
end
// Step 2: a request "matches" when it hits the line and direction chosen for its bank
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_line_match[i] = (core_req_line_addr[i] == per_bank_line_addr_r[core_req_bid[i]])
&& (core_req_rw[i] == per_bank_rw_r[core_req_bid[i]]);
end
if (NUM_PORTS < NUM_REQS) begin
// Fewer ports than requesters: request i is statically bound to port
// (i % NUM_PORTS), so several requests may collide on one bank port.
// req_select_table_r holds, per bank port, a one-hot mask of the request
// that currently owns it.
reg [NUM_BANKS-1:0][NUM_PORTS-1:0][NUM_REQS-1:0] req_select_table_r;
always @(*) begin
per_bank_core_req_valid_r = 0;
per_bank_core_req_pmask_r = 0;
per_bank_core_req_rw_r = 'x;
per_bank_core_req_addr_r = 'x;
per_bank_core_req_wsel_r = 'x;
per_bank_core_req_byteen_r= 'x;
per_bank_core_req_data_r = 'x;
per_bank_core_req_tag_r = 'x;
per_bank_core_req_tid_r = 'x;
req_select_table_r = 'x;
// Descending walk: a lower-indexed request overwrites a higher one on the same slot
for (integer i = NUM_REQS-1; i >= 0; --i) begin
if (core_req_valid[i]) begin
per_bank_core_req_valid_r[core_req_bid[i]] = 1;
// The port is only marked active when its owner matches the bank's selected line
per_bank_core_req_pmask_r[core_req_bid[i]][i % NUM_PORTS] = core_req_line_match[i];
per_bank_core_req_wsel_r[core_req_bid[i]][i % NUM_PORTS] = core_req_wsel[i];
per_bank_core_req_byteen_r[core_req_bid[i]][i % NUM_PORTS] = core_req_byteen[i];
per_bank_core_req_data_r[core_req_bid[i]][i % NUM_PORTS] = core_req_data[i];
per_bank_core_req_tid_r[core_req_bid[i]][i % NUM_PORTS] = `REQS_BITS'(i);
per_bank_core_req_tag_r[core_req_bid[i]][i % NUM_PORTS] = core_req_tag[i];
per_bank_core_req_rw_r[core_req_bid[i]] = core_req_rw[i];
per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i];
req_select_table_r[core_req_bid[i]][i % NUM_PORTS] = (1 << i);
end
end
end
// A request is accepted when its bank is ready, it matches the bank's
// selected line, and it is the owner of its bank port
always @(*) begin
for (integer i = 0; i < NUM_REQS; ++i) begin
core_req_ready_r[i] = per_bank_core_req_ready[core_req_bid[i]]
&& core_req_line_match[i]
&& req_select_table_r[core_req_bid[i]][i % NUM_PORTS][i];
end
end
end else begin
// At least as many ports as requesters: no ownership table is kept in
// this branch; the same (i % NUM_PORTS) port binding is used
always @(*) begin
per_bank_core_req_valid_r = 0;
per_bank_core_req_pmask_r = 0;
per_bank_core_req_rw_r = 'x;
per_bank_core_req_addr_r = 'x;
per_bank_core_req_wsel_r = 'x;
per_bank_core_req_byteen_r= 'x;
per_bank_core_req_data_r = 'x;
per_bank_core_req_tag_r = 'x;
per_bank_core_req_tid_r = 'x;
for (integer i = NUM_REQS-1; i >= 0; --i) begin
if (core_req_valid[i]) begin
per_bank_core_req_valid_r[core_req_bid[i]] = 1;
per_bank_core_req_pmask_r[core_req_bid[i]][i % NUM_PORTS] = core_req_line_match[i];
per_bank_core_req_wsel_r[core_req_bid[i]][i % NUM_PORTS] = core_req_wsel[i];
per_bank_core_req_byteen_r[core_req_bid[i]][i % NUM_PORTS] = core_req_byteen[i];
per_bank_core_req_data_r[core_req_bid[i]][i % NUM_PORTS] = core_req_data[i];
per_bank_core_req_tid_r[core_req_bid[i]][i % NUM_PORTS] = `REQS_BITS'(i);
per_bank_core_req_tag_r[core_req_bid[i]][i % NUM_PORTS] = core_req_tag[i];
per_bank_core_req_rw_r[core_req_bid[i]] = core_req_rw[i];
per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i];
end
end
end
// Accepted when the bank is ready and the request matches the bank's selected line
always @(*) begin
for (integer i = 0; i < NUM_REQS; ++i) begin
core_req_ready_r[i] = per_bank_core_req_ready[core_req_bid[i]]
&& core_req_line_match[i];
end
end
end
end else begin
// ---- Multiple requesters, single-ported banks ----
// Each bank forwards exactly one request: the lowest-indexed valid one targeting it
always @(*) begin
per_bank_core_req_valid_r = 0;
per_bank_core_req_rw_r = 'x;
per_bank_core_req_addr_r = 'x;
per_bank_core_req_wsel_r = 'x;
per_bank_core_req_byteen_r= 'x;
per_bank_core_req_data_r = 'x;
per_bank_core_req_tag_r = 'x;
per_bank_core_req_tid_r = 'x;
for (integer i = NUM_REQS-1; i >= 0; --i) begin
if (core_req_valid[i]) begin
per_bank_core_req_valid_r[core_req_bid[i]] = 1;
per_bank_core_req_rw_r[core_req_bid[i]] = core_req_rw[i];
per_bank_core_req_addr_r[core_req_bid[i]] = core_req_line_addr[i];
per_bank_core_req_wsel_r[core_req_bid[i]] = core_req_wsel[i];
per_bank_core_req_byteen_r[core_req_bid[i]]= core_req_byteen[i];
per_bank_core_req_data_r[core_req_bid[i]] = core_req_data[i];
per_bank_core_req_tag_r[core_req_bid[i]] = core_req_tag[i];
per_bank_core_req_tid_r[core_req_bid[i]] = `REQS_BITS'(i);
end
end
// With one port per bank the port mask is simply the bank valid bit
per_bank_core_req_pmask_r = per_bank_core_req_valid_r;
end
if (NUM_BANKS > 1) begin
// Return each bank's ready to the request lane that won that bank
always @(*) begin
core_req_ready_r = 0;
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid_r[i]) begin
core_req_ready_r[per_bank_core_req_tid_r[i]] = per_bank_core_req_ready[i];
end
end
end
end else begin
// Single bank: its ready goes to the selected lane (not gated by valid here)
always @(*) begin
core_req_ready_r = 0;
core_req_ready_r[per_bank_core_req_tid_r[0]] = per_bank_core_req_ready;
end
end
end
end else begin
// ---- Single requester ----
if (NUM_BANKS > 1) begin
// Route the lone request to the bank named by its address; all other banks stay idle
always @(*) begin
per_bank_core_req_valid_r = 0;
per_bank_core_req_rw_r = 'x;
per_bank_core_req_addr_r = 'x;
per_bank_core_req_wsel_r = 'x;
per_bank_core_req_byteen_r= 'x;
per_bank_core_req_data_r = 'x;
per_bank_core_req_tag_r = 'x;
per_bank_core_req_tid_r = 'x;
per_bank_core_req_valid_r[core_req_bid[0]] = core_req_valid;
per_bank_core_req_rw_r[core_req_bid[0]] = core_req_rw;
per_bank_core_req_addr_r[core_req_bid[0]] = core_req_line_addr;
per_bank_core_req_wsel_r[core_req_bid[0]] = core_req_wsel;
per_bank_core_req_byteen_r[core_req_bid[0]] = core_req_byteen;
per_bank_core_req_data_r[core_req_bid[0]] = core_req_data;
per_bank_core_req_tag_r[core_req_bid[0]] = core_req_tag;
per_bank_core_req_tid_r[core_req_bid[0]] = 0;
core_req_ready_r = per_bank_core_req_ready[core_req_bid[0]];
per_bank_core_req_pmask_r = per_bank_core_req_valid_r;
end
end else begin
// One requester, one bank: straight pass-through
`UNUSED_VAR (core_req_bid)
always @(*) begin
per_bank_core_req_valid_r = core_req_valid;
per_bank_core_req_rw_r = core_req_rw;
per_bank_core_req_addr_r = core_req_line_addr;
per_bank_core_req_wsel_r = core_req_wsel;
per_bank_core_req_byteen_r = core_req_byteen;
per_bank_core_req_data_r = core_req_data;
per_bank_core_req_tag_r = core_req_tag;
per_bank_core_req_tid_r = 0;
core_req_ready_r = per_bank_core_req_ready;
per_bank_core_req_pmask_r = per_bank_core_req_valid_r;
end
end
end
// Drive the module outputs from the staging signals
assign per_bank_core_req_valid = per_bank_core_req_valid_r;
assign per_bank_core_req_pmask = per_bank_core_req_pmask_r;
assign per_bank_core_req_rw = per_bank_core_req_rw_r;
assign per_bank_core_req_addr = per_bank_core_req_addr_r;
assign per_bank_core_req_wsel = per_bank_core_req_wsel_r;
assign per_bank_core_req_byteen = per_bank_core_req_byteen_r;
assign per_bank_core_req_data = per_bank_core_req_data_r;
assign per_bank_core_req_tag = per_bank_core_req_tag_r;
assign per_bank_core_req_tid = per_bank_core_req_tid_r;
assign core_req_ready = core_req_ready_r;
`ifdef PERF_ENABLE
// Bank-stall accounting: a request counts as stalled in a cycle when it is
// valid and its target bank is ready, yet it was not accepted (i.e. it lost
// the bank/port to another request or did not match the selected line).
reg [NUM_REQS-1:0] core_req_sel_r;
always @(*) begin
core_req_sel_r = 0;
for (integer i = 0; i < NUM_REQS; ++i) begin
if (core_req_valid[i]) begin
core_req_sel_r[i] = per_bank_core_req_ready[core_req_bid[i]];
end
end
end
reg [`PERF_CTR_BITS-1:0] bank_stalls_r;
wire [$clog2(NUM_REQS+1)-1:0] bank_stall_cnt;
wire [NUM_REQS-1:0] bank_stall_mask = core_req_sel_r & ~core_req_ready;
// `POP_COUNT is defined elsewhere; presumably sets bank_stall_cnt to the number of set mask bits
`POP_COUNT(bank_stall_cnt, bank_stall_mask);
// Accumulate the per-cycle stall count; cleared on reset
always @(posedge clk) begin
if (reset) begin
bank_stalls_r <= 0;
end else begin
bank_stalls_r <= bank_stalls_r + `PERF_CTR_BITS'(bank_stall_cnt);
end
end
assign bank_stalls = bank_stalls_r;
`endif
endmodule

View File

@@ -1,350 +0,0 @@
`include "VX_cache_define.vh"
// VX_core_rsp_merge
//
// Collects the per-bank core responses (up to NUM_PORTS words per bank) and
// places them back on the core response bus, indexed by the requester id (tid)
// that each bank returns with its data.
//
// Two response-bus shapes are supported, chosen by CORE_TAG_ID_BITS:
//   - CORE_TAG_ID_BITS != 0 : one shared tag per cycle; all pending bank words
//     whose tag id matches the selected tag are sent together, with
//     core_rsp_tmask flagging the lanes that carry data.
//   - CORE_TAG_ID_BITS == 0 : every request lane has its own valid/tag/ready.
// `CORE_RSP_TAGS is defined outside this file; presumably it is 1 in the first
// mode and NUM_REQS in the second -- TODO confirm against VX_cache_define.vh.
//
// For multi-ported banks a small "sent" bitmap per bank remembers which ports
// have already been delivered, so a bank response may drain over several cycles.
module VX_core_rsp_merge #(
parameter CACHE_ID = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 1,
// Number of banks
parameter NUM_BANKS = 1,
// Number of ports per banks
parameter NUM_PORTS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// core request tag size
parameter CORE_TAG_WIDTH = 1,
// size of tag id in core request tag
parameter CORE_TAG_ID_BITS = 0,
// output register
parameter OUT_REG = 0
) (
input wire clk,
input wire reset,
// Per Bank WB
input wire [NUM_BANKS-1:0] per_bank_core_rsp_valid,
input wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_pmask,
input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data,
input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`REQS_BITS-1:0] per_bank_core_rsp_tid,
input wire [NUM_BANKS-1:0][NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_rsp_tag,
output wire [NUM_BANKS-1:0] per_bank_core_rsp_ready,
// Core Response
output wire [`CORE_RSP_TAGS-1:0] core_rsp_valid,
output wire [NUM_REQS-1:0] core_rsp_tmask,
output wire [`CORE_RSP_TAGS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag,
output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
input wire [`CORE_RSP_TAGS-1:0] core_rsp_ready
);
`UNUSED_PARAM (CACHE_ID)
if (NUM_BANKS > 1) begin
// ---- Multi-bank configurations ----
// Pre-buffer view of the response lanes and the per-bank ready staging signal
reg [NUM_REQS-1:0] core_rsp_valid_unqual;
reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual;
reg [NUM_BANKS-1:0] per_bank_core_rsp_ready_r;
if (CORE_TAG_ID_BITS != 0) begin
// The core response bus handles a single tag at a time.
// We first need to select the current tag to process,
// then send all bank responses for that tag as a batch
wire [CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual;
wire core_rsp_ready_unqual;
if (NUM_PORTS > 1) begin
// Delivery bookkeeping per bank port:
//   *_sent_r : ports delivered in earlier cycles (registered)
//   *_sent   : ports being delivered in the current cycle
//   *_sent_n : union of the two
reg [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_r, per_bank_core_rsp_sent;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_n;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign per_bank_core_rsp_sent_n[i] = per_bank_core_rsp_sent_r[i] | per_bank_core_rsp_sent[i];
end
// Once every port in a bank's pmask has been delivered the bitmap is
// cleared for the next response; otherwise progress is accumulated
always @(posedge clk) begin
if (reset) begin
per_bank_core_rsp_sent_r <= '0;
end else begin
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]) begin
per_bank_core_rsp_sent_r[i] <= '0;
end else begin
per_bank_core_rsp_sent_r[i] <= per_bank_core_rsp_sent_n[i];
end
end
end
end
// A bank port is still pending when the bank is valid, the port is
// active, and it has not been delivered yet
wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_valid_p;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
for (genvar p = 0; p < NUM_PORTS; ++p) begin
assign per_bank_core_rsp_valid_p[i][p] = per_bank_core_rsp_valid[i]
&& per_bank_core_rsp_pmask[i][p]
&& !per_bank_core_rsp_sent_r[i][p];
end
end
// Choose the tag to serve this cycle among the pending ports.
// VX_find_first is defined elsewhere; presumably it returns the data of
// the first valid entry -- TODO confirm its ordering.
VX_find_first #(
.N (NUM_BANKS * NUM_PORTS),
.DATAW (CORE_TAG_WIDTH)
) find_first (
.valid_i (per_bank_core_rsp_valid_p),
.data_i (per_bank_core_rsp_tag),
.data_o (core_rsp_tag_unqual),
`UNUSED_PIN (valid_o)
);
// Batch every pending port whose tag id (low CORE_TAG_ID_BITS bits)
// equals the selected tag's id; a port counts as sent only if the
// output buffer accepts the batch this cycle
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_data_unqual = 'x;
per_bank_core_rsp_sent = 0;
for (integer i = 0; i < NUM_BANKS; ++i) begin
for (integer p = 0; p < NUM_PORTS; ++p) begin
if (per_bank_core_rsp_valid[i]
&& per_bank_core_rsp_pmask[i][p]
&& !per_bank_core_rsp_sent_r[i][p]
&& (per_bank_core_rsp_tag[i][p][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin
core_rsp_valid_unqual[per_bank_core_rsp_tid[i][p]] = 1;
core_rsp_data_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_data[i][p];
per_bank_core_rsp_sent[i][p] = core_rsp_ready_unqual;
end
end
end
end
// A bank is released when all of its active ports have been delivered
always @(*) begin
for (integer i = 0; i < NUM_BANKS; ++i) begin
per_bank_core_rsp_ready_r[i] = (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]);
end
end
end else begin
// Single-ported banks: no sent bitmap is needed, each bank holds one word
`UNUSED_VAR (per_bank_core_rsp_pmask)
VX_find_first #(
.N (NUM_BANKS),
.DATAW (CORE_TAG_WIDTH)
) find_first (
.valid_i (per_bank_core_rsp_valid),
.data_i (per_bank_core_rsp_tag),
.data_o (core_rsp_tag_unqual),
`UNUSED_PIN (valid_o)
);
// Forward every valid bank whose tag id matches the selected tag;
// those banks see the output buffer's ready, all others see 0
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_data_unqual = 'x;
per_bank_core_rsp_ready_r = 0;
for (integer i = 0; i < NUM_BANKS; i++) begin
if (per_bank_core_rsp_valid[i]
&& (per_bank_core_rsp_tag[i][0][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin
core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1;
core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i];
per_bank_core_rsp_ready_r[i] = core_rsp_ready_unqual;
end
end
end
end
// The shared response is valid whenever any bank has a response
wire core_rsp_valid_any = (| per_bank_core_rsp_valid);
// Output stage carrying {lane mask, tag, data}; configured as pass-through when OUT_REG == 0
VX_skid_buffer #(
.DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)),
.PASSTHRU (0 == OUT_REG)
) out_sbuf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_any),
.data_in ({core_rsp_valid_unqual, core_rsp_tag_unqual, core_rsp_data_unqual}),
.ready_in (core_rsp_ready_unqual),
.valid_out (core_rsp_valid),
.data_out ({core_rsp_tmask, core_rsp_tag, core_rsp_data}),
.ready_out (core_rsp_ready)
);
end else begin
// Per-lane tags: every request lane has an independent valid/tag/data/ready
reg [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual;
wire [NUM_REQS-1:0] core_rsp_ready_unqual;
if (NUM_PORTS > 1) begin
// bank_select_table remembers, per lane, which {port, bank} is feeding it
reg [NUM_REQS-1:0][(`PORTS_BITS + `BANK_SELECT_BITS)-1:0] bank_select_table;
// Same delivered-port bookkeeping as in the shared-tag mode above
reg [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_r, per_bank_core_rsp_sent;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_rsp_sent_n;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign per_bank_core_rsp_sent_n[i] = per_bank_core_rsp_sent_r[i] | per_bank_core_rsp_sent[i];
end
always @(posedge clk) begin
if (reset) begin
per_bank_core_rsp_sent_r <= '0;
end else begin
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]) begin
per_bank_core_rsp_sent_r[i] <= '0;
end else begin
per_bank_core_rsp_sent_r[i] <= per_bank_core_rsp_sent_n[i];
end
end
end
end
// Map pending bank ports onto their lanes. Banks are walked from the
// highest index down, so on a lane conflict a lower-numbered bank
// overrides a higher-numbered one (last assignment wins).
always @(*) begin
core_rsp_valid_unqual = '0;
core_rsp_tag_unqual = 'x;
core_rsp_data_unqual = 'x;
bank_select_table = 'x;
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
for (integer p = 0; p < NUM_PORTS; ++p) begin
if (per_bank_core_rsp_valid[i]
&& per_bank_core_rsp_pmask[i][p]
&& !per_bank_core_rsp_sent_r[i][p]) begin
core_rsp_valid_unqual[per_bank_core_rsp_tid[i][p]] = 1;
core_rsp_tag_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_tag[i][p];
core_rsp_data_unqual[per_bank_core_rsp_tid[i][p]] = per_bank_core_rsp_data[i][p];
bank_select_table[per_bank_core_rsp_tid[i][p]] = {`PORTS_BITS'(p), `BANK_SELECT_BITS'(i)};
end
end
end
end
// Translate each lane's acceptance back to the {bank, port} that fed it
always @(*) begin
per_bank_core_rsp_sent = '0;
for (integer i = 0; i < NUM_REQS; i++) begin
if (core_rsp_valid_unqual[i]) begin
per_bank_core_rsp_sent[bank_select_table[i][0 +: `BANK_SELECT_BITS]][bank_select_table[i][`BANK_SELECT_BITS +: `PORTS_BITS]] = core_rsp_ready_unqual[i];
end
end
end
// A bank is released when all of its active ports have been delivered
always @(*) begin
for (integer i = 0; i < NUM_BANKS; i++) begin
per_bank_core_rsp_ready_r[i] = (per_bank_core_rsp_sent_n[i] == per_bank_core_rsp_pmask[i]);
end
end
end else begin
`UNUSED_VAR (per_bank_core_rsp_pmask)
// Single-ported banks: bank_select_table holds, per lane, a one-hot mask of the winning bank
reg [NUM_REQS-1:0][NUM_BANKS-1:0] bank_select_table;
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_tag_unqual = 'x;
core_rsp_data_unqual = 'x;
bank_select_table = 'x;
// Descending walk: the lowest-numbered valid bank wins a contested lane
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
if (per_bank_core_rsp_valid[i]) begin
core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1;
core_rsp_tag_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_tag[i];
core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i];
bank_select_table[per_bank_core_rsp_tid[i]] = (1 << i);
end
end
end
// A bank is ready when its target lane is ready and it is the bank selected for that lane
always @(*) begin
for (integer i = 0; i < NUM_BANKS; ++i) begin
per_bank_core_rsp_ready_r[i] = core_rsp_ready_unqual[per_bank_core_rsp_tid[i]]
&& bank_select_table[per_bank_core_rsp_tid[i]][i];
end
end
end
// One output stage per lane carrying {tag, data}; pass-through when OUT_REG == 0
for (genvar i = 0; i < NUM_REQS; i++) begin
VX_skid_buffer #(
.DATAW (CORE_TAG_WIDTH + `WORD_WIDTH),
.PASSTHRU (0 == OUT_REG)
) out_sbuf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_unqual[i]),
.data_in ({core_rsp_tag_unqual[i], core_rsp_data_unqual[i]}),
.ready_in (core_rsp_ready_unqual[i]),
.valid_out (core_rsp_valid[i]),
.data_out ({core_rsp_tag[i],core_rsp_data[i]}),
.ready_out (core_rsp_ready[i])
);
end
// In per-lane mode the lane mask is identical to the per-lane valid bits
assign core_rsp_tmask = core_rsp_valid;
end
assign per_bank_core_rsp_ready = per_bank_core_rsp_ready_r;
end else begin
// ---- Single-bank configurations: purely combinational wiring, no buffering ----
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (per_bank_core_rsp_pmask)
if (NUM_REQS > 1) begin
reg [`CORE_RSP_TAGS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual;
reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual;
if (CORE_TAG_ID_BITS != 0) begin
// Shared tag: the bank's tag is forwarded as-is and only the lane
// named by tid is flagged in the mask and receives the data
reg [NUM_REQS-1:0] core_rsp_tmask_unqual;
always @(*) begin
core_rsp_tmask_unqual = 0;
core_rsp_tmask_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_valid;
core_rsp_tag_unqual = per_bank_core_rsp_tag;
core_rsp_data_unqual = 'x;
core_rsp_data_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_data;
end
assign core_rsp_valid = per_bank_core_rsp_valid;
assign core_rsp_tmask = core_rsp_tmask_unqual;
assign per_bank_core_rsp_ready = core_rsp_ready;
end else begin
// Per-lane tags: valid, tag and data are all steered to the lane named by tid,
// and that lane's ready is returned to the bank
reg [`CORE_RSP_TAGS-1:0] core_rsp_valid_unqual;
always @(*) begin
core_rsp_valid_unqual = 0;
core_rsp_valid_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_valid;
core_rsp_tag_unqual = 'x;
core_rsp_tag_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_tag;
core_rsp_data_unqual = 'x;
core_rsp_data_unqual[per_bank_core_rsp_tid] = per_bank_core_rsp_data;
end
assign core_rsp_valid = core_rsp_valid_unqual;
assign core_rsp_tmask = core_rsp_valid_unqual;
assign per_bank_core_rsp_ready = core_rsp_ready[per_bank_core_rsp_tid];
end
assign core_rsp_tag = core_rsp_tag_unqual;
assign core_rsp_data = core_rsp_data_unqual;
end else begin
// One bank, one requester: direct pass-through
`UNUSED_VAR(per_bank_core_rsp_tid)
assign core_rsp_valid = per_bank_core_rsp_valid;
assign core_rsp_tmask = per_bank_core_rsp_valid;
assign core_rsp_tag = per_bank_core_rsp_tag;
assign core_rsp_data = per_bank_core_rsp_data;
assign per_bank_core_rsp_ready = core_rsp_ready;
end
end
endmodule

View File

@@ -1,133 +0,0 @@
`include "VX_cache_define.vh"
// VX_data_access
//
// Data store of one cache bank: a single-port RAM holding one cache line per
// entry, plus the logic that merges up to NUM_PORTS word writes into a
// line-wide write and extracts up to NUM_PORTS words from the line read out.
//
// On `write` the RAM is updated with the merged port data under per-byte
// enables; otherwise `fill` writes the whole `fill_data` line. When
// WRITE_ENABLE is 0 only fills can modify the RAM.
// The width macros (`LINE_ADDR_WIDTH, `WORDS_PER_LINE, `LINES_PER_BANK, ...)
// are defined outside this file (presumably in VX_cache_define.vh).
module VX_data_access #(
parameter CACHE_ID = 0,
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 1,
// Number of banks
parameter NUM_BANKS = 1,
// Number of ports per banks
parameter NUM_PORTS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Width of the per-port word-in-line selector
parameter WORD_SELECT_BITS = `UP(`WORD_SELECT_BITS)
) (
input wire clk,
input wire reset,
`IGNORE_UNUSED_BEGIN
// Request id; only referenced by the debug trace below
input wire[`DBG_CACHE_REQ_IDW-1:0] req_id,
`IGNORE_UNUSED_END
// stall/read are only referenced by the debug trace below
input wire stall,
input wire read,
// fill: write the whole line from fill_data; write: merge port words into the line
input wire fill,
input wire write,
input wire[`LINE_ADDR_WIDTH-1:0] addr,
// Per-port word select, active mask, byte enables and write data
input wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] wsel,
input wire [NUM_PORTS-1:0] pmask,
input wire [NUM_PORTS-1:0][WORD_SIZE-1:0] byteen,
input wire [`WORDS_PER_LINE-1:0][`WORD_WIDTH-1:0] fill_data,
input wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] write_data,
// Per-port word extracted from the line read at addr
output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] read_data
);
`UNUSED_PARAM (CACHE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (reset)
`UNUSED_VAR (addr)
`UNUSED_VAR (read)
// RAM write-enable width: one bit per line byte when the cache is writable,
// otherwise a single enable bit for whole-line fills
localparam BYTEENW = WRITE_ENABLE ? CACHE_LINE_SIZE : 1;
wire [`WORDS_PER_LINE-1:0][`WORD_WIDTH-1:0] rdata;
wire [`WORDS_PER_LINE-1:0][`WORD_WIDTH-1:0] wdata;
wire [BYTEENW-1:0] wren;
// The RAM is indexed by the low `LINE_SELECT_BITS bits of the line address
wire [`LINE_SELECT_BITS-1:0] line_addr = addr[`LINE_SELECT_BITS-1:0];
if (WRITE_ENABLE) begin
if (`WORDS_PER_LINE > 1) begin
// Line-wide write data and byte enables assembled from the port inputs
reg [`WORDS_PER_LINE-1:0][`WORD_WIDTH-1:0] wdata_r;
reg [`WORDS_PER_LINE-1:0][WORD_SIZE-1:0] wren_r;
if (NUM_PORTS > 1) begin
// Each active port drops its word and byte enables at its selected word
// slot; if two active ports select the same slot the higher-numbered
// port overrides (last assignment wins)
always @(*) begin
wdata_r = 'x;
wren_r = 0;
for (integer i = 0; i < NUM_PORTS; ++i) begin
if (pmask[i]) begin
wdata_r[wsel[i]] = write_data[i];
wren_r[wsel[i]] = byteen[i];
end
end
end
end else begin
`UNUSED_VAR (pmask)
// Single port: replicate the word across the line and enable only the
// bytes of the selected word
always @(*) begin
wdata_r = {`WORDS_PER_LINE{write_data}};
wren_r = 0;
wren_r[wsel] = byteen;
end
end
// Writes use the merged data/enables; otherwise the fill line is written
// with every byte enabled whenever fill is asserted
assign wdata = write ? wdata_r : fill_data;
assign wren = write ? wren_r : {BYTEENW{fill}};
end else begin
// One word per line: the port data and byte enables already span the line
`UNUSED_VAR (wsel)
`UNUSED_VAR (pmask)
assign wdata = write ? write_data : fill_data;
assign wren = write ? byteen : {BYTEENW{fill}};
end
end else begin
// Read-only cache: only fills reach the RAM
`UNUSED_VAR (write)
`UNUSED_VAR (byteen)
`UNUSED_VAR (pmask)
`UNUSED_VAR (write_data)
assign wdata = fill_data;
assign wren = fill;
end
// Line storage. NO_RWCHECK semantics are defined by VX_sp_ram (not shown here).
VX_sp_ram #(
.DATAW (`CACHE_LINE_WIDTH),
.SIZE (`LINES_PER_BANK),
.BYTEENW (BYTEENW),
.NO_RWCHECK (1)
) data_store (
.clk (clk),
.addr (line_addr),
.wren (wren),
.wdata (wdata),
.rdata (rdata)
);
// Each port picks its word out of the line that was read
if (`WORDS_PER_LINE > 1) begin
for (genvar i = 0; i < NUM_PORTS; ++i) begin
assign read_data[i] = rdata[wsel[i]];
end
end else begin
assign read_data = rdata;
end
`UNUSED_VAR (stall)
`ifdef DBG_TRACE_CACHE_DATA
// Debug trace of fill/read/write activity, suppressed while stalled
always @(posedge clk) begin
if (fill && ~stall) begin
dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data);
end
if (read && ~stall) begin
dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, blk_addr=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, read_data, req_id);
end
if (write && ~stall) begin
dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, byteen=%b, blk_addr=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), byteen, line_addr, write_data, req_id);
end
end
`endif
endmodule

View File

@@ -1,36 +0,0 @@
`include "VX_cache_define.vh"
// Flush sequencer: after reset it emits every line index once, starting at 0,
// with valid_out held high; it then goes idle until the next reset.
module VX_flush_ctrl #(
    // Size of cache in bytes
    parameter CACHE_SIZE        = 16384,
    // Size of line inside a bank in bytes
    parameter CACHE_LINE_SIZE   = 1,
    // Number of banks
    parameter NUM_BANKS         = 1
) (
    input wire clk,
    input wire reset,
    output wire [`LINE_SELECT_BITS-1:0] addr_out,
    output wire valid_out
);
    // Current line index and "sequence in progress" flag.
    reg [`LINE_SELECT_BITS-1:0] line_idx_r;
    reg                         active_r;

    // High while the counter sits on the last line index (all ones).
    wire last_line = (& line_idx_r);

    always @(posedge clk) begin
        if (reset) begin
            active_r   <= 1'b1;
            line_idx_r <= '0;
        end else if (active_r) begin
            // The counter wraps back to zero on the same edge that ends the sequence.
            line_idx_r <= line_idx_r + 1;
            if (last_line) begin
                active_r <= 1'b0;
            end
        end
    end

    assign valid_out = active_r;
    assign addr_out  = line_idx_r;

endmodule

View File

@@ -1,234 +0,0 @@
`include "VX_cache_define.vh"
// Miss reservation table for one cache bank.
//
// Each of the MSHR_SIZE entries has a valid bit, a ready bit and a line
// address kept in registers, plus a request payload kept in a dual-port RAM.
//
//   allocate : on allocate_fire the entry at allocate_id becomes valid and
//              not-ready, its address is recorded and its payload is stored.
//   fill     : fill_valid forces the next dequeue candidate to fill_id;
//              fill_addr exposes that entry's recorded address.
//   lookup   : lookup_match is high when a valid entry other than lookup_id
//              holds lookup_addr; lookup_replay marks every address-matching
//              entry as ready.
//   dequeue  : on dequeue_fire the dequeued entry's valid bit is cleared and
//              the next candidate is picked among valid & ready entries.
//   release  : release_valid clears the valid bit of release_id.
//
// allocate_ready / allocate_id and dequeue_valid / dequeue_id are registered,
// i.e. they reflect the table state computed in the previous cycle.
module VX_miss_resrv #(
    parameter CACHE_ID          = 0,
    parameter BANK_ID           = 0,
    // Number of Word requests per cycle
    parameter NUM_REQS          = 1,
    // Size of line inside a bank in bytes
    parameter CACHE_LINE_SIZE   = 1,
    // Number of banks
    parameter NUM_BANKS         = 1,
    // Number of ports per banks
    parameter NUM_PORTS         = 1,
    // Size of a word in bytes
    parameter WORD_SIZE         = 1,
    // Miss Reserv Queue Knob
    parameter MSHR_SIZE         = 1,
    // core request tag size
    parameter CORE_TAG_WIDTH    = 1,
    parameter MSHR_ADDR_WIDTH   = $clog2(MSHR_SIZE)
) (
    input wire clk,
    input wire reset,

`IGNORE_UNUSED_BEGIN
    // request ids, only consumed by the debug trace below
    input wire[`DBG_CACHE_REQ_IDW-1:0]  deq_req_id,
    input wire[`DBG_CACHE_REQ_IDW-1:0]  lkp_req_id,
    input wire[`DBG_CACHE_REQ_IDW-1:0]  rel_req_id,
`IGNORE_UNUSED_END

    // allocate
    input wire                          allocate_valid,
    input wire [`LINE_ADDR_WIDTH-1:0]   allocate_addr,
    input wire [`MSHR_DATA_WIDTH-1:0]   allocate_data,
    output wire [MSHR_ADDR_WIDTH-1:0]   allocate_id,
    output wire                         allocate_ready,

    // fill
    input wire                          fill_valid,
    input wire [MSHR_ADDR_WIDTH-1:0]    fill_id,
    output wire [`LINE_ADDR_WIDTH-1:0]  fill_addr,

    // lookup
    input wire                          lookup_valid,
    input wire                          lookup_replay,
    input wire [MSHR_ADDR_WIDTH-1:0]    lookup_id,
    input wire [`LINE_ADDR_WIDTH-1:0]   lookup_addr,
    output wire                         lookup_match,

    // dequeue
    output wire                         dequeue_valid,
    output wire [MSHR_ADDR_WIDTH-1:0]   dequeue_id,
    output wire [`LINE_ADDR_WIDTH-1:0]  dequeue_addr,
    output wire [`MSHR_DATA_WIDTH-1:0]  dequeue_data,
    input wire                          dequeue_ready,

    // release
    input wire                          release_valid,
    input wire [MSHR_ADDR_WIDTH-1:0]    release_id
);
    `UNUSED_PARAM (CACHE_ID)
    `UNUSED_PARAM (BANK_ID)

    // Per-entry state (registered value and next value).
    reg [MSHR_SIZE-1:0][`LINE_ADDR_WIDTH-1:0] addr_table, addr_table_n;
    reg [MSHR_SIZE-1:0] valid_table, valid_table_n;
    reg [MSHR_SIZE-1:0] ready_table, ready_table_n;

    // Registered allocation slot and dequeue candidate.
    // The *_x values are intermediate results produced before the
    // allocate / fill / release updates are applied.
    reg allocate_rdy_r, allocate_rdy_n;
    reg [MSHR_ADDR_WIDTH-1:0] allocate_id_r, allocate_id_n;

    reg dequeue_val_r, dequeue_val_n, dequeue_val_x;
    reg [MSHR_ADDR_WIDTH-1:0] dequeue_id_r, dequeue_id_n, dequeue_id_x;

    reg [MSHR_SIZE-1:0] valid_table_x;
    reg [MSHR_SIZE-1:0] ready_table_x;

    wire [MSHR_SIZE-1:0] addr_matches;

    wire allocate_fire = allocate_valid && allocate_ready;
    wire dequeue_fire  = dequeue_valid && dequeue_ready;

    // Address comparison of every entry against the lookup address
    // (not qualified with the valid bit here).
    for (genvar i = 0; i < MSHR_SIZE; ++i) begin
        assign addr_matches[i] = (addr_table[i] == lookup_addr);
    end

    // Step 1: retire the dequeued entry and apply replay wake-ups.
    always @(*) begin
        valid_table_x = valid_table;
        ready_table_x = ready_table;
        if (dequeue_fire) begin
            valid_table_x[dequeue_id] = 0;
        end
        if (lookup_replay) begin
            ready_table_x |= addr_matches;
        end
    end

    // Next dequeue candidate among entries that are both valid and ready.
    // NOTE(review): presumably VX_lzc returns the index of the first set bit
    // and flags whether any bit is set; confirm against VX_lzc.
    VX_lzc #(
        .N (MSHR_SIZE)
    ) dequeue_sel (
        .in_i    (valid_table_x & ready_table_x),
        .cnt_o   (dequeue_id_x),
        .valid_o (dequeue_val_x)
    );

    // Next allocation slot among entries that will be invalid next cycle.
    VX_lzc #(
        .N (MSHR_SIZE)
    ) allocate_sel (
        .in_i    (~valid_table_n),
        .cnt_o   (allocate_id_n),
        .valid_o (allocate_rdy_n)
    );

    // Step 2: apply allocate / fill / release on top of step 1.
    // Later assignments win when several events hit the same cycle.
    always @(*) begin
        valid_table_n = valid_table_x;
        ready_table_n = ready_table_x;
        addr_table_n  = addr_table;
        dequeue_val_n = dequeue_val_r;
        dequeue_id_n  = dequeue_id_r;

        if (dequeue_fire) begin
            dequeue_val_n = dequeue_val_x;
            dequeue_id_n  = dequeue_id_x;
        end

        if (allocate_fire) begin
            valid_table_n[allocate_id] = 1;
            ready_table_n[allocate_id] = 0;
            addr_table_n[allocate_id]  = allocate_addr;
        end

        if (fill_valid) begin
            // a fill overrides the regular dequeue selection
            dequeue_val_n = 1;
            dequeue_id_n  = fill_id;
        end

        if (release_valid) begin
            valid_table_n[release_id] = 0;
        end
    end

    // Only the valid bits and handshake flags are reset; the remaining state
    // is updated every cycle.
    always @(posedge clk) begin
        if (reset) begin
            valid_table    <= 0;
            allocate_rdy_r <= 0;
            dequeue_val_r  <= 0;
        end else begin
            valid_table    <= valid_table_n;
            allocate_rdy_r <= allocate_rdy_n;
            dequeue_val_r  <= dequeue_val_n;
        end
        ready_table   <= ready_table_n;
        addr_table    <= addr_table_n;
        dequeue_id_r  <= dequeue_id_n;
        allocate_id_r <= allocate_id_n;
        `ASSERT(!allocate_fire || !valid_table[allocate_id_r], ("runtime error"));
        `ASSERT(!release_valid || valid_table[release_id], ("runtime error"));
    end

    `RUNTIME_ASSERT((!allocate_fire || ~valid_table[allocate_id]), ("%t: *** cache%0d:%0d in-use allocation: addr=%0h, id=%0d", $time, CACHE_ID, BANK_ID,
        `LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id))

    `RUNTIME_ASSERT((!fill_valid || valid_table[fill_id]), ("%t: *** cache%0d:%0d invalid fill: addr=%0h, id=%0d", $time, CACHE_ID, BANK_ID,
        `LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id))

    // Request payload storage.
    // The write is gated with allocate_fire (not just allocate_valid) so that
    // it stays consistent with the valid/ready/address tables, which only
    // change on an accepted allocation. Otherwise a request offered while
    // allocate_ready is low would overwrite the payload at allocate_id_r.
    VX_dp_ram #(
        .DATAW  (`MSHR_DATA_WIDTH),
        .SIZE   (MSHR_SIZE),
        .LUTRAM (1)
    ) entries (
        .clk   (clk),
        .waddr (allocate_id_r),
        .raddr (dequeue_id_r),
        .wren  (allocate_fire),
        .wdata (allocate_data),
        .rdata (dequeue_data)
    );

    assign fill_addr = addr_table[fill_id];

    assign allocate_ready = allocate_rdy_r;
    assign allocate_id    = allocate_id_r;

    assign dequeue_valid  = dequeue_val_r;
    assign dequeue_id     = dequeue_id_r;
    assign dequeue_addr   = addr_table[dequeue_id_r];

    // A lookup matches on any valid entry holding the same address, excluding
    // the entry identified by lookup_id itself.
    wire [MSHR_SIZE-1:0] lookup_entries;
    for (genvar i = 0; i < MSHR_SIZE; ++i) begin
        assign lookup_entries[i] = (i != lookup_id);
    end
    assign lookup_match = |(lookup_entries & valid_table & addr_matches);

    `UNUSED_VAR (lookup_valid)

`ifdef DBG_TRACE_CACHE_MSHR
    // Debug trace: logs each event, followed by a dump of the valid entries
    // ('*' marks ready entries).
    always @(posedge clk) begin
        if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin
            if (allocate_fire)
                dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID,
                    `LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_req_id);
            if (fill_valid)
                dpi_trace("%d: cache%0d:%0d mshr-fill: addr=%0h, id=%0d, addr=%0h\n", $time, CACHE_ID, BANK_ID,
                    `LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id, `LINE_TO_BYTE_ADDR(fill_addr, BANK_ID));
            if (dequeue_fire)
                dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID,
                    `LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_id);
            if (lookup_replay)
                dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID,
                    `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lkp_req_id);
            if (lookup_valid)
                dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b (#%0d)\n", $time, CACHE_ID, BANK_ID,
                    `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_req_id);
            if (release_valid)
                dpi_trace("%d: cache%0d:%0d mshr-release id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, release_id, rel_req_id);
            dpi_trace("%d: cache%0d:%0d mshr-table", $time, CACHE_ID, BANK_ID);
            for (integer i = 0; i < MSHR_SIZE; ++i) begin
                if (valid_table[i]) begin
                    dpi_trace(" ");
                    if (ready_table[i])
                        dpi_trace("*");
                    dpi_trace("%0d=%0h", i, `LINE_TO_BYTE_ADDR(addr_table[i], BANK_ID));
                end
            end
            dpi_trace("\n");
        end
    end
`endif

endmodule

View File

@@ -1,323 +0,0 @@
`include "VX_cache_define.vh"
// Bypass path around the cache.
//
// Core requests whose tag has bit NC_TAG_BIT set are not forwarded to the
// cache side (core_req_*_out); one of them per cycle is selected and sent
// directly on the memory request output. Memory responses whose tag has bit
// NC_TAG_BIT set are returned directly on the core response output instead
// of being forwarded to the cache side (mem_rsp_*_out).
//
// Priorities visible in the logic below:
//   - on the memory request output, the cache-side request (mem_req_*_in)
//     wins over a bypass request;
//   - on the core response output, the cache-side response (core_rsp_*_in)
//     wins over a bypass response.
//
// Tag layout of a bypass memory request (LSB first):
//   [core tag (CORE_TAG_IN_WIDTH)] [word index in memory line (D bits, if D != 0)] [requester index]
// cast to MEM_TAG_OUT_WIDTH. The response path slices the same fields back out.
// NOTE(review): "nc" presumably stands for non-cacheable - confirm.
module VX_nc_bypass #(
parameter NUM_PORTS = 1,
parameter NUM_REQS = 1,
// number of independent core response channels: 1 or NUM_REQS (see STATIC_ASSERT)
parameter NUM_RSP_TAGS = 0,
// position of the bypass flag inside the core request tag
parameter NC_TAG_BIT = 0,
parameter CORE_ADDR_WIDTH = 1,
parameter CORE_DATA_SIZE = 1,
parameter CORE_TAG_IN_WIDTH = 1,
parameter MEM_ADDR_WIDTH = 1,
parameter MEM_DATA_SIZE = 1,
parameter MEM_TAG_IN_WIDTH = 1,
parameter MEM_TAG_OUT_WIDTH = 1,
parameter CORE_DATA_WIDTH = CORE_DATA_SIZE * 8,
parameter MEM_DATA_WIDTH = MEM_DATA_SIZE * 8,
// cache-side core tag: the incoming tag with the bypass flag bit removed
parameter CORE_TAG_OUT_WIDTH = CORE_TAG_IN_WIDTH - 1,
parameter MEM_SELECT_BITS = `UP(`CLOG2(MEM_DATA_SIZE / CORE_DATA_SIZE))
) (
input wire clk,
input wire reset,
// Core request in
input wire [NUM_REQS-1:0] core_req_valid_in,
input wire [NUM_REQS-1:0] core_req_rw_in,
input wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_in,
input wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_in,
input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_in,
input wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_req_tag_in,
output wire [NUM_REQS-1:0] core_req_ready_in,
// Core request out
output wire [NUM_REQS-1:0] core_req_valid_out,
output wire [NUM_REQS-1:0] core_req_rw_out,
output wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_out,
output wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_out,
output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_out,
output wire [NUM_REQS-1:0][CORE_TAG_OUT_WIDTH-1:0] core_req_tag_out,
input wire [NUM_REQS-1:0] core_req_ready_out,
// Core response in
input wire [NUM_RSP_TAGS-1:0] core_rsp_valid_in,
input wire [NUM_REQS-1:0] core_rsp_tmask_in,
input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_in,
input wire [NUM_RSP_TAGS-1:0][CORE_TAG_OUT_WIDTH-1:0] core_rsp_tag_in,
output wire [NUM_RSP_TAGS-1:0] core_rsp_ready_in,
// Core response out
output wire [NUM_RSP_TAGS-1:0] core_rsp_valid_out,
output wire [NUM_REQS-1:0] core_rsp_tmask_out,
output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_out,
output wire [NUM_RSP_TAGS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out,
input wire [NUM_RSP_TAGS-1:0] core_rsp_ready_out,
// Memory request in
input wire mem_req_valid_in,
input wire mem_req_rw_in,
input wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_in,
input wire [NUM_PORTS-1:0] mem_req_pmask_in,
input wire [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in,
input wire [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_in,
input wire [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in,
input wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_in,
output wire mem_req_ready_in,
// Memory request out
output wire mem_req_valid_out,
output wire mem_req_rw_out,
output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_out,
output wire [NUM_PORTS-1:0] mem_req_pmask_out,
output wire [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_out,
output wire [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_out,
output wire [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_out,
output wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_out,
input wire mem_req_ready_out,
// Memory response in
input wire mem_rsp_valid_in,
input wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_in,
input wire [MEM_TAG_OUT_WIDTH-1:0] mem_rsp_tag_in,
output wire mem_rsp_ready_in,
// Memory response out
output wire mem_rsp_valid_out,
output wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_out,
output wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_out,
input wire mem_rsp_ready_out
);
`STATIC_ASSERT((NUM_RSP_TAGS == 1 || NUM_RSP_TAGS == NUM_REQS), ("invalid paramter"))
// the module is purely combinational
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
localparam CORE_REQ_TIDW = $clog2(NUM_REQS);
// width of one packed request: {tag, data, byteen, addr, rw}
localparam MUX_DATAW = CORE_TAG_IN_WIDTH + CORE_DATA_WIDTH + CORE_DATA_SIZE + CORE_ADDR_WIDTH + 1;
localparam CORE_LDATAW = $clog2(CORE_DATA_WIDTH);
localparam MEM_LDATAW = $clog2(MEM_DATA_WIDTH);
// log2 of the number of core words per memory line; 0 when both widths are equal
localparam D = MEM_LDATAW - CORE_LDATAW;
// core request handling
wire [NUM_REQS-1:0] core_req_valid_in_nc;
wire [NUM_REQS-1:0] core_req_nc_tids;
wire [`UP(CORE_REQ_TIDW)-1:0] core_req_nc_tid;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire core_req_nc_valid;
// per-requester bypass flag, read from the request tag
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_nc_tids[i] = core_req_tag_in[i][NC_TAG_BIT];
end
assign core_req_valid_in_nc = core_req_valid_in & core_req_nc_tids;
// pick one valid bypass request (index and one-hot form)
VX_priority_encoder #(
.N (NUM_REQS)
) core_req_sel (
.data_in (core_req_valid_in_nc),
.index (core_req_nc_tid),
.onehot (core_req_nc_sel),
.valid_out (core_req_nc_valid)
);
// the cache side only sees requests without the bypass flag
assign core_req_valid_out = core_req_valid_in & ~core_req_nc_tids;
assign core_req_rw_out = core_req_rw_in;
assign core_req_addr_out = core_req_addr_in;
assign core_req_byteen_out = core_req_byteen_in;
assign core_req_data_out = core_req_data_in;
// strip the bypass flag bit from the tag forwarded to the cache side
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_bits_remove #(
.N (CORE_TAG_IN_WIDTH),
.S (1),
.POS (NC_TAG_BIT)
) core_req_tag_remove (
.data_in (core_req_tag_in[i]),
.data_out (core_req_tag_out[i])
);
end
// A bypass request is accepted only when the cache side is not driving the
// memory request output, the memory is ready, and (with several requesters)
// this requester is the selected one. Other requests follow the cache-side ready.
if (NUM_REQS > 1) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_ready_in[i] = core_req_valid_in_nc[i] ?
(~mem_req_valid_in && mem_req_ready_out && core_req_nc_sel[i]) : core_req_ready_out[i];
end
end else begin
assign core_req_ready_in = core_req_valid_in_nc ? (~mem_req_valid_in && mem_req_ready_out) : core_req_ready_out;
end
// memory request handling
assign mem_req_valid_out = mem_req_valid_in || core_req_nc_valid;
assign mem_req_ready_in = mem_req_ready_out;
// cache-side memory tag widened by one bit at NC_TAG_BIT (sel_in is '0,
// so presumably the inserted flag bit is zero - confirm against VX_bits_insert)
wire [(MEM_TAG_IN_WIDTH+1)-1:0] mem_req_tag_in_c;
VX_bits_insert #(
.N (MEM_TAG_IN_WIDTH),
.S (1),
.POS (NC_TAG_BIT)
) mem_req_tag_insert (
.data_in (mem_req_tag_in),
.sel_in ('0),
.data_out (mem_req_tag_in_c)
);
// fields of the selected bypass request
wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel;
wire [CORE_DATA_WIDTH-1:0] core_req_data_in_sel;
wire [CORE_DATA_SIZE-1:0] core_req_byteen_in_sel;
wire [CORE_ADDR_WIDTH-1:0] core_req_addr_in_sel;
wire core_req_rw_in_sel;
if (NUM_REQS > 1) begin
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_nc_mux_in[i] = {core_req_tag_in[i], core_req_data_in[i], core_req_byteen_in[i], core_req_addr_in[i], core_req_rw_in[i]};
end
assign {core_req_tag_in_sel, core_req_data_in_sel, core_req_byteen_in_sel, core_req_addr_in_sel, core_req_rw_in_sel} = core_req_nc_mux_in[core_req_nc_tid];
end else begin
assign core_req_tag_in_sel = core_req_tag_in;
assign core_req_data_in_sel = core_req_data_in;
assign core_req_byteen_in_sel = core_req_byteen_in;
assign core_req_addr_in_sel = core_req_addr_in;
assign core_req_rw_in_sel = core_req_rw_in;
end
// cache-side request has priority on every memory request field
assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel;
// the low D bits of the core word address select the word inside the memory line
assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in_sel[D +: MEM_ADDR_WIDTH];
if (D != 0) begin
// a bypass request only uses port 0; the remaining ports get zero
// byte-enables and don't-care data / word select
reg [NUM_PORTS-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in_r;
reg [NUM_PORTS-1:0][MEM_SELECT_BITS-1:0] mem_req_wsel_in_r;
reg [NUM_PORTS-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
wire [D-1:0] req_addr_idx = core_req_addr_in_sel[D-1:0];
always @(*) begin
mem_req_byteen_in_r = 0;
mem_req_byteen_in_r[0] = core_req_byteen_in_sel;
mem_req_wsel_in_r = 'x;
mem_req_wsel_in_r[0] = req_addr_idx;
mem_req_data_in_r = 'x;
mem_req_data_in_r[0] = core_req_data_in_sel;
end
assign mem_req_pmask_out = mem_req_valid_in ? mem_req_pmask_in : NUM_PORTS'(1'b1);
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r;
assign mem_req_wsel_out = mem_req_valid_in ? mem_req_wsel_in : mem_req_wsel_in_r;
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : mem_req_data_in_r;
// bypass tag carries the requester index and the word index above the core tag
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_c) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, req_addr_idx, core_req_tag_in_sel});
end else begin
// memory line and core word have the same width: no word select needed,
// pmask and wsel outputs are tied to zero
`UNUSED_VAR (mem_req_wsel_in)
`UNUSED_VAR (mem_req_pmask_in)
assign mem_req_pmask_out = 0;
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel;
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : core_req_data_in_sel;
assign mem_req_wsel_out = 0;
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_c) : MEM_TAG_OUT_WIDTH'({core_req_nc_tid, core_req_tag_in_sel});
end
// core response handling
wire [NUM_RSP_TAGS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out_c;
// a memory response belongs to the bypass path when its tag has the flag bit set
wire is_mem_rsp_nc = mem_rsp_valid_in && mem_rsp_tag_in[NC_TAG_BIT];
// widen the cache-side response tag back to the core tag width
for (genvar i = 0; i < NUM_RSP_TAGS; ++i) begin
VX_bits_insert #(
.N (CORE_TAG_OUT_WIDTH),
.S (1),
.POS (NC_TAG_BIT)
) core_rsp_tag_insert (
.data_in (core_rsp_tag_in[i]),
.sel_in ('0),
.data_out (core_rsp_tag_out_c[i])
);
end
if (NUM_RSP_TAGS > 1) begin
// one response channel per requester: the bypass response is steered to
// the channel named by the requester index stored in the memory tag
wire [CORE_REQ_TIDW-1:0] rsp_tid = mem_rsp_tag_in[(CORE_TAG_IN_WIDTH + D) +: CORE_REQ_TIDW];
reg [NUM_REQS-1:0] rsp_nc_valid_r;
always @(*) begin
rsp_nc_valid_r = 0;
rsp_nc_valid_r[rsp_tid] = is_mem_rsp_nc;
end
assign core_rsp_valid_out = core_rsp_valid_in | rsp_nc_valid_r;
assign core_rsp_tmask_out = core_rsp_tmask_in;
assign core_rsp_ready_in = core_rsp_ready_out;
if (D != 0) begin
// select the requested word out of the memory line
wire [D-1:0] rsp_addr_idx = mem_rsp_tag_in[CORE_TAG_IN_WIDTH +: D];
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_out[i] = core_rsp_valid_in[i] ?
core_rsp_data_in[i] : mem_rsp_data_in[rsp_addr_idx * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end
end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_out[i] = core_rsp_valid_in[i] ? core_rsp_data_in[i] : mem_rsp_data_in;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_out_c[i] : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0];
end
end else begin
// single shared response channel
assign core_rsp_valid_out = core_rsp_valid_in || is_mem_rsp_nc;
assign core_rsp_tag_out = core_rsp_valid_in ? core_rsp_tag_out_c : mem_rsp_tag_in[CORE_TAG_IN_WIDTH-1:0];
assign core_rsp_ready_in = core_rsp_ready_out;
if (NUM_REQS > 1) begin
// a bypass response targets exactly one requester: build a one-hot mask
wire [CORE_REQ_TIDW-1:0] rsp_tid = mem_rsp_tag_in[(CORE_TAG_IN_WIDTH + D) +: CORE_REQ_TIDW];
reg [NUM_REQS-1:0] core_rsp_tmask_in_r;
always @(*) begin
core_rsp_tmask_in_r = 0;
core_rsp_tmask_in_r[rsp_tid] = 1;
end
assign core_rsp_tmask_out = core_rsp_valid_in ? core_rsp_tmask_in : core_rsp_tmask_in_r;
end else begin
assign core_rsp_tmask_out = core_rsp_tmask_in || is_mem_rsp_nc;
end
if (D != 0) begin
// select the requested word out of the memory line
wire [D-1:0] rsp_addr_idx = mem_rsp_tag_in[CORE_TAG_IN_WIDTH +: D];
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_out[i] = core_rsp_valid_in ?
core_rsp_data_in[i] : mem_rsp_data_in[rsp_addr_idx * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end
end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_out[i] = core_rsp_valid_in ? core_rsp_data_in[i] : mem_rsp_data_in;
end
end
end
// memory response handling
// only responses without the bypass flag are forwarded to the cache side
assign mem_rsp_valid_out = mem_rsp_valid_in && ~mem_rsp_tag_in[NC_TAG_BIT];
assign mem_rsp_data_out = mem_rsp_data_in;
VX_bits_remove #(
.N (MEM_TAG_IN_WIDTH+1),
.S (1),
.POS (NC_TAG_BIT)
) mem_rsp_tag_remove (
.data_in (mem_rsp_tag_in[(MEM_TAG_IN_WIDTH+1)-1:0]),
.data_out (mem_rsp_tag_out)
);
// A bypass response is accepted only when the targeted core response channel
// is not used by the cache side and the core is ready; other responses
// follow the cache-side ready.
if (NUM_RSP_TAGS > 1) begin
wire [CORE_REQ_TIDW-1:0] rsp_tid = mem_rsp_tag_in[(CORE_TAG_IN_WIDTH + D) +: CORE_REQ_TIDW];
assign mem_rsp_ready_in = is_mem_rsp_nc ? (~core_rsp_valid_in[rsp_tid] && core_rsp_ready_out[rsp_tid]) : mem_rsp_ready_out;
end else begin
assign mem_rsp_ready_in = is_mem_rsp_nc ? (~core_rsp_valid_in && core_rsp_ready_out) : mem_rsp_ready_out;
end
endmodule

View File

@@ -1,371 +0,0 @@
`include "VX_cache_define.vh"
// Banked shared memory with a core request / response interface.
//
// Each line holds exactly one word (CACHE_LINE_SIZE = WORD_SIZE), and every
// bank is a single-port RAM. Incoming requests are distributed over the banks
// by VX_core_req_bank_sel and buffered as one batch. A batch that contains no
// read leaves the buffer immediately; a batch with reads leaves once every
// reading bank has been packed into a response. Responses are grouped by tag
// id: each response carries all reading banks that share the selected tag.
module VX_shared_mem #(
    parameter CACHE_ID          = 0,

    // Size of cache in bytes
    parameter CACHE_SIZE        = (1024*16),
    // Number of banks
    parameter NUM_BANKS         = 2,
    // Size of a word in bytes
    parameter WORD_SIZE         = 4,
    // Number of Word requests per cycle
    parameter NUM_REQS          = 4,

    // Core Request Queue Size
    parameter CREQ_SIZE         = 2,
    // Core Response Queue Size
    parameter CRSQ_SIZE         = 2,

    // size of tag id in core request tag
    parameter CORE_TAG_ID_BITS  = 8,
    // core request tag size
    parameter CORE_TAG_WIDTH    = (2 + CORE_TAG_ID_BITS),

    // bank offset from beginning of index range
    parameter BANK_ADDR_OFFSET  = `CLOG2(256)
) (
    input wire clk,
    input wire reset,

    // PERF
`ifdef PERF_ENABLE
    VX_perf_cache_if.master perf_cache_if,
`endif

    // Core request
    input wire [NUM_REQS-1:0]                       core_req_valid,
    input wire [NUM_REQS-1:0]                       core_req_rw,
    input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr,
    input wire [NUM_REQS-1:0][WORD_SIZE-1:0]        core_req_byteen,
    input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0]      core_req_data,
    input wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0]   core_req_tag,
    output wire [NUM_REQS-1:0]                      core_req_ready,

    // Core response
    output wire                                     core_rsp_valid,
    output wire [NUM_REQS-1:0]                      core_rsp_tmask,
    output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0]     core_rsp_data,
    output wire [CORE_TAG_WIDTH-1:0]                core_rsp_tag,
    input wire                                      core_rsp_ready
);
    `STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid value"))
    `UNUSED_PARAM (CACHE_ID)
    `UNUSED_PARAM (CORE_TAG_ID_BITS)

    // one word per line
    localparam CACHE_LINE_SIZE = WORD_SIZE;

    // Per-bank request selected by the bank selector, before buffering.
    wire [NUM_BANKS-1:0]                        per_bank_core_req_valid_unqual;
    wire [NUM_BANKS-1:0]                        per_bank_core_req_rw_unqual;
    wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0]  per_bank_core_req_addr_unqual;
    wire [NUM_BANKS-1:0][WORD_SIZE-1:0]         per_bank_core_req_byteen_unqual;
    wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0]       per_bank_core_req_data_unqual;
    wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0]    per_bank_core_req_tag_unqual;
    wire [NUM_BANKS-1:0][`REQS_BITS-1:0]        per_bank_core_req_tid_unqual;
    wire [NUM_BANKS-1:0]                        per_bank_core_req_ready_unqual;

    VX_core_req_bank_sel #(
        .CACHE_ID        (CACHE_ID),
        .CACHE_LINE_SIZE (WORD_SIZE),
        .NUM_BANKS       (NUM_BANKS),
        .NUM_PORTS       (1),
        .WORD_SIZE       (WORD_SIZE),
        .NUM_REQS        (NUM_REQS),
        .CORE_TAG_WIDTH  (CORE_TAG_WIDTH),
        .BANK_ADDR_OFFSET(BANK_ADDR_OFFSET)
    ) core_req_bank_sel (
        .clk        (clk),
        .reset      (reset),
    `ifdef PERF_ENABLE
        .bank_stalls(perf_cache_if.bank_stalls),
    `endif
        .core_req_valid          (core_req_valid),
        .core_req_rw             (core_req_rw),
        .core_req_addr           (core_req_addr),
        .core_req_byteen         (core_req_byteen),
        .core_req_data           (core_req_data),
        .core_req_tag            (core_req_tag),
        .core_req_ready          (core_req_ready),
        .per_bank_core_req_valid (per_bank_core_req_valid_unqual),
        .per_bank_core_req_tid   (per_bank_core_req_tid_unqual),
        .per_bank_core_req_rw    (per_bank_core_req_rw_unqual),
        .per_bank_core_req_addr  (per_bank_core_req_addr_unqual),
        .per_bank_core_req_byteen(per_bank_core_req_byteen_unqual),
        .per_bank_core_req_tag   (per_bank_core_req_tag_unqual),
        .per_bank_core_req_data  (per_bank_core_req_data_unqual),
        .per_bank_core_req_ready (per_bank_core_req_ready_unqual),
        `UNUSED_PIN (per_bank_core_req_pmask),
        `UNUSED_PIN (per_bank_core_req_wsel)
    );

    // Per-bank request at the output of the request buffer.
    wire [NUM_BANKS-1:0]                        per_bank_core_req_valid;
    wire [NUM_BANKS-1:0]                        per_bank_core_req_rw;
    wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0]  per_bank_core_req_addr;
    wire [NUM_BANKS-1:0][WORD_SIZE-1:0]         per_bank_core_req_byteen;
    wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0]       per_bank_core_req_data;
    wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0]    per_bank_core_req_tag;
    wire [NUM_BANKS-1:0][`REQS_BITS-1:0]        per_bank_core_req_tid;

    wire creq_out_valid, creq_out_ready;
    wire creq_in_valid, creq_in_ready;

    wire creq_in_fire = creq_in_valid && creq_in_ready;
    `UNUSED_VAR (creq_in_fire)

    wire creq_out_fire = creq_out_valid && creq_out_ready;
    `UNUSED_VAR (creq_out_fire)

    // All banks are pushed into the buffer together as one batch.
    assign creq_in_valid = (| core_req_valid);
    assign per_bank_core_req_ready_unqual = {NUM_BANKS{creq_in_ready}};

    // Banks carrying a read, and a flag telling that the batch has no read.
    wire [NUM_BANKS-1:0] core_req_read_mask, core_req_read_mask_unqual;
    wire core_req_writeonly, core_req_writeonly_unqual;

    assign core_req_read_mask_unqual = per_bank_core_req_valid_unqual & ~per_bank_core_req_rw_unqual;
    assign core_req_writeonly_unqual = ~(| core_req_read_mask_unqual);

    VX_elastic_buffer #(
        .DATAW   (NUM_BANKS * (1 + 1 + `LINE_ADDR_WIDTH + WORD_SIZE + `WORD_WIDTH + CORE_TAG_WIDTH + `REQS_BITS) + NUM_BANKS + 1),
        .SIZE    (CREQ_SIZE),
        .OUT_REG (1)    // output should be registered for the data_store addr port
    ) core_req_queue (
        .clk       (clk),
        .reset     (reset),
        .ready_in  (creq_in_ready),
        .valid_in  (creq_in_valid),
        .data_in   ({per_bank_core_req_valid_unqual,
                     per_bank_core_req_rw_unqual,
                     per_bank_core_req_addr_unqual,
                     per_bank_core_req_byteen_unqual,
                     per_bank_core_req_data_unqual,
                     per_bank_core_req_tag_unqual,
                     per_bank_core_req_tid_unqual,
                     core_req_read_mask_unqual,
                     core_req_writeonly_unqual}),
        .data_out  ({per_bank_core_req_valid,
                     per_bank_core_req_rw,
                     per_bank_core_req_addr,
                     per_bank_core_req_byteen,
                     per_bank_core_req_data,
                     per_bank_core_req_tag,
                     per_bank_core_req_tid,
                     core_req_read_mask,
                     core_req_writeonly}),
        .ready_out (creq_out_ready),
        .valid_out (creq_out_valid)
    );

    wire crsq_in_valid, crsq_in_ready;
    wire crsq_last_read;

    // A write-only batch retires at once; a batch with reads retires when its
    // last group of reads is accepted by the response buffer.
    assign creq_out_ready = core_req_writeonly
                         || (crsq_in_ready && crsq_last_read);

    wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data;

    // One single-port RAM per bank.
    // NOTE(review): wren is not qualified with creq_out_valid; confirm that the
    // buffer's data_out is benign while its valid_out is low.
    for (genvar i = 0; i < NUM_BANKS; i++) begin
        wire [WORD_SIZE-1:0] wren = per_bank_core_req_byteen[i]
                                  & {WORD_SIZE{per_bank_core_req_valid[i]
                                            && per_bank_core_req_rw[i]}};

        wire [`LINE_SELECT_BITS-1:0] addr = per_bank_core_req_addr[i][`LINE_SELECT_BITS-1:0];

        VX_sp_ram #(
            .DATAW      (`WORD_WIDTH),
            .SIZE       (`LINES_PER_BANK),
            .BYTEENW    (WORD_SIZE),
            .NO_RWCHECK (1)
        ) data_store (
            .clk   (clk),
            .addr  (addr),
            .wren  (wren),
            .wdata (per_bank_core_req_data[i]),
            .rdata (per_bank_core_rsp_data[i])
        );
    end

    // The core response bus handles a single tag at the time
    // We first need to select the current tag to process,
    // then send all bank responses for that tag as a batch

    reg [NUM_REQS-1:0]                  core_rsp_valids_in;
    reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_in;
    wire [CORE_TAG_WIDTH-1:0]           core_rsp_tag_in;

    // Banks already answered for the current batch (registered / next).
    reg [NUM_BANKS-1:0] bank_rsp_sel_r, bank_rsp_sel_n;

    wire crsq_in_fire = crsq_in_valid && crsq_in_ready;

    // True when this response covers every remaining reading bank.
    assign crsq_last_read = (bank_rsp_sel_n == core_req_read_mask);

    always @(posedge clk) begin
        if (reset) begin
            bank_rsp_sel_r <= 0;
        end else begin
            if (crsq_in_fire) begin
                if (crsq_last_read) begin
                    bank_rsp_sel_r <= 0;
                end else begin
                    bank_rsp_sel_r <= bank_rsp_sel_n;
                end
            end
        end
    end

    // Tag of a reading bank that has not been answered yet.
    // NOTE(review): presumably VX_find_first returns the data of the first
    // valid input; confirm against VX_find_first.
    VX_find_first #(
        .N     (NUM_BANKS),
        .DATAW (CORE_TAG_WIDTH)
    ) find_first (
        .valid_i (core_req_read_mask & ~bank_rsp_sel_r),
        .data_i  (per_bank_core_req_tag),
        .data_o  (core_rsp_tag_in),
        `UNUSED_PIN (valid_o)
    );

    // Gather every reading bank whose tag id equals the selected one and
    // route its read data to the lane given by the bank's requester index.
    always @(*) begin
        core_rsp_valids_in = 0;
        core_rsp_data_in   = 'x;
        bank_rsp_sel_n     = bank_rsp_sel_r;
        for (integer i = 0; i < NUM_BANKS; i++) begin
            if (core_req_read_mask[i]
             && (core_rsp_tag_in[CORE_TAG_ID_BITS-1:0] == per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin
                core_rsp_valids_in[per_bank_core_req_tid[i]] = 1;
                core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i];
                bank_rsp_sel_n[i] = 1;
            end
        end
    end

    assign crsq_in_valid = creq_out_valid && ~core_req_writeonly;

    // Response buffer.
    // DATAW is sized with NUM_REQS because the payload is per requester lane:
    // core_rsp_valids_in is NUM_REQS bits and core_rsp_data_in is NUM_REQS
    // words. Sizing it with NUM_BANKS drops the upper lanes whenever
    // NUM_BANKS < NUM_REQS, which the static assertion above allows.
    VX_elastic_buffer #(
        .DATAW (NUM_REQS * (1 + `WORD_WIDTH) + CORE_TAG_WIDTH),
        .SIZE  (CRSQ_SIZE)
    ) core_rsp_req (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (crsq_in_valid),
        .data_in   ({core_rsp_valids_in, core_rsp_data_in, core_rsp_tag_in}),
        .ready_in  (crsq_in_ready),
        .valid_out (core_rsp_valid),
        .data_out  ({core_rsp_tmask, core_rsp_data, core_rsp_tag}),
        .ready_out (core_rsp_ready)
    );

    // Request ids extracted from the tags; only consumed by the debug trace.
`IGNORE_UNUSED_BEGIN
    wire [NUM_BANKS-1:0][`DBG_CACHE_REQ_IDW-1:0] req_id_st0, req_id_st1;
`IGNORE_UNUSED_END

    for (genvar i = 0; i < NUM_BANKS; ++i) begin
        if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
            assign req_id_st0[i] = per_bank_core_req_tag_unqual[i][`CACHE_REQ_ID_RNG];
            assign req_id_st1[i] = per_bank_core_req_tag[i][`CACHE_REQ_ID_RNG];
        end else begin
            assign req_id_st0[i] = 0;
            assign req_id_st1[i] = 0;
        end
    end

`ifdef DBG_TRACE_CACHE_BANK

    reg is_multi_tag_req;
`IGNORE_UNUSED_BEGIN
    reg [CORE_TAG_WIDTH-1:0] core_req_tag_sel;
`IGNORE_UNUSED_END

    VX_find_first #(
        .N     (NUM_BANKS),
        .DATAW (CORE_TAG_WIDTH)
    ) find_first_d (
        .valid_i (per_bank_core_req_valid),
        .data_i  (per_bank_core_req_tag),
        .data_o  (core_req_tag_sel),
        `UNUSED_PIN (valid_o)
    );

    // Flags a buffered batch whose valid banks do not all share one tag id.
    always @(*) begin
        is_multi_tag_req = 0;
        for (integer i = 0; i < NUM_BANKS; ++i) begin
            if (per_bank_core_req_valid[i]
             && (core_req_tag_sel[CORE_TAG_ID_BITS-1:0] != per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin
                is_multi_tag_req = creq_out_valid;
            end
        end
    end

    always @(posedge clk) begin
        if (!crsq_in_ready) begin
            dpi_trace("%d: *** cache%0d pipeline-stall\n", $time, CACHE_ID);
        end
        if (is_multi_tag_req) begin
            dpi_trace("%d: *** cache%0d multi-tag request!\n", $time, CACHE_ID);
        end
        if (creq_in_fire) begin
            for (integer i = 0; i < NUM_BANKS; ++i) begin
                if (per_bank_core_req_valid_unqual[i]) begin
                    if (per_bank_core_req_rw_unqual[i]) begin
                        dpi_trace("%d: smem%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h (#%0d)\n",
                            $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], req_id_st0[i]);
                    end else begin
                        dpi_trace("%d: smem%0d:%0d core-rd-req: addr=%0h, tag=%0h (#%0d)\n",
                            $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], req_id_st0[i]);
                    end
                end
            end
        end
        if (creq_out_fire) begin
            for (integer i = 0; i < NUM_BANKS; ++i) begin
                if (per_bank_core_req_valid[i]) begin
                    if (per_bank_core_req_rw[i]) begin
                        dpi_trace("%d: smem%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, data=%0h (#%0d)\n",
                            $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_data[i], req_id_st1[i]);
                    end else begin
                        dpi_trace("%d: smem%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, data=%0h (#%0d)\n",
                            $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_rsp_data[i], req_id_st1[i]);
                    end
                end
            end
        end
    end
`endif

`ifdef PERF_ENABLE
    // per cycle: core_reads, core_writes
    wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
    wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;

    wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw;
    wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw;

    `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask);
    `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask);

    wire perf_crsp_stall_per_cycle = core_rsp_valid & ~core_rsp_ready;

    reg [`PERF_CTR_BITS-1:0] perf_core_reads;
    reg [`PERF_CTR_BITS-1:0] perf_core_writes;
    reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;

    always @(posedge clk) begin
        if (reset) begin
            perf_core_reads  <= 0;
            perf_core_writes <= 0;
            perf_crsp_stalls <= 0;
        end else begin
            perf_core_reads  <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
            perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
            perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
        end
    end

    // counters that this module does not produce are tied to zero
    assign perf_cache_if.reads        = perf_core_reads;
    assign perf_cache_if.writes       = perf_core_writes;
    assign perf_cache_if.read_misses  = '0;
    assign perf_cache_if.write_misses = '0;
    assign perf_cache_if.mshr_stalls  = '0;
    assign perf_cache_if.mem_stalls   = '0;
    assign perf_cache_if.crsp_stalls  = perf_crsp_stalls;
`endif

endmodule

View File

@@ -1,79 +0,0 @@
`include "VX_cache_define.vh"
// Tag store of one cache bank.
// Every line has a stored tag plus a valid bit. A fill writes the tag of the
// current address with the valid bit set, a flush writes it with the valid
// bit cleared. tag_match reports whether the stored entry at the addressed
// line is valid and carries the same tag as the current address.
module VX_tag_access #(
    parameter CACHE_ID          = 0,
    parameter BANK_ID           = 0,
    // Size of cache in bytes
    parameter CACHE_SIZE        = 1,
    // Size of line inside a bank in bytes
    parameter CACHE_LINE_SIZE   = 1,
    // Number of banks
    parameter NUM_BANKS         = 1,
    // Size of a word in bytes
    parameter WORD_SIZE         = 1,
    // bank offset from beginning of index range
    parameter BANK_ADDR_OFFSET  = 0
) (
    input wire                          clk,
    input wire                          reset,

`IGNORE_UNUSED_BEGIN
    // request id, only consumed by the debug trace
    input wire[`DBG_CACHE_REQ_IDW-1:0]  req_id,
`IGNORE_UNUSED_END

    input wire                          stall,

    // read/fill
    input wire                          lookup,
    input wire[`LINE_ADDR_WIDTH-1:0]    addr,
    input wire                          fill,
    input wire                          flush,
    output wire                         tag_match
);
    `UNUSED_PARAM (CACHE_ID)
    `UNUSED_PARAM (BANK_ID)
    `UNUSED_VAR (reset)
    `UNUSED_VAR (lookup)
    `UNUSED_VAR (stall)

    // Split the line address into the line index and the tag to compare.
    wire [`LINE_SELECT_BITS-1:0] set_idx = addr[`LINE_SELECT_BITS-1:0];
    wire [`TAG_SELECT_BITS-1:0]  req_tag = `LINE_TAG_ADDR(addr);

    // Entry currently read from the store.
    wire [`TAG_SELECT_BITS-1:0]  stored_tag;
    wire                         stored_valid;

    // Both fill and flush write the store; flush writes an invalid entry.
    wire                         store_wren  = fill | flush;
    wire [`TAG_SELECT_BITS:0]    store_wdata = {~flush, req_tag};

    VX_sp_ram #(
        .DATAW      (`TAG_SELECT_BITS + 1),
        .SIZE       (`LINES_PER_BANK),
        .NO_RWCHECK (1)
    ) tag_store (
        .clk   (clk),
        .addr  (set_idx),
        .wren  (store_wren),
        .wdata (store_wdata),
        .rdata ({stored_valid, stored_tag})
    );

    assign tag_match = stored_valid & (req_tag == stored_tag);

`ifdef DBG_TRACE_CACHE_TAG
    always @(posedge clk) begin
        if (fill && ~stall) begin
            dpi_trace("%d: cache%0d:%0d tag-fill: addr=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), set_idx, req_tag);
        end
        if (flush) begin
            dpi_trace("%d: cache%0d:%0d tag-flush: addr=%0h, blk_addr=%0d\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), set_idx);
        end
        if (lookup && ~stall) begin
            if (tag_match) begin
                dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, blk_addr=%0d, tag_id=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), set_idx, req_tag, req_id);
            end else begin
                dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), set_idx, req_tag, stored_tag, req_id);
            end
        end
    end
`endif

endmodule

172
hw/rtl/core/VX_alu_unit.sv Normal file
View File

@@ -0,0 +1,172 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// ALU execute unit.
//
// Requests arriving on dispatch_if go through a VX_dispatch_unit onto
// BLOCK_SIZE execute interfaces. Inside each block the request is steered to
// VX_int_unit or, when EXT_M_ENABLE is defined and the op is flagged by
// `INST_ALU_IS_M, to VX_muldiv_unit. The responses of both units are merged by
// a VX_stream_arb, and the per-block results are combined by a VX_gather_unit
// onto commit_if.
module VX_alu_unit #(
    parameter CORE_ID = 0
) (
    input wire              clk,
    input wire              reset,

    // Inputs
    VX_dispatch_if.slave    dispatch_if [`ISSUE_WIDTH],

    // Outputs
    VX_commit_if.master     commit_if [`ISSUE_WIDTH],
    VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS]
);
    `UNUSED_PARAM (CORE_ID)

    localparam BLOCK_SIZE    = `NUM_ALU_BLOCKS;
    localparam NUM_LANES     = `NUM_ALU_LANES;
    localparam PID_BITS      = `CLOG2(`NUM_THREADS / NUM_LANES);
    localparam PID_WIDTH     = `UP(PID_BITS);
    localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
    // one arbiter input for the integer unit, plus one for mul/div when enabled
    localparam RSP_ARB_SIZE  = 1 + `EXT_M_ENABLED;
    // set when the block count differs from the issue width or the lane count
    // differs from the thread count
    localparam PARTIAL_BW    = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);

    // OUT_REG settings handed to the sub-modules; all depend on PARTIAL_BW
    localparam DISPATCH_OUT_REG = PARTIAL_BW ? 1 : 0;
    localparam RSP_ARB_OUT_REG  = PARTIAL_BW ? 1 : 3;
    localparam GATHER_OUT_REG   = PARTIAL_BW ? 3 : 0;

    // per-block request and response interfaces
    VX_execute_if #(
        .NUM_LANES (NUM_LANES)
    ) execute_if[BLOCK_SIZE]();

    VX_commit_if #(
        .NUM_LANES (NUM_LANES)
    ) commit_block_if[BLOCK_SIZE]();

    `RESET_RELAY (dispatch_reset, reset);

    VX_dispatch_unit #(
        .BLOCK_SIZE (BLOCK_SIZE),
        .NUM_LANES  (NUM_LANES),
        .OUT_REG    (DISPATCH_OUT_REG)
    ) dispatch_unit (
        .clk        (clk),
        .reset      (dispatch_reset),
        .dispatch_if(dispatch_if),
        .execute_if (execute_if)
    );

    for (genvar blk = 0; blk < BLOCK_SIZE; ++blk) begin

        // high when the current request must go to the mul/div unit
        wire is_muldiv_op;

        VX_execute_if #(
            .NUM_LANES (NUM_LANES)
        ) int_execute_if();

        VX_commit_if #(
            .NUM_LANES (NUM_LANES)
        ) int_commit_if();

        // the integer unit sees every request that is not a mul/div op
        assign int_execute_if.data  = execute_if[blk].data;
        assign int_execute_if.valid = execute_if[blk].valid && ~is_muldiv_op;

        `RESET_RELAY (int_reset, reset);

        VX_int_unit #(
            .CORE_ID   (CORE_ID),
            .BLOCK_IDX (blk),
            .NUM_LANES (NUM_LANES)
        ) int_unit (
            .clk           (clk),
            .reset         (int_reset),
            .execute_if    (int_execute_if),
            .branch_ctl_if (branch_ctl_if[blk]),
            .commit_if     (int_commit_if)
        );

    `ifdef EXT_M_ENABLE

        assign is_muldiv_op = `INST_ALU_IS_M(execute_if[blk].data.op_mod);

        `RESET_RELAY (mdv_reset, reset);

        VX_execute_if #(
            .NUM_LANES (NUM_LANES)
        ) mdv_execute_if();

        VX_commit_if #(
            .NUM_LANES (NUM_LANES)
        ) mdv_commit_if();

        assign mdv_execute_if.data  = execute_if[blk].data;
        assign mdv_execute_if.valid = execute_if[blk].valid && is_muldiv_op;

        VX_muldiv_unit #(
            .CORE_ID   (CORE_ID),
            .NUM_LANES (NUM_LANES)
        ) mdv_unit (
            .clk        (clk),
            .reset      (mdv_reset),
            .execute_if (mdv_execute_if),
            .commit_if  (mdv_commit_if)
        );

        // back-pressure comes from whichever unit the request targets
        assign execute_if[blk].ready = is_muldiv_op ? mdv_execute_if.ready : int_execute_if.ready;

    `else

        assign is_muldiv_op = 0;
        assign execute_if[blk].ready = int_execute_if.ready;

    `endif

        // merge the unit responses into this block's commit stream
        VX_stream_arb #(
            .NUM_INPUTS (RSP_ARB_SIZE),
            .DATAW      (RSP_ARB_DATAW),
            .OUT_REG    (RSP_ARB_OUT_REG)
        ) rsp_arb (
            .clk       (clk),
            .reset     (reset),
            .valid_in  ({
            `ifdef EXT_M_ENABLE
                mdv_commit_if.valid,
            `endif
                int_commit_if.valid
            }),
            .ready_in  ({
            `ifdef EXT_M_ENABLE
                mdv_commit_if.ready,
            `endif
                int_commit_if.ready
            }),
            .data_in   ({
            `ifdef EXT_M_ENABLE
                mdv_commit_if.data,
            `endif
                int_commit_if.data
            }),
            .data_out  (commit_block_if[blk].data),
            .valid_out (commit_block_if[blk].valid),
            .ready_out (commit_block_if[blk].ready),
            `UNUSED_PIN (sel_out)
        );
    end

    `RESET_RELAY (commit_reset, reset);

    // combine the per-block commit streams onto the unit's commit outputs
    VX_gather_unit #(
        .BLOCK_SIZE (BLOCK_SIZE),
        .NUM_LANES  (NUM_LANES),
        .OUT_REG    (GATHER_OUT_REG)
    ) gather_unit (
        .clk           (clk),
        .reset         (commit_reset),
        .commit_in_if  (commit_block_if),
        .commit_out_if (commit_if)
    );

endmodule

226
hw/rtl/core/VX_commit.sv Normal file
View File

@@ -0,0 +1,226 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Commit stage.
//
// For every issue slot the commit streams of the execute units (SFU, optional
// FPU, ALU, LSU) are arbitrated into one stream. That stream
//   - drives writeback_if when the committed entry has `wb` set,
//   - adds the number of active thread-mask bits of fired commits to the
//     `instret` counter exposed on commit_csr_if (through a two-stage pipeline),
//   - reports (fire & eop, wid) per slot on commit_sched_if, one register later.
module VX_commit import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0
) (
    input wire                clk,
    input wire                reset,

    // inputs
    VX_commit_if.slave        alu_commit_if [`ISSUE_WIDTH],
    VX_commit_if.slave        lsu_commit_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
    VX_commit_if.slave        fpu_commit_if [`ISSUE_WIDTH],
`endif
    VX_commit_if.slave        sfu_commit_if [`ISSUE_WIDTH],

    // outputs
    VX_writeback_if.master    writeback_if [`ISSUE_WIDTH],
    VX_commit_csr_if.master   commit_csr_if,
    VX_commit_sched_if.master commit_sched_if,

    // simulation helper signals
    output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value
);
    `UNUSED_PARAM (CORE_ID)

    localparam DATAW            = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1;
    localparam COMMIT_SIZEW     = `CLOG2(`NUM_THREADS + 1);
    localparam COMMIT_ALL_SIZEW = COMMIT_SIZEW + `ISSUE_WIDTH - 1;

    // payload widths of the pipeline registers used below
    localparam SIZE_REG1_DATAW  = 1 + `ISSUE_WIDTH * COMMIT_SIZEW;
    localparam SIZE_REG2_DATAW  = 1 + COMMIT_ALL_SIZEW;
    localparam COMMITTED_DATAW  = `ISSUE_WIDTH * (1 + `NW_WIDTH);

    // commit arbitration

    VX_commit_if commit_if[`ISSUE_WIDTH]();

    wire [`ISSUE_WIDTH-1:0]                   commit_fire;
    wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0]    commit_wid;
    wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
    wire [`ISSUE_WIDTH-1:0]                   commit_eop;

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin

        `RESET_RELAY (arb_reset, reset);

        VX_stream_arb #(
            .NUM_INPUTS (`NUM_EX_UNITS),
            .DATAW      (DATAW),
            .ARBITER    ("R"),
            .OUT_REG    (1)
        ) commit_arb (
            .clk       (clk),
            .reset     (arb_reset),
            .valid_in  ({
                sfu_commit_if[i].valid,
            `ifdef EXT_F_ENABLE
                fpu_commit_if[i].valid,
            `endif
                alu_commit_if[i].valid,
                lsu_commit_if[i].valid
            }),
            .ready_in  ({
                sfu_commit_if[i].ready,
            `ifdef EXT_F_ENABLE
                fpu_commit_if[i].ready,
            `endif
                alu_commit_if[i].ready,
                lsu_commit_if[i].ready
            }),
            .data_in   ({
                sfu_commit_if[i].data,
            `ifdef EXT_F_ENABLE
                fpu_commit_if[i].data,
            `endif
                alu_commit_if[i].data,
                lsu_commit_if[i].data
            }),
            .data_out  (commit_if[i].data),
            .valid_out (commit_if[i].valid),
            .ready_out (commit_if[i].ready),
            `UNUSED_PIN (sel_out)
        );

        assign commit_fire[i]  = commit_if[i].valid && commit_if[i].ready;
        assign commit_wid[i]   = commit_if[i].data.wid;
        assign commit_eop[i]   = commit_if[i].data.eop;
        // thread mask is forced to zero on cycles without a fire
        assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
    end

    // CSRs update

    wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
    wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r;
    wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;

    assign commit_fire_any = (| commit_fire);

    // per-slot count of set bits in the gated thread mask
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        wire [COMMIT_SIZEW-1:0] pop_count;
        `POP_COUNT(pop_count, commit_tmask[i]);
        assign commit_size[i] = pop_count;
    end

    // stage 1: register the per-slot counts
    VX_pipe_register #(
        .DATAW  (SIZE_REG1_DATAW),
        .RESETW (1)
    ) commit_size_reg1 (
        .clk      (clk),
        .reset    (reset),
        .enable   (1'b1),
        .data_in  ({commit_fire_any, commit_size}),
        .data_out ({commit_fire_any_r, commit_size_r})
    );

    // sum the registered per-slot counts
    VX_reduce #(
        .DATAW_IN  (COMMIT_SIZEW),
        .DATAW_OUT (COMMIT_ALL_SIZEW),
        .N         (`ISSUE_WIDTH),
        .OP        ("+")
    ) commit_size_reduce (
        .data_in  (commit_size_r),
        .data_out (commit_size_all)
    );

    // stage 2: register the total
    VX_pipe_register #(
        .DATAW  (SIZE_REG2_DATAW),
        .RESETW (1)
    ) commit_size_reg2 (
        .clk      (clk),
        .reset    (reset),
        .enable   (1'b1),
        .data_in  ({commit_fire_any_r, commit_size_all}),
        .data_out ({commit_fire_any_rr, commit_size_all_r})
    );

    reg [`PERF_CTR_BITS-1:0] instret;

    always @(posedge clk) begin
        if (reset) begin
            instret <= '0;
        end else begin
            if (commit_fire_any_rr) begin
                instret <= instret + `PERF_CTR_BITS'(commit_size_all_r);
            end
        end
    end

    assign commit_csr_if.instret = instret;

    // Committed instructions

    VX_pipe_register #(
        .DATAW  (COMMITTED_DATAW),
        .RESETW (`ISSUE_WIDTH)
    ) committed_pipe_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (1'b1),
        .data_in  ({(commit_fire & commit_eop), commit_wid}),
        .data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
    );

    // Writeback

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        // the arbitrated stream is always accepted
        assign commit_if[i].ready = 1'b1;

        assign writeback_if[i].valid      = commit_if[i].valid && commit_if[i].data.wb;
        assign writeback_if[i].data.uuid  = commit_if[i].data.uuid;
        assign writeback_if[i].data.wis   = wid_to_wis(commit_if[i].data.wid);
        assign writeback_if[i].data.PC    = commit_if[i].data.PC;
        assign writeback_if[i].data.tmask = commit_if[i].data.tmask;
        assign writeback_if[i].data.rd    = commit_if[i].data.rd;
        assign writeback_if[i].data.data  = commit_if[i].data.data;
        assign writeback_if[i].data.sop   = commit_if[i].data.sop;
        assign writeback_if[i].data.eop   = commit_if[i].data.eop;
    end

    // simulation helper signal to get RISC-V tests Pass/Fail status:
    // mirrors element 0 of the data written back on issue slot 0, indexed by rd
    reg [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value_r;

    always @(posedge clk) begin
        if (writeback_if[0].valid) begin
            sim_wb_value_r[writeback_if[0].data.rd] <= writeback_if[0].data.data[0];
        end
    end

    assign sim_wb_value = sim_wb_value_r;

`ifdef DBG_TRACE_CORE_PIPELINE
    // one trace record per unit handshake (valid && ready) on each issue slot
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        always @(posedge clk) begin
            if (alu_commit_if[i].valid && alu_commit_if[i].ready) begin
                `TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, alu_commit_if[i].data.wid, alu_commit_if[i].data.PC, alu_commit_if[i].data.tmask, alu_commit_if[i].data.wb, alu_commit_if[i].data.rd, alu_commit_if[i].data.sop, alu_commit_if[i].data.eop));
                `TRACE_ARRAY1D(1, alu_commit_if[i].data.data, `NUM_THREADS);
                `TRACE(1, (" (#%0d)\n", alu_commit_if[i].data.uuid));
            end
            if (lsu_commit_if[i].valid && lsu_commit_if[i].ready) begin
                `TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, lsu_commit_if[i].data.wid, lsu_commit_if[i].data.PC, lsu_commit_if[i].data.tmask, lsu_commit_if[i].data.wb, lsu_commit_if[i].data.rd, lsu_commit_if[i].data.sop, lsu_commit_if[i].data.eop));
                `TRACE_ARRAY1D(1, lsu_commit_if[i].data.data, `NUM_THREADS);
                `TRACE(1, (" (#%0d)\n", lsu_commit_if[i].data.uuid));
            end
        `ifdef EXT_F_ENABLE
            if (fpu_commit_if[i].valid && fpu_commit_if[i].ready) begin
                `TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, fpu_commit_if[i].data.wid, fpu_commit_if[i].data.PC, fpu_commit_if[i].data.tmask, fpu_commit_if[i].data.wb, fpu_commit_if[i].data.rd, fpu_commit_if[i].data.sop, fpu_commit_if[i].data.eop));
                `TRACE_ARRAY1D(1, fpu_commit_if[i].data.data, `NUM_THREADS);
                `TRACE(1, (" (#%0d)\n", fpu_commit_if[i].data.uuid));
            end
        `endif
            if (sfu_commit_if[i].valid && sfu_commit_if[i].ready) begin
                `TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=SFU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, sfu_commit_if[i].data.wid, sfu_commit_if[i].data.PC, sfu_commit_if[i].data.tmask, sfu_commit_if[i].data.wb, sfu_commit_if[i].data.rd, sfu_commit_if[i].data.sop, sfu_commit_if[i].data.eop));
                `TRACE_ARRAY1D(1, sfu_commit_if[i].data.data, `NUM_THREADS);
                `TRACE(1, (" (#%0d)\n", sfu_commit_if[i].data.uuid));
            end
        end
    end
`endif

endmodule

469
hw/rtl/core/VX_core.sv Normal file
View File

@@ -0,0 +1,469 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`ifdef EXT_F_ENABLE
`include "VX_fpu_define.vh"
`endif
// Single processor core.
//
// Instantiates and wires together the pipeline stages (schedule, fetch,
// decode, issue, execute, commit), the DCR register block and the shared
// memory unit that sits between the execute stage's data-cache ports and the
// external dcache_bus_if. When PERF_ENABLE is defined it also maintains the
// instruction-fetch / load / store counters and the pending-read based latency
// accumulators exported through pipeline_perf_if.
module VX_core import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_DECL

    // Clock
    input wire              clk,
    input wire              reset,

`ifdef PERF_ENABLE
    VX_mem_perf_if.slave    mem_perf_if,
`endif

    VX_dcr_bus_if.slave     dcr_bus_if,

    VX_mem_bus_if.master    dcache_bus_if [DCACHE_NUM_REQS],

    VX_mem_bus_if.master    icache_bus_if,

`ifdef GBAR_ENABLE
    VX_gbar_bus_if.master   gbar_bus_if,
`endif

    // simulation helper signals
    output wire             sim_ebreak,
    output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,

    // Status
    output wire             busy
);
    // inter-stage interfaces
    VX_schedule_if      schedule_if();
    VX_fetch_if         fetch_if();
    VX_decode_if        decode_if();
    VX_sched_csr_if     sched_csr_if();
    VX_decode_sched_if  decode_sched_if();
    VX_commit_sched_if  commit_sched_if();
    VX_commit_csr_if    commit_csr_if();
    VX_branch_ctl_if    branch_ctl_if[`NUM_ALU_BLOCKS]();
    VX_warp_ctl_if      warp_ctl_if();

    // per-execute-unit dispatch (issue -> execute) and commit (execute -> commit) links
    VX_dispatch_if      alu_dispatch_if[`ISSUE_WIDTH]();
    VX_commit_if        alu_commit_if[`ISSUE_WIDTH]();

    VX_dispatch_if      lsu_dispatch_if[`ISSUE_WIDTH]();
    VX_commit_if        lsu_commit_if[`ISSUE_WIDTH]();

`ifdef EXT_F_ENABLE
    VX_dispatch_if      fpu_dispatch_if[`ISSUE_WIDTH]();
    VX_commit_if        fpu_commit_if[`ISSUE_WIDTH]();
`endif

    VX_dispatch_if      sfu_dispatch_if[`ISSUE_WIDTH]();
    VX_commit_if        sfu_commit_if[`ISSUE_WIDTH]();

    VX_writeback_if     writeback_if[`ISSUE_WIDTH]();

    // data-cache ports as seen by the execute stage, before the shared memory unit
    VX_mem_bus_if #(
        .DATA_SIZE (DCACHE_WORD_SIZE),
        .TAG_WIDTH (DCACHE_TAG_WIDTH)
    ) dcache_bus_tmp_if[DCACHE_NUM_REQS]();

`ifdef PERF_ENABLE
    VX_mem_perf_if mem_perf_tmp_if();
    VX_pipeline_perf_if pipeline_perf_if();
`endif

    // each stage gets its own relayed copy of reset
    `RESET_RELAY (dcr_data_reset, reset);
    `RESET_RELAY (schedule_reset, reset);
    `RESET_RELAY (fetch_reset, reset);
    `RESET_RELAY (decode_reset, reset);
    `RESET_RELAY (issue_reset, reset);
    `RESET_RELAY (execute_reset, reset);
    `RESET_RELAY (commit_reset, reset);

    base_dcrs_t base_dcrs;

    VX_dcr_data dcr_data (
        .clk        (clk),
        .reset      (dcr_data_reset),
        .dcr_bus_if (dcr_bus_if),
        .base_dcrs  (base_dcrs)
    );

    `SCOPE_IO_SWITCH (3)

    VX_schedule #(
        .CORE_ID (CORE_ID)
    ) schedule (
        .clk            (clk),
        .reset          (schedule_reset),

        .base_dcrs      (base_dcrs),

        .warp_ctl_if    (warp_ctl_if),
        .branch_ctl_if  (branch_ctl_if),
        .decode_sched_if(decode_sched_if),
        .commit_sched_if(commit_sched_if),

        .schedule_if    (schedule_if),
    `ifdef GBAR_ENABLE
        .gbar_bus_if    (gbar_bus_if),
    `endif
        .sched_csr_if   (sched_csr_if),

        .busy           (busy)
    );

    VX_fetch #(
        .CORE_ID (CORE_ID)
    ) fetch (
        `SCOPE_IO_BIND  (0)
        .clk            (clk),
        .reset          (fetch_reset),
        .icache_bus_if  (icache_bus_if),
        .schedule_if    (schedule_if),
        .fetch_if       (fetch_if)
    );

    VX_decode #(
        .CORE_ID (CORE_ID)
    ) decode (
        .clk            (clk),
        .reset          (decode_reset),
        .fetch_if       (fetch_if),
        .decode_if      (decode_if),
        .decode_sched_if(decode_sched_if)
    );

    VX_issue #(
        .CORE_ID (CORE_ID)
    ) issue (
        `SCOPE_IO_BIND  (1)

        .clk            (clk),
        .reset          (issue_reset),

    `ifdef PERF_ENABLE
        .perf_issue_if  (pipeline_perf_if.issue),
    `endif

        .decode_if      (decode_if),
        .writeback_if   (writeback_if),

        .alu_dispatch_if(alu_dispatch_if),
        .lsu_dispatch_if(lsu_dispatch_if),
    `ifdef EXT_F_ENABLE
        .fpu_dispatch_if(fpu_dispatch_if),
    `endif
        .sfu_dispatch_if(sfu_dispatch_if)
    );

    VX_execute #(
        .CORE_ID (CORE_ID)
    ) execute (
        `SCOPE_IO_BIND  (2)

        .clk            (clk),
        .reset          (execute_reset),

        .base_dcrs      (base_dcrs),

    `ifdef PERF_ENABLE
        .mem_perf_if    (mem_perf_tmp_if),
        .pipeline_perf_if(pipeline_perf_if),
    `endif

        .dcache_bus_if  (dcache_bus_tmp_if),

    `ifdef EXT_F_ENABLE
        .fpu_dispatch_if(fpu_dispatch_if),
        .fpu_commit_if  (fpu_commit_if),
    `endif

        .commit_csr_if  (commit_csr_if),
        .sched_csr_if   (sched_csr_if),

        .alu_dispatch_if(alu_dispatch_if),
        .lsu_dispatch_if(lsu_dispatch_if),
        .sfu_dispatch_if(sfu_dispatch_if),

        .warp_ctl_if    (warp_ctl_if),
        .branch_ctl_if  (branch_ctl_if),

        .alu_commit_if  (alu_commit_if),
        .lsu_commit_if  (lsu_commit_if),
        .sfu_commit_if  (sfu_commit_if),

        .sim_ebreak     (sim_ebreak)
    );

    VX_commit #(
        .CORE_ID (CORE_ID)
    ) commit (
        .clk            (clk),
        .reset          (commit_reset),

        .alu_commit_if  (alu_commit_if),
        .lsu_commit_if  (lsu_commit_if),
    `ifdef EXT_F_ENABLE
        .fpu_commit_if  (fpu_commit_if),
    `endif
        .sfu_commit_if  (sfu_commit_if),

        .writeback_if   (writeback_if),

        .commit_csr_if  (commit_csr_if),
        .commit_sched_if(commit_sched_if),

        .sim_wb_value   (sim_wb_value)
    );

    VX_smem_unit #(
        .CORE_ID (CORE_ID)
    ) smem_unit (
        .clk                (clk),
        .reset              (reset),
    `ifdef PERF_ENABLE
        .mem_perf_in_if     (mem_perf_if),
        .mem_perf_out_if    (mem_perf_tmp_if),
    `endif
        .dcache_bus_in_if   (dcache_bus_tmp_if),
        .dcache_bus_out_if  (dcache_bus_if)
    );

`ifdef PERF_ENABLE

    wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
    wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
    wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;

    // Signed per-cycle change of the outstanding-read counts. These values are
    // sign-extended below, so each needs one bit more than its unsigned
    // operands. The icache delta is one of {-1, 0, +1} and therefore takes two
    // bits: a single-bit signed value can only encode 0 or -1, which would
    // turn a request fire (+1) into a decrement.
    wire [1:0] perf_icache_pending_read_cycle;
    wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;

    reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
    reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;

    reg [`PERF_CTR_BITS-1:0] perf_ifetches;
    reg [`PERF_CTR_BITS-1:0] perf_loads;
    reg [`PERF_CTR_BITS-1:0] perf_stores;

    wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready;
    wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready;

    wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_wr_req_fire, perf_dcache_rsp_fire;

    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
        assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
        assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
        assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
    end

    `POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire);
    `POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire);
    `POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);

    // the unsigned operands are zero-extended to the width of the target
    // before subtracting, so the result is a proper two's-complement delta
    assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
    assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;

    always @(posedge clk) begin
        if (reset) begin
            perf_icache_pending_reads <= '0;
            perf_dcache_pending_reads <= '0;
        end else begin
            perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
            perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
        end
    end

    reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
    reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;

    always @(posedge clk) begin
        if (reset) begin
            perf_ifetches   <= '0;
            perf_loads      <= '0;
            perf_stores     <= '0;
            perf_icache_lat <= '0;
            perf_dcache_lat <= '0;
        end else begin
            perf_ifetches   <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
            perf_loads      <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
            perf_stores     <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
            // latency accumulators: add the number of outstanding reads every cycle
            perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
            perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
        end
    end

    // each counter is driven exactly once
    assign pipeline_perf_if.ifetches = perf_ifetches;
    assign pipeline_perf_if.loads = perf_loads;
    assign pipeline_perf_if.stores = perf_stores;
    assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
    assign pipeline_perf_if.load_latency = perf_dcache_lat;

`endif

endmodule
///////////////////////////////////////////////////////////////////////////////
// Flat-port wrapper around VX_core.
//
// Exposes the core's DCR, data-cache, instruction-cache and (optional) global
// barrier interfaces as plain wire ports by instantiating the corresponding
// interface objects locally and assigning every field to/from a port.
module VX_core_top
    import VX_gpu_pkg::*;
#(
    parameter CORE_ID = 0
) (
    // Clock
    input wire                              clk,
    input wire                              reset,

    input wire                              dcr_write_valid,
    input wire [`VX_DCR_ADDR_WIDTH-1:0]     dcr_write_addr,
    input wire [`VX_DCR_DATA_WIDTH-1:0]     dcr_write_data,

    output wire [DCACHE_NUM_REQS-1:0]       dcache_req_valid,
    output wire [DCACHE_NUM_REQS-1:0]       dcache_req_rw,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_req_tag,
    input wire [DCACHE_NUM_REQS-1:0]        dcache_req_ready,

    input wire [DCACHE_NUM_REQS-1:0]        dcache_rsp_valid,
    input wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_rsp_data,
    input wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_rsp_tag,
    output wire [DCACHE_NUM_REQS-1:0]       dcache_rsp_ready,

    output wire                             icache_req_valid,
    output wire                             icache_req_rw,
    output wire [ICACHE_WORD_SIZE-1:0]      icache_req_byteen,
    output wire [ICACHE_ADDR_WIDTH-1:0]     icache_req_addr,
    output wire [ICACHE_WORD_SIZE*8-1:0]    icache_req_data,
    output wire [ICACHE_TAG_WIDTH-1:0]      icache_req_tag,
    input wire                              icache_req_ready,

    input wire                              icache_rsp_valid,
    input wire [ICACHE_WORD_SIZE*8-1:0]     icache_rsp_data,
    input wire [ICACHE_TAG_WIDTH-1:0]       icache_rsp_tag,
    output wire                             icache_rsp_ready,

`ifdef GBAR_ENABLE
    output wire                             gbar_req_valid,
    output wire [`NB_WIDTH-1:0]             gbar_req_id,
    output wire [`NC_WIDTH-1:0]             gbar_req_size_m1,
    output wire [`NC_WIDTH-1:0]             gbar_req_core_id,
    input wire                              gbar_req_ready,
    input wire                              gbar_rsp_valid,
    input wire [`NB_WIDTH-1:0]              gbar_rsp_id,
`endif

    // simulation helper signals
    output wire                             sim_ebreak,
    output wire [`NUM_REGS-1:0][`XLEN-1:0]  sim_wb_value,

    // Status
    output wire                             busy
);

`ifdef GBAR_ENABLE
    // global barrier interface <-> flat ports
    VX_gbar_bus_if gbar_bus_if();

    assign gbar_req_valid = gbar_bus_if.req_valid;
    assign gbar_req_id = gbar_bus_if.req_id;
    assign gbar_req_size_m1 = gbar_bus_if.req_size_m1;
    assign gbar_req_core_id = gbar_bus_if.req_core_id;
    assign gbar_bus_if.req_ready = gbar_req_ready;
    assign gbar_bus_if.rsp_valid = gbar_rsp_valid;
    assign gbar_bus_if.rsp_id = gbar_rsp_id;
`endif

    // DCR write port -> interface
    VX_dcr_bus_if dcr_bus_if();

    assign dcr_bus_if.write_valid = dcr_write_valid;
    assign dcr_bus_if.write_addr = dcr_write_addr;
    assign dcr_bus_if.write_data = dcr_write_data;

    // data-cache interfaces <-> flat ports, one per request lane
    VX_mem_bus_if #(
        .DATA_SIZE (DCACHE_WORD_SIZE),
        .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
    ) dcache_bus_if[DCACHE_NUM_REQS]();

    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
        assign dcache_req_valid[i] = dcache_bus_if[i].req_valid;
        assign dcache_req_rw[i] = dcache_bus_if[i].req_data.rw;
        assign dcache_req_byteen[i] = dcache_bus_if[i].req_data.byteen;
        assign dcache_req_addr[i] = dcache_bus_if[i].req_data.addr;
        assign dcache_req_data[i] = dcache_bus_if[i].req_data.data;
        assign dcache_req_tag[i] = dcache_bus_if[i].req_data.tag;
        assign dcache_bus_if[i].req_ready = dcache_req_ready[i];

        assign dcache_bus_if[i].rsp_valid = dcache_rsp_valid[i];
        assign dcache_bus_if[i].rsp_data.tag = dcache_rsp_tag[i];
        assign dcache_bus_if[i].rsp_data.data = dcache_rsp_data[i];
        assign dcache_rsp_ready[i] = dcache_bus_if[i].rsp_ready;
    end

    // instruction-cache interface <-> flat ports
    VX_mem_bus_if #(
        .DATA_SIZE (ICACHE_WORD_SIZE),
        .TAG_WIDTH (ICACHE_TAG_WIDTH)
    ) icache_bus_if();

    assign icache_req_valid = icache_bus_if.req_valid;
    assign icache_req_rw = icache_bus_if.req_data.rw;
    assign icache_req_byteen = icache_bus_if.req_data.byteen;
    assign icache_req_addr = icache_bus_if.req_data.addr;
    assign icache_req_data = icache_bus_if.req_data.data;
    assign icache_req_tag = icache_bus_if.req_data.tag;
    assign icache_bus_if.req_ready = icache_req_ready;

    assign icache_bus_if.rsp_valid = icache_rsp_valid;
    assign icache_bus_if.rsp_data.tag = icache_rsp_tag;
    assign icache_bus_if.rsp_data.data = icache_rsp_data;
    assign icache_rsp_ready = icache_bus_if.rsp_ready;

`ifdef PERF_ENABLE
    // NOTE(review): nothing in this wrapper drives mem_perf_if; confirm that
    // leaving the memory perf counters undriven here is intended.
    VX_mem_perf_if mem_perf_if();
`endif

`ifdef SCOPE
    // scope inputs are tied off; the scope output is left unconnected
    wire [0:0] scope_reset_w = 1'b0;
    wire [0:0] scope_bus_in_w = 1'b0;
    wire [0:0] scope_bus_out_w;
    `UNUSED_VAR (scope_bus_out_w)
`endif

    // Forward the wrapper's CORE_ID so that an override at instantiation time
    // reaches the core (the default remains 0).
    VX_core #(
        .CORE_ID (CORE_ID)
    ) core (
        `SCOPE_IO_BIND (0)
        .clk            (clk),
        .reset          (reset),

    `ifdef PERF_ENABLE
        .mem_perf_if    (mem_perf_if),
    `endif

        .dcr_bus_if     (dcr_bus_if),

        .dcache_bus_if  (dcache_bus_if),

        .icache_bus_if  (icache_bus_if),

    `ifdef GBAR_ENABLE
        .gbar_bus_if    (gbar_bus_if),
    `endif

        .sim_ebreak     (sim_ebreak),
        .sim_wb_value   (sim_wb_value),
        .busy           (busy)
    );

endmodule

304
hw/rtl/core/VX_csr_data.sv Normal file
View File

@@ -0,0 +1,304 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`ifdef EXT_F_ENABLE
`include "VX_fpu_define.vh"
`endif
module VX_csr_data
import VX_gpu_pkg::*;
`ifdef EXT_F_ENABLE
import VX_fpu_pkg::*;
`endif
#(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
VX_sfu_perf_if.slave sfu_perf_if,
`endif
VX_commit_csr_if.slave commit_csr_if,
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if [`NUM_FPU_BLOCKS],
`endif
input wire [`PERF_CTR_BITS-1:0] cycles,
input wire [`NUM_WARPS-1:0] active_warps,
input wire [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks,
input wire read_enable,
input wire [`UUID_WIDTH-1:0] read_uuid,
input wire [`NW_WIDTH-1:0] read_wid,
input wire [`VX_CSR_ADDR_BITS-1:0] read_addr,
output wire [31:0] read_data_ro,
output wire [31:0] read_data_rw,
input wire write_enable,
input wire [`UUID_WIDTH-1:0] write_uuid,
input wire [`NW_WIDTH-1:0] write_wid,
input wire [`VX_CSR_ADDR_BITS-1:0] write_addr,
input wire [31:0] write_data
);
`UNUSED_VAR (reset)
`UNUSED_VAR (write_wid)
`UNUSED_VAR (write_data)
// CSRs Write /////////////////////////////////////////////////////////////
`ifdef EXT_F_ENABLE
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FP_FLAGS_BITS-1:0] fcsr, fcsr_n;
wire [`NUM_FPU_BLOCKS-1:0] fpu_write_enable;
wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid;
fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags;
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_write_enable[i] = fpu_to_csr_if[i].write_enable;
assign fpu_write_wid[i] = fpu_to_csr_if[i].write_wid;
assign fpu_write_fflags[i] = fpu_to_csr_if[i].write_fflags;
end
always @(*) begin
fcsr_n = fcsr;
for (integer i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
if (fpu_write_enable[i]) begin
fcsr_n[fpu_write_wid[i]][`FP_FLAGS_BITS-1:0] = fcsr[fpu_write_wid[i]][`FP_FLAGS_BITS-1:0]
| fpu_write_fflags[i];
end
end
if (write_enable) begin
case (write_addr)
`VX_CSR_FFLAGS: fcsr_n[write_wid][`FP_FLAGS_BITS-1:0] = write_data[`FP_FLAGS_BITS-1:0];
`VX_CSR_FRM: fcsr_n[write_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS] = write_data[`INST_FRM_BITS-1:0];
`VX_CSR_FCSR: fcsr_n[write_wid] = write_data[`FP_FLAGS_BITS+`INST_FRM_BITS-1:0];
default:;
endcase
end
end
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_to_csr_if[i].read_frm = fcsr[fpu_to_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
end
always @(posedge clk) begin
if (reset) begin
fcsr <= '0;
end else begin
fcsr <= fcsr_n;
end
end
`endif
always @(posedge clk) begin
if (write_enable) begin
case (write_addr)
`ifdef EXT_F_ENABLE
`VX_CSR_FFLAGS,
`VX_CSR_FRM,
`VX_CSR_FCSR,
`endif
`VX_CSR_SATP,
`VX_CSR_MSTATUS,
`VX_CSR_MNSTATUS,
`VX_CSR_MEDELEG,
`VX_CSR_MIDELEG,
`VX_CSR_MIE,
`VX_CSR_MTVEC,
`VX_CSR_MEPC,
`VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0: /* do nothing!*/;
default: begin
`ASSERT(0, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
end
endcase
end
end
// CSRs read //////////////////////////////////////////////////////////////
reg [31:0] read_data_ro_r;
reg [31:0] read_data_rw_r;
reg read_addr_valid_r;
always @(*) begin
read_data_ro_r = '0;
read_data_rw_r = '0;
read_addr_valid_r = 1;
case (read_addr)
`VX_CSR_MVENDORID : read_data_ro_r = 32'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_r = 32'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_r = 32'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_r = (((`CLOG2(`XLEN)-4) << (`XLEN-2)) | `MISA_STD);
`ifdef EXT_F_ENABLE
`VX_CSR_FFLAGS : read_data_rw_r = 32'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_r = 32'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_r = 32'(fcsr[read_wid]);
`endif
`VX_CSR_WARP_ID : read_data_ro_r = 32'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_r = 32'(CORE_ID);
`VX_CSR_THREAD_MASK: read_data_ro_r = 32'(thread_masks[read_wid]);
`VX_CSR_WARP_MASK : read_data_ro_r = 32'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_r = 32'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_r = 32'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_r = 32'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_MCYCLE : read_data_ro_r = 32'(cycles[31:0]);
`VX_CSR_MCYCLE_H : read_data_ro_r = 32'(cycles[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_RESERVED : read_data_ro_r = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
`VX_CSR_MINSTRET : read_data_ro_r = 32'(commit_csr_if.instret[31:0]);
`VX_CSR_MINSTRET_H : read_data_ro_r = 32'(commit_csr_if.instret[`PERF_CTR_BITS-1:32]);
`VX_CSR_SATP,
`VX_CSR_MSTATUS,
`VX_CSR_MNSTATUS,
`VX_CSR_MEDELEG,
`VX_CSR_MIDELEG,
`VX_CSR_MIE,
`VX_CSR_MTVEC,
`VX_CSR_MEPC,
`VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0 : read_data_ro_r = 32'(0);
default: begin
read_addr_valid_r = 0;
if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32))
|| (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin
read_addr_valid_r = 1;
`ifdef PERF_ENABLE
case (base_dcrs.mpm_class)
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ALU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0];
`VX_CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LSU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0];
`VX_CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]);
`ifdef EXT_F_ENABLE
`VX_CSR_MPM_FPU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0];
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]);
`else
`VX_CSR_MPM_FPU_ST : read_data_ro_r = '0;
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
`endif
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
// PERF: memory
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
default:;
endcase
end
`VX_DCR_MPM_CLASS_MEM: begin
case (read_addr)
// PERF: icache
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache_reads[31:0];
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache_reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache_read_misses[31:0];
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache_read_misses[`PERF_CTR_BITS-1:32]);
// PERF: dcache
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache_reads[31:0];
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache_reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache_writes[31:0];
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache_writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache_read_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache_read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache_write_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache_write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache_bank_stalls[31:0];
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache_mshr_stalls[31:0];
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: smem
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem_reads[31:0];
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem_reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem_writes[31:0];
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem_writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem_bank_stalls[31:0];
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l2cache
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache_reads[31:0];
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache_reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache_writes[31:0];
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache_writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache_read_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache_read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache_write_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache_write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache_bank_stalls[31:0];
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache_bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache_mshr_stalls[31:0];
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache_mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l3cache
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache_reads[31:0];
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache_reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache_writes[31:0];
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache_writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache_read_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache_read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache_write_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache_write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache_bank_stalls[31:0];
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache_bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache_mshr_stalls[31:0];
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache_mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: memory
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem_reads[31:0];
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem_reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem_writes[31:0];
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem_writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem_latency[31:0];
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem_latency[`PERF_CTR_BITS-1:32]);
default:;
endcase
end
default:;
endcase
`endif
end
end
endcase
end
assign read_data_ro = read_data_ro_r;
assign read_data_rw = read_data_rw_r;
`UNUSED_VAR (base_dcrs)
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
`UNUSED_VAR (perf_wctl_stalls);
`endif
endmodule

181
hw/rtl/core/VX_csr_unit.sv Normal file
View File

@@ -0,0 +1,181 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_csr_unit
//
// Handles CSR instructions (`INST_SFU_CSRRW / `INST_SFU_CSRRS / CSRRC and
// their immediate forms) received on execute_if:
//   - a request is only accepted once the scheduler reports alm_empty for
//     the requesting warp (no other instruction pending);
//   - thread-id and hart-id CSRs are produced per lane locally, every other
//     address is looked up in VX_csr_data;
//   - the value read is sent to commit_if through a 2-entry elastic buffer,
//     and the warp is unlocked when the last packet (eop) is accepted.
//
// Parameters:
//   CORE_ID   - core index, used to build the global thread id
//   NUM_LANES - number of threads processed per request packet
module VX_csr_unit import VX_gpu_pkg::*; #(
    parameter CORE_ID   = 0,
    parameter NUM_LANES = 1
) (
    input wire                  clk,
    input wire                  reset,

    input base_dcrs_t           base_dcrs,

`ifdef PERF_ENABLE
    VX_mem_perf_if.slave        mem_perf_if,
    VX_pipeline_perf_if.slave   pipeline_perf_if,
    VX_sfu_perf_if.slave        sfu_perf_if,
`endif

`ifdef EXT_F_ENABLE
    VX_fpu_to_csr_if.slave      fpu_to_csr_if [`NUM_FPU_BLOCKS],
`endif

    VX_commit_csr_if.slave      commit_csr_if,
    VX_sched_csr_if.slave       sched_csr_if,
    VX_execute_if.slave         execute_if,
    VX_commit_if.master         commit_if
);
    `UNUSED_PARAM (CORE_ID)

    localparam PID_BITS  = `CLOG2(`NUM_THREADS / NUM_LANES);
    localparam PID_WIDTH = `UP(PID_BITS);
    // response payload: uuid, wid, tmask, PC, rd, wb, per-lane data, pid, sop, eop
    localparam DATAW     = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * 32 + PID_WIDTH + 1 + 1;

    `UNUSED_VAR (execute_if.data.rs3_data)

    wire [31:0]                 csr_read_data_ro, csr_read_data_rw;
    wire [31:0]                 csr_req_data;
    wire                        csr_wr_enable;
    wire                        csr_req_ready;

    reg [NUM_LANES-1:0][31:0]   csr_read_data;
    reg [31:0]                  csr_write_data;
    reg                         csr_rd_enable;

    // ---------------------------------------------------------------------
    // Request handshake: wait for all pending instructions to complete
    // ---------------------------------------------------------------------

    assign sched_csr_if.alm_empty_wid = execute_if.data.wid;
    wire no_pending_instr = sched_csr_if.alm_empty;

    wire csr_req_valid      = no_pending_instr && execute_if.valid;
    assign execute_if.ready = no_pending_instr && csr_req_ready;

    // the decoder packs the CSR address in the low immediate bits and the
    // 5-bit CSR immediate right above it
    wire [`NRI_BITS-1:0]         csr_imm  = execute_if.data.imm[`VX_CSR_ADDR_BITS +: `NRI_BITS];
    wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if.data.imm[`VX_CSR_ADDR_BITS-1:0];

    // lower 32 bits of rs1 for every lane (only lane 0 is consumed below)
    wire [NUM_LANES-1:0][31:0] rs1_data;
    `UNUSED_VAR (rs1_data)
    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
        assign rs1_data[lane] = execute_if.data.rs1_data[lane][31:0];
    end

    wire csr_write_enable = (execute_if.data.op_type == `INST_SFU_CSRRW);

    // ---------------------------------------------------------------------
    // CSR storage
    // ---------------------------------------------------------------------

    VX_csr_data #(
        .CORE_ID (CORE_ID)
    ) csr_data (
        .clk            (clk),
        .reset          (reset),

        .base_dcrs      (base_dcrs),

    `ifdef PERF_ENABLE
        .mem_perf_if    (mem_perf_if),
        .pipeline_perf_if(pipeline_perf_if),
        .sfu_perf_if    (sfu_perf_if),
    `endif

        .commit_csr_if  (commit_csr_if),

        .cycles         (sched_csr_if.cycles),
        .active_warps   (sched_csr_if.active_warps),
        .thread_masks   (sched_csr_if.thread_masks),

    `ifdef EXT_F_ENABLE
        .fpu_to_csr_if  (fpu_to_csr_if),
    `endif

        .read_enable    (csr_req_valid && csr_rd_enable),
        .read_uuid      (execute_if.data.uuid),
        .read_wid       (execute_if.data.wid),
        .read_addr      (csr_addr),
        .read_data_ro   (csr_read_data_ro),
        .read_data_rw   (csr_read_data_rw),

        .write_enable   (csr_req_valid && csr_wr_enable),
        .write_uuid     (execute_if.data.uuid),
        .write_wid      (execute_if.data.wid),
        .write_addr     (csr_addr),
        .write_data     (csr_write_data)
    );

    // ---------------------------------------------------------------------
    // CSR read
    // ---------------------------------------------------------------------

    // wtid: thread index inside the warp, gtid: index across cores and warps
    wire [NUM_LANES-1:0][31:0] wtid, gtid;

    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
        if (PID_BITS != 0) begin
            assign wtid[lane] = 32'(execute_if.data.pid * NUM_LANES + lane);
        end else begin
            assign wtid[lane] = 32'(lane);
        end
        assign gtid[lane] = wtid[lane]
                          + (32'(execute_if.data.wid) << `NT_BITS)
                          + (32'(CORE_ID) << (`NW_BITS + `NT_BITS));
    end

    always @(*) begin
        // by default the value comes from the CSR storage, replicated on
        // every lane, and the storage read port is enabled
        csr_rd_enable = 1;
        csr_read_data = {NUM_LANES{csr_read_data_ro | csr_read_data_rw}};
        case (csr_addr)
        `VX_CSR_THREAD_ID : begin
            csr_read_data = wtid;
            csr_rd_enable = 0;
        end
        `VX_CSR_MHARTID : begin
            csr_read_data = gtid;
            csr_rd_enable = 0;
        end
        default:;
        endcase
    end

    // ---------------------------------------------------------------------
    // CSR write
    // ---------------------------------------------------------------------

    // write operand: zero-extended immediate or lane 0 of rs1
    assign csr_req_data = execute_if.data.use_imm ? 32'(csr_imm) : rs1_data[0];

    // CSRRW always writes; set/clear only write when the operand is non-zero
    assign csr_wr_enable = ((| csr_req_data) || csr_write_enable);

    always @(*) begin
        case (execute_if.data.op_type)
        `INST_SFU_CSRRS: csr_write_data = csr_req_data | csr_read_data_rw;
        `INST_SFU_CSRRW: csr_write_data = csr_req_data;
        //`INST_SFU_CSRRC
        default:         csr_write_data = ~csr_req_data & csr_read_data_rw;
        endcase
    end

    // ---------------------------------------------------------------------
    // Warp unlock and response
    // ---------------------------------------------------------------------

    // unlock the warp once the last packet of the request has been accepted
    assign sched_csr_if.unlock_wid  = execute_if.data.wid;
    assign sched_csr_if.unlock_warp = execute_if.data.eop && csr_req_valid && csr_req_ready;

    // send response
    wire [NUM_LANES-1:0][31:0] csr_commit_data;

    VX_elastic_buffer #(
        .SIZE  (2),
        .DATAW (DATAW)
    ) rsp_buf (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (csr_req_valid),
        .ready_in  (csr_req_ready),
        .data_in   ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
        .data_out  ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, csr_commit_data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
        .valid_out (commit_if.valid),
        .ready_out (commit_if.ready)
    );

    // widen the 32-bit CSR value to the register width
    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
        assign commit_if.data.data[lane] = `XLEN'(csr_commit_data[lane]);
    end

endmodule

View File

@@ -0,0 +1,57 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
// VX_dcr_data
//
// Storage for the base device configuration registers (DCRs). A write on
// dcr_bus_if updates the register selected by write_addr; the current
// contents are exported unchanged on base_dcrs. The registers are not
// cleared by reset.
module VX_dcr_data import VX_gpu_pkg::*; (
    input wire              clk,
    input wire              reset,

    // Inputs
    VX_dcr_bus_if.slave     dcr_bus_if,

    // Outputs
    output base_dcrs_t      base_dcrs
);
    `UNUSED_VAR (reset)

    base_dcrs_t dcrs;

    assign base_dcrs = dcrs;

    // register update: one 32-bit bus word per address
    always @(posedge clk) begin
        if (dcr_bus_if.write_valid) begin
            case (dcr_bus_if.write_addr)
            // only the low byte of the data word selects the MPM class
            `VX_DCR_BASE_MPM_CLASS     : dcrs.mpm_class <= dcr_bus_if.write_data[7:0];
            `VX_DCR_BASE_STARTUP_ADDR0 : dcrs.startup_addr[31:0] <= dcr_bus_if.write_data;
        `ifdef XLEN_64
            // upper half of the startup address exists on 64-bit builds only
            `VX_DCR_BASE_STARTUP_ADDR1 : dcrs.startup_addr[63:32] <= dcr_bus_if.write_data;
        `endif
            default:;
            endcase
        end
    end

`ifdef DBG_TRACE_CORE_PIPELINE
    // log every DCR write: register name followed by the data written
    always @(posedge clk) begin
        if (dcr_bus_if.write_valid) begin
            `TRACE(1, ("%d: base-dcr: state=", $time));
            trace_base_dcr(1, dcr_bus_if.write_addr);
            `TRACE(1, (", data=0x%0h\n", dcr_bus_if.write_data));
        end
    end
`endif

endmodule

552
hw/rtl/core/VX_decode.sv Normal file
View File

@@ -0,0 +1,552 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
// Operand-usage helper macros for the decoder.
//
// `USED_IREG(x) / `USED_FREG(x) record that register operand <x> (rd, rs1,
// rs2 or rs3) is used by the instruction: <x>_r receives the register index
// and use_<x> is set. When the F extension is enabled the index gets one
// extra leading bit: 0 for an integer register, 1 for a floating-point one.
//
// Each macro expands to a single begin/end block so that both assignments
// stay together when the macro is the sole statement of an if/else branch
// (e.g. "end else `USED_IREG (rd);"). Without the block only the first
// assignment would be governed by the else. Call sites keep their trailing
// ';', which is then a null statement.
`ifdef EXT_F_ENABLE
`define USED_IREG(x) \
    begin \
        x``_r = {1'b0, ``x}; \
        use_``x = 1; \
    end
`define USED_FREG(x) \
    begin \
        x``_r = {1'b1, ``x}; \
        use_``x = 1; \
    end
`else
`define USED_IREG(x) \
    begin \
        x``_r = ``x; \
        use_``x = 1; \
    end
`endif
// VX_decode: instruction decode stage.
//
// Combinationally decodes the 32-bit instruction presented on fetch_if into
// an execution unit (ex_type), an operation (op_type / op_mod), register
// indices, an immediate and the use_PC / use_imm / wb flags, and forwards
// them on decode_if through req_buf. It also tells the scheduler, through
// decode_sched_if, whether the accepted instruction is one of those flagged
// is_wstall below (jumps, branches, CSR/system and the EXT1 group).
//
// Parameters:
//   CORE_ID - core index, only used in the debug trace.
module VX_decode #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_fetch_if.slave fetch_if,
// outputs
VX_decode_if.master decode_if,
VX_decode_sched_if.master decode_sched_if
);
// Payload width of req_buf: uuid, wid, tmask, PC, ex_type, op_type, op_mod,
// use_PC, the four register indices, imm, use_imm and wb.
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + (`NR_BITS * 4) + `XLEN + 1 + 1;
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
// decoded fields, all driven by the main always block below
reg [`EX_BITS-1:0] ex_type;
reg [`INST_OP_BITS-1:0] op_type;
reg [`INST_MOD_BITS-1:0] op_mod;
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
reg [`XLEN-1:0] imm;
reg use_rd, use_rs1, use_rs2, use_rs3, use_PC, use_imm;
reg is_wstall;
// raw instruction fields
wire [31:0] instr = fetch_if.data.instr;
wire [6:0] opcode = instr[6:0];
wire [1:0] func2 = instr[26:25];
wire [2:0] func3 = instr[14:12];
wire [4:0] func5 = instr[31:27];
wire [6:0] func7 = instr[31:25];
wire [11:0] u_12 = instr[31:20];
wire [4:0] rd = instr[11:7];
wire [4:0] rs1 = instr[19:15];
wire [4:0] rs2 = instr[24:20];
wire [4:0] rs3 = instr[31:27];
`UNUSED_VAR (func2)
`UNUSED_VAR (func5)
`UNUSED_VAR (rs3)
`UNUSED_VAR (use_rd)
`UNUSED_VAR (use_rs1)
`UNUSED_VAR (use_rs2)
`UNUSED_VAR (use_rs3)
// func3 == 3'b001 or 3'b101: shift-immediate encodings, whose immediate is
// only the shift amount rather than the full 12-bit field
wire is_itype_sh = func3[0] && ~func3[1];
wire [19:0] ui_imm = instr[31:12];
`ifdef XLEN_64
// 6-bit shift amount for 64-bit shifts, 5-bit for the *W (32-bit) variants
wire [11:0] i_imm = is_itype_sh ? {6'b0, instr[25:20]} : u_12;
wire [11:0] iw_imm = is_itype_sh ? {7'b0, instr[24:20]} : u_12;
`else
wire [11:0] i_imm = is_itype_sh ? {7'b0, instr[24:20]} : u_12;
`endif
// store, branch and jump immediates reassembled from their scattered bits;
// branch and jump offsets have an implicit zero LSB
wire [11:0] s_imm = {func7, rd};
wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
// ALU operation selected by func3. opcode[5] is required for SUB so that an
// immediate add is never decoded as a subtract; func7[5] selects SRA over SRL.
reg [`INST_ALU_BITS-1:0] r_type;
always @(*) begin
case (func3)
3'h0: r_type = (opcode[5] && func7[5]) ? `INST_ALU_SUB : `INST_ALU_ADD;
3'h1: r_type = `INST_ALU_SLL;
3'h2: r_type = `INST_ALU_SLT;
3'h3: r_type = `INST_ALU_SLTU;
3'h4: r_type = `INST_ALU_XOR;
3'h5: r_type = func7[5] ? `INST_ALU_SRA : `INST_ALU_SRL;
3'h6: r_type = `INST_ALU_OR;
3'h7: r_type = `INST_ALU_AND;
endcase
end
// conditional branch comparison selected by func3 (2 and 3 are left as 'x)
reg [`INST_BR_BITS-1:0] b_type;
always @(*) begin
case (func3)
3'h0: b_type = `INST_BR_EQ;
3'h1: b_type = `INST_BR_NE;
3'h4: b_type = `INST_BR_LT;
3'h5: b_type = `INST_BR_GE;
3'h6: b_type = `INST_BR_LTU;
3'h7: b_type = `INST_BR_GEU;
default: b_type = 'x;
endcase
end
// system instruction selected by the 12-bit immediate field
// NOTE(review): the values are cast to `INST_OP_BITS while s_type is declared
// with `INST_BR_BITS; this relies on both widths being compatible - confirm.
reg [`INST_BR_BITS-1:0] s_type;
always @(*) begin
case (u_12)
12'h000: s_type = `INST_OP_BITS'(`INST_BR_ECALL);
12'h001: s_type = `INST_OP_BITS'(`INST_BR_EBREAK);
12'h002: s_type = `INST_OP_BITS'(`INST_BR_URET);
12'h102: s_type = `INST_OP_BITS'(`INST_BR_SRET);
12'h302: s_type = `INST_OP_BITS'(`INST_BR_MRET);
default: s_type = 'x;
endcase
end
`ifdef EXT_M_ENABLE
// multiply/divide operation selected by func3
reg [`INST_M_BITS-1:0] m_type;
always @(*) begin
case (func3)
3'h0: m_type = `INST_M_MUL;
3'h1: m_type = `INST_M_MULH;
3'h2: m_type = `INST_M_MULHSU;
3'h3: m_type = `INST_M_MULHU;
3'h4: m_type = `INST_M_DIV;
3'h5: m_type = `INST_M_DIVU;
3'h6: m_type = `INST_M_REM;
3'h7: m_type = `INST_M_REMU;
endcase
end
`endif
// Main decoder. Every output first receives a default, then the case on the
// opcode overrides what the instruction needs. op_type and imm default to 'x
// and are only meaningful for the fields an instruction class assigns.
// In the ALU paths below, op_mod[0] is set for jump/branch/system operations,
// op_mod[1] for multiply/divide operations and op_mod[2] for the 32-bit *W
// variants.
always @(*) begin
ex_type = '0;
op_type = 'x;
op_mod = '0;
rd_r = '0;
rs1_r = '0;
rs2_r = '0;
rs3_r = '0;
imm = 'x;
use_imm = 0;
use_PC = 0;
use_rd = 0;
use_rs1 = 0;
use_rs2 = 0;
use_rs3 = 0;
is_wstall = 0;
case (opcode)
// register-immediate ALU operations
`INST_I: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(r_type);
use_rd = 1;
use_imm = 1;
imm = {{(`XLEN-12){i_imm[11]}}, i_imm};
`USED_IREG (rd);
`USED_IREG (rs1);
end
// register-register ALU operations; func7[0] selects the M extension
`INST_R: begin
ex_type = `EX_ALU;
`ifdef EXT_M_ENABLE
if (func7[0]) begin
op_type = `INST_OP_BITS'(m_type);
op_mod[1] = 1;
end else
`endif
begin
op_type = `INST_OP_BITS'(r_type);
end
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
`ifdef XLEN_64
`INST_I_W: begin
// ADDIW, SLLIW, SRLIW, SRAIW
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(r_type);
op_mod[2] = 1;
use_rd = 1;
use_imm = 1;
imm = {{(`XLEN-12){iw_imm[11]}}, iw_imm};
`USED_IREG (rd);
`USED_IREG (rs1);
end
`INST_R_W: begin
ex_type = `EX_ALU;
`ifdef EXT_M_ENABLE
if (func7[0]) begin
// MULW, DIVW, DIVUW, REMW, REMUW
op_type = `INST_OP_BITS'(m_type);
op_mod[1] = 1;
end else
`endif
begin
// ADDW, SUBW, SLLW, SRLW, SRAW
op_type = `INST_OP_BITS'(r_type);
end
op_mod[2] = 1;
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
`endif
// LUI / AUIPC: 20-bit immediate placed in bits [31:12], sign-extended to XLEN
`INST_LUI: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_ALU_LUI);
use_rd = 1;
use_imm = 1;
imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
`USED_IREG (rd);
end
`INST_AUIPC: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_ALU_AUIPC);
use_rd = 1;
use_imm = 1;
use_PC = 1;
imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
`USED_IREG (rd);
end
// jumps and branches are issued to the ALU with op_mod[0] set and flagged
// is_wstall
`INST_JAL: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_BR_JAL);
op_mod[0] = 1;
use_rd = 1;
use_imm = 1;
use_PC = 1;
is_wstall = 1;
imm = {{(`XLEN-21){jal_imm[20]}}, jal_imm};
`USED_IREG (rd);
end
`INST_JALR: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_BR_JALR);
op_mod[0] = 1;
use_rd = 1;
use_imm = 1;
is_wstall = 1;
imm = {{(`XLEN-12){u_12[11]}}, u_12};
`USED_IREG (rd);
`USED_IREG (rs1);
end
`INST_B: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(b_type);
op_mod[0] = 1;
use_imm = 1;
use_PC = 1;
is_wstall = 1;
imm = {{(`XLEN-13){b_imm[12]}}, b_imm};
`USED_IREG (rs1);
`USED_IREG (rs2);
end
`INST_FENCE: begin
ex_type = `EX_LSU;
op_type = `INST_LSU_FENCE;
end
`INST_SYS : begin
if (func3[1:0] != 0) begin
// CSR access: the CSR address goes in the low immediate bits; for the
// immediate forms (func3[2]) the 5-bit rs1 field is stored right above it
ex_type = `EX_SFU;
op_type = `INST_OP_BITS'(`INST_SFU_CSR(func3[1:0]));
use_rd = 1;
is_wstall = 1;
use_imm = func3[2];
imm[`VX_CSR_ADDR_BITS-1:0] = u_12; // addr
`USED_IREG (rd);
if (func3[2]) begin
imm[`VX_CSR_ADDR_BITS +: `NRI_BITS] = rs1; // imm
end else begin
`USED_IREG (rs1);
end
end else begin
// ECALL / EBREAK / xRET: handled as a branch-class ALU operation with a
// constant immediate of 4
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(s_type);
op_mod[0] = 1;
use_rd = 1;
use_imm = 1;
use_PC = 1;
is_wstall = 1;
imm = `XLEN'd4;
`USED_IREG (rd);
end
end
// loads (integer and, when enabled, floating-point): op_type is {0, func3}
`ifdef EXT_F_ENABLE
`INST_FL,
`endif
`INST_L: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'({1'b0, func3});
use_rd = 1;
imm = {{(`XLEN-12){u_12[11]}}, u_12};
use_imm = 1;
// opcode[2] distinguishes the floating-point load, whose destination is
// tagged as a floating-point register
`ifdef EXT_F_ENABLE
if (opcode[2]) begin
`USED_FREG (rd);
end else
`endif
`USED_IREG (rd);
`USED_IREG (rs1);
end
// stores (integer and, when enabled, floating-point): op_type is {1, func3}
`ifdef EXT_F_ENABLE
`INST_FS,
`endif
`INST_S: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'({1'b1, func3});
imm = {{(`XLEN-12){s_imm[11]}}, s_imm};
use_imm = 1;
`USED_IREG (rs1);
// opcode[2] distinguishes the floating-point store, whose data operand is
// tagged as a floating-point register
`ifdef EXT_F_ENABLE
if (opcode[2]) begin
`USED_FREG (rs2);
end else
`endif
`USED_IREG (rs2);
end
`ifdef EXT_F_ENABLE
// fused multiply-add family: op_type is {2'b11, opcode[3:2]}, op_mod carries
// func3
`INST_FMADD,
`INST_FMSUB,
`INST_FNMSUB,
`INST_FNMADD: begin
ex_type = `EX_FPU;
op_type = `INST_OP_BITS'({2'b11, opcode[3:2]});
op_mod = `INST_MOD_BITS'(func3);
imm[0] = func2[0]; // destination is double?
use_rd = 1;
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
`USED_FREG (rs3);
end
// remaining floating-point operations, selected by func5; op_mod defaults to
// func3 and is overridden for the INST_FPU_MISC group
`INST_FCI: begin
ex_type = `EX_FPU;
op_mod = `INST_MOD_BITS'(func3);
`ifdef FLEN_64
imm[0] = func2[0]; // destination is double?
`endif
use_rd = 1;
case (func5)
5'b00000, // FADD
5'b00001, // FSUB
5'b00010, // FMUL
5'b00011: begin // FDIV
op_type = `INST_OP_BITS'(func5[1:0]);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
5'b00100: begin
// NCP: FSGNJ=0, FSGNJN=1, FSGNJX=2
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = `INST_MOD_BITS'(func3[1:0]);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
5'b00101: begin
// NCP: FMIN=6, FMAX=7
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = func3[0] ? 7 : 6;
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
`ifdef FLEN_64
5'b01000: begin
// CVT.S.D, CVT.D.S
op_type = `INST_OP_BITS'(`INST_FPU_F2F);
`USED_FREG (rd);
`USED_FREG (rs1);
end
`endif
5'b01011: begin
// SQRT
op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
`USED_FREG (rd);
`USED_FREG (rs1);
end
5'b10100: begin
// CMP
op_type = `INST_OP_BITS'(`INST_FPU_CMP);
`USED_IREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
5'b11000: begin
// CVT.W.X, CVT.WU.X
// rs2[0] selects the unsigned variant
op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_F2U) : `INST_OP_BITS'(`INST_FPU_F2I);
`ifdef XLEN_64
imm[1] = rs2[1]; // is 64-bit integer
`endif
`USED_IREG (rd);
`USED_FREG (rs1);
end
5'b11010: begin
// CVT.X.W, CVT.X.WU
// rs2[0] selects the unsigned variant
op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_U2F) : `INST_OP_BITS'(`INST_FPU_I2F);
`ifdef XLEN_64
imm[1] = rs2[1]; // is 64-bit integer
`endif
`USED_FREG (rd);
`USED_IREG (rs1);
end
5'b11100: begin
if (func3[0]) begin
// NCP: FCLASS=3
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 3;
end else begin
// NCP: FMV.X.W=4
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 4;
end
`USED_IREG (rd);
`USED_FREG (rs1);
end
5'b11110: begin
// NCP: FMV.W.X=5
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 5;
`USED_FREG (rd);
`USED_IREG (rs1);
end
default:;
endcase
end
`endif
// custom extension 1 (func7 == 0): warp-control instructions, sent to the SFU
// and flagged is_wstall
`INST_EXT1: begin
case (func7)
7'h00: begin
ex_type = `EX_SFU;
is_wstall = 1;
case (func3)
3'h0: begin // TMC
op_type = `INST_OP_BITS'(`INST_SFU_TMC);
`USED_IREG (rs1);
end
3'h1: begin // WSPAWN
op_type = `INST_OP_BITS'(`INST_SFU_WSPAWN);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'h2: begin // SPLIT
op_type = `INST_OP_BITS'(`INST_SFU_SPLIT);
use_rd = 1;
`USED_IREG (rs1);
`USED_IREG (rd);
end
3'h3: begin // JOIN
op_type = `INST_OP_BITS'(`INST_SFU_JOIN);
`USED_IREG (rs1);
end
3'h4: begin // BAR
op_type = `INST_OP_BITS'(`INST_SFU_BAR);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'h5: begin // PRED
op_type = `INST_OP_BITS'(`INST_SFU_PRED);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
default:;
endcase
end
default:;
endcase
end
// custom extension 2: only func3 == 1, func2 == 0 (CMOV) is decoded
`INST_EXT2: begin
case (func3)
3'h1: begin
case (func2)
2'h0: begin // CMOV
ex_type = `EX_SFU;
op_type = `INST_OP_BITS'(`INST_SFU_CMOV);
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
`USED_IREG (rs3);
end
default:;
endcase
end
default:;
endcase
end
default:;
endcase
end
// disable write to integer register r0
// (with EXT_F_ENABLE a floating-point destination carries a leading 1 in
// rd_r, so it is never equal to zero)
wire wb = use_rd && (rd_r != 0);
// output stage: hands the decoded fields to decode_if
// NOTE(review): SIZE=0 presumably makes this buffer a pass-through, which the
// trace below relies on when it prints `instr` next to decode_if fields -
// confirm against VX_elastic_buffer.
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (0)
) req_buf (
.clk (clk),
.reset (reset),
.valid_in (fetch_if.valid),
.ready_in (fetch_if.ready),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_mod, use_PC, imm, use_imm, wb, rd_r, rs1_r, rs2_r, rs3_r}),
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.use_PC, decode_if.data.imm, decode_if.data.use_imm, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
.valid_out (decode_if.valid),
.ready_out (decode_if.ready)
);
///////////////////////////////////////////////////////////////////////////
// notify the scheduler of every instruction accepted from fetch, together
// with its warp id and is_wstall flag
wire fetch_fire = fetch_if.valid && fetch_if.ready;
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.is_wstall = is_wstall;
// instruction-buffer pop indication is forwarded unchanged back to fetch
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
`ifdef DBG_TRACE_CORE_PIPELINE
// log every instruction that leaves the decode stage
always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin
`TRACE(1, ("%d: core%0d-decode: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, CORE_ID, decode_if.data.wid, decode_if.data.PC, instr));
trace_ex_type(1, decode_if.data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.rd, decode_if.data.rs2, decode_if.data.use_imm, decode_if.data.imm);
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=0x%0h, opds=%b%b%b%b, use_pc=%b, use_imm=%b (#%0d)\n",
decode_if.data.op_mod, decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, decode_if.data.imm, use_rd, use_rs1, use_rs2, use_rs3, decode_if.data.use_PC, decode_if.data.use_imm, decode_if.data.uuid));
end
end
`endif
endmodule

227
hw/rtl/core/VX_dispatch.sv Normal file
View File

@@ -0,0 +1,227 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_dispatch
//
// Routes each issue slot's operands to the execution unit selected by
// data.ex_type (ALU, LSU, SFU and, when enabled, FPU). Every slot/unit pair
// has its own output buffer; the slot is ready when the buffer of the unit it
// targets is ready. The index of the last active thread of the thread mask is
// computed once per slot and attached to the dispatched data.
//
// With PERF_ENABLE, perf_stalls[u] counts, per execution unit, the cycles in
// which a slot held a valid request for that unit that was not accepted.
module VX_dispatch import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0
) (
    input wire              clk,
    input wire              reset,

`ifdef PERF_ENABLE
    output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS],
`endif

    // inputs
    VX_operands_if.slave    operands_if [`ISSUE_WIDTH],

    // outputs
    VX_dispatch_if.master   alu_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.master   lsu_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
    VX_dispatch_if.master   fpu_dispatch_if [`ISSUE_WIDTH],
`endif
    VX_dispatch_if.master   sfu_dispatch_if [`ISSUE_WIDTH]
);
    `UNUSED_PARAM (CORE_ID)

    // width of the data word stored in each dispatch buffer
    localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;

    // ---------------------------------------------------------------------
    // Last active thread of every issue slot
    // ---------------------------------------------------------------------

    wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids;
    wire [`ISSUE_WIDTH-1:0][`NT_WIDTH-1:0] last_active_tid;

    // constant table: tids[t] == t
    for (genvar t = 0; t < `NUM_THREADS; ++t) begin
        assign tids[t] = `NT_WIDTH'(t);
    end

    // reverse search over the thread mask
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        VX_find_first #(
            .N       (`NUM_THREADS),
            .DATAW   (`NT_WIDTH),
            .REVERSE (1)
        ) last_tid_select (
            .valid_in  (operands_if[i].data.tmask),
            .data_in   (tids),
            .data_out  (last_active_tid[i]),
            `UNUSED_PIN (valid_out)
        );
    end

    // ---------------------------------------------------------------------
    // ALU dispatch
    // ---------------------------------------------------------------------

    VX_operands_if alu_operands_if[`ISSUE_WIDTH]();

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign alu_operands_if[i].data  = operands_if[i].data;
        assign alu_operands_if[i].valid = (operands_if[i].data.ex_type == `EX_ALU) && operands_if[i].valid;

        `RESET_RELAY (alu_reset, reset);

        VX_elastic_buffer #(
            .SIZE    (2),
            .OUT_REG (2),
            .DATAW   (DATAW)
        ) alu_buffer (
            .clk       (clk),
            .reset     (alu_reset),
            .valid_in  (alu_operands_if[i].valid),
            .data_in   (`TO_DISPATCH_DATA(alu_operands_if[i].data, last_active_tid[i])),
            .ready_in  (alu_operands_if[i].ready),
            .valid_out (alu_dispatch_if[i].valid),
            .data_out  (alu_dispatch_if[i].data),
            .ready_out (alu_dispatch_if[i].ready)
        );
    end

    // ---------------------------------------------------------------------
    // LSU dispatch
    // ---------------------------------------------------------------------

    VX_operands_if lsu_operands_if[`ISSUE_WIDTH]();

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign lsu_operands_if[i].data  = operands_if[i].data;
        assign lsu_operands_if[i].valid = (operands_if[i].data.ex_type == `EX_LSU) && operands_if[i].valid;

        `RESET_RELAY (lsu_reset, reset);

        VX_elastic_buffer #(
            .SIZE    (2),
            .OUT_REG (2),
            .DATAW   (DATAW)
        ) lsu_buffer (
            .clk       (clk),
            .reset     (lsu_reset),
            .valid_in  (lsu_operands_if[i].valid),
            .data_in   (`TO_DISPATCH_DATA(lsu_operands_if[i].data, last_active_tid[i])),
            .ready_in  (lsu_operands_if[i].ready),
            .valid_out (lsu_dispatch_if[i].valid),
            .data_out  (lsu_dispatch_if[i].data),
            .ready_out (lsu_dispatch_if[i].ready)
        );
    end

    // ---------------------------------------------------------------------
    // FPU dispatch
    // ---------------------------------------------------------------------

`ifdef EXT_F_ENABLE
    VX_operands_if fpu_operands_if[`ISSUE_WIDTH]();

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign fpu_operands_if[i].data  = operands_if[i].data;
        assign fpu_operands_if[i].valid = (operands_if[i].data.ex_type == `EX_FPU) && operands_if[i].valid;

        `RESET_RELAY (fpu_reset, reset);

        VX_elastic_buffer #(
            .SIZE    (2),
            .OUT_REG (2),
            .DATAW   (DATAW)
        ) fpu_buffer (
            .clk       (clk),
            .reset     (fpu_reset),
            .valid_in  (fpu_operands_if[i].valid),
            .data_in   (`TO_DISPATCH_DATA(fpu_operands_if[i].data, last_active_tid[i])),
            .ready_in  (fpu_operands_if[i].ready),
            .valid_out (fpu_dispatch_if[i].valid),
            .data_out  (fpu_dispatch_if[i].data),
            .ready_out (fpu_dispatch_if[i].ready)
        );
    end
`endif

    // ---------------------------------------------------------------------
    // SFU dispatch
    // ---------------------------------------------------------------------

    VX_operands_if sfu_operands_if[`ISSUE_WIDTH]();

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign sfu_operands_if[i].data  = operands_if[i].data;
        assign sfu_operands_if[i].valid = (operands_if[i].data.ex_type == `EX_SFU) && operands_if[i].valid;

        `RESET_RELAY (sfu_reset, reset);

        VX_elastic_buffer #(
            .SIZE    (2),
            .OUT_REG (2),
            .DATAW   (DATAW)
        ) sfu_buffer (
            .clk       (clk),
            .reset     (sfu_reset),
            .valid_in  (sfu_operands_if[i].valid),
            .data_in   (`TO_DISPATCH_DATA(sfu_operands_if[i].data, last_active_tid[i])),
            .ready_in  (sfu_operands_if[i].ready),
            .valid_out (sfu_dispatch_if[i].valid),
            .data_out  (sfu_dispatch_if[i].data),
            .ready_out (sfu_dispatch_if[i].ready)
        );
    end

    // ---------------------------------------------------------------------
    // Back-pressure: a slot is ready when its target unit's buffer is ready
    // ---------------------------------------------------------------------

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign operands_if[i].ready = ((operands_if[i].data.ex_type == `EX_ALU) && alu_operands_if[i].ready)
                                   || ((operands_if[i].data.ex_type == `EX_LSU) && lsu_operands_if[i].ready)
                                `ifdef EXT_F_ENABLE
                                   || ((operands_if[i].data.ex_type == `EX_FPU) && fpu_operands_if[i].ready)
                                `endif
                                   || ((operands_if[i].data.ex_type == `EX_SFU) && sfu_operands_if[i].ready);
    end

`ifdef PERF_ENABLE
    // ---------------------------------------------------------------------
    // Per-unit stall counters
    // ---------------------------------------------------------------------

    wire [`ISSUE_WIDTH-1:0][`EX_BITS-1:0] operands_ex_type;
    wire [`ISSUE_WIDTH-1:0]               operands_stall;

    reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_n, perf_stalls_r;

    for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
        assign operands_ex_type[i] = operands_if[i].data.ex_type;
        assign operands_stall[i]   = ~operands_if[i].ready && operands_if[i].valid;
    end

    // next value: one increment per stalled slot, applied to the counter of
    // the unit that slot is waiting for
    always @(*) begin
        perf_stalls_n = perf_stalls_r;
        for (integer slot=0; slot < `ISSUE_WIDTH; ++slot) begin
            if (operands_stall[slot]) begin
                perf_stalls_n[operands_ex_type[slot]] = perf_stalls_n[operands_ex_type[slot]] + `PERF_CTR_BITS'(1);
            end
        end
    end

    always @(posedge clk) begin
        if (reset) begin
            perf_stalls_r <= '0;
        end else begin
            perf_stalls_r <= perf_stalls_n;
        end
    end

    // unpack the counters onto the output port array
    for (genvar u=0; u < `NUM_EX_UNITS; ++u) begin
        assign perf_stalls[u] = perf_stalls_r[u];
    end
`endif

`ifdef DBG_TRACE_CORE_PIPELINE
    // log every request accepted from an issue slot
    for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
        always @(posedge clk) begin
            if (operands_if[i].valid && operands_if[i].ready) begin
                `TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), operands_if[i].data.PC));
                trace_ex_type(1, operands_if[i].data.ex_type);
                `TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.op_mod, operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
                `TRACE_ARRAY1D(1, operands_if[i].data.rs1_data, `NUM_THREADS);
                `TRACE(1, (", rs2_data="));
                `TRACE_ARRAY1D(1, operands_if[i].data.rs2_data, `NUM_THREADS);
                `TRACE(1, (", rs3_data="));
                `TRACE_ARRAY1D(1, operands_if[i].data.rs3_data, `NUM_THREADS);
                `TRACE(1, (" (#%0d)\n", operands_if[i].data.uuid));
            end
        end
    end
`endif

endmodule

View File

@@ -0,0 +1,256 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_dispatch_unit
//
// Connects `ISSUE_WIDTH dispatch inputs to BLOCK_SIZE execute outputs.
//  - A batch counter (batch_idx) selects which group of BLOCK_SIZE input slots
//    is currently being served when `ISSUE_WIDTH / BLOCK_SIZE != 1.
//  - When `NUM_THREADS != NUM_LANES, each input instruction is split into
//    NUM_PACKETS packets of NUM_LANES threads; packets whose thread mask is
//    all-zero are never selected for sending.
//  - Each output carries the packet id (pid) plus start/end-of-packet flags.
//
// Parameters:
//   BLOCK_SIZE - number of execute outputs driven by this unit
//   NUM_LANES  - number of threads carried by one execute packet
//   OUT_REG    - forwarded to the output elastic buffer sizing macros
//   MAX_FANOUT - threshold used to decide whether the packet-select path is
//                registered (see FANOUT_ENABLE)
module VX_dispatch_unit import VX_gpu_pkg::*; #(
parameter BLOCK_SIZE = 1,
parameter NUM_LANES = 1,
parameter OUT_REG = 0,
parameter MAX_FANOUT = `MAX_FANOUT
) (
input wire clk,
input wire reset,
// inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
// outputs
VX_execute_if.master execute_if [BLOCK_SIZE]
);
// `NUM_THREADS must be an exact multiple of NUM_LANES
`STATIC_ASSERT ((`NUM_THREADS == NUM_LANES * (`NUM_THREADS / NUM_LANES)), ("invalid parameter"))
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
localparam NUM_PACKETS = `NUM_THREADS / NUM_LANES;
localparam PID_BITS = `CLOG2(NUM_PACKETS);
localparam PID_WIDTH = `UP(PID_BITS);
localparam BATCH_COUNT = `ISSUE_WIDTH / BLOCK_SIZE;
localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT);
localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH);
// flattened widths of the input (dispatch) and output (execute) data payloads
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
// add a register stage on the packet-select path when the thread count is large relative to MAX_FANOUT
localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT/2));
// bit offset of the thread-mask field inside the flattened input data
localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
// bit offset of the register-operand fields (lowest bits of the input data)
localparam DATA_REGS_OFF = 0;
// flatten the input interface array into plain vectors so it can be indexed at runtime
wire [`ISSUE_WIDTH-1:0] dispatch_valid;
wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data;
wire [`ISSUE_WIDTH-1:0] dispatch_ready;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign dispatch_valid[i] = dispatch_if[i].valid;
assign dispatch_data[i] = dispatch_if[i].data;
assign dispatch_if[i].ready = dispatch_ready[i];
end
// per-block state exported by the generate loop below
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
wire [BLOCK_SIZE-1:0] block_ready;
wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask;
wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs;
wire [BLOCK_SIZE-1:0][PID_WIDTH-1:0] block_pid;
wire [BLOCK_SIZE-1:0] block_sop;
wire [BLOCK_SIZE-1:0] block_eop;
wire [BLOCK_SIZE-1:0] block_done;
// the current batch is finished once every block reports done
wire batch_done = (& block_done);
// batch counter: advances to the next group of input slots when the batch completes
logic [BATCH_COUNT_W-1:0] batch_idx;
if (BATCH_COUNT != 1) begin
always @(posedge clk) begin
if (reset) begin
batch_idx <= '0;
end else if (batch_done) begin
batch_idx <= batch_idx + BATCH_COUNT_W'(1);
end
end
end else begin
assign batch_idx = 0;
`UNUSED_VAR (batch_done)
end
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
// input slot served by this block in the current batch
wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
assign issue_indices[block_idx] = issue_idx;
// valid/ready handshake into this block's output buffer
wire valid_p, ready_p;
if (`NUM_THREADS != NUM_LANES) begin
// ---- packetized path: the instruction is sent as several NUM_LANES-wide packets ----
// sent_mask_p: one bit per packet, set once that packet has been sent
reg [NUM_PACKETS-1:0] sent_mask_p;
// start_p_n: next packet to send; start_p: same after the optional pipe stage; end_p: last packet to send
wire [PID_WIDTH-1:0] start_p_n, start_p, end_p;
wire dispatch_valid_r;
// is_first_p: high until the first packet of the current instruction has fired
reg is_first_p;
wire fire_p = valid_p && ready_p;
wire is_last_p = (start_p == end_p);
wire fire_eop = fire_p && is_last_p;
always @(posedge clk) begin
if (reset) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else begin
// restart tracking when the whole batch is done (multi-batch) or when the last packet fires (single batch)
if ((BATCH_COUNT != 1) ? batch_done : fire_eop) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else if (fire_p) begin
sent_mask_p[start_p] <= 1;
is_first_p <= 0;
end
end
end
// slice the input thread mask and the three operand arrays into per-packet views
wire [NUM_PACKETS-1:0][NUM_LANES-1:0] per_packet_tmask;
wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs;
wire [`NUM_THREADS-1:0] dispatch_tmask = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
// operand layout in the input data, from low to high bits: rs3, rs2, rs1
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
for (genvar j = 0; j < NUM_LANES; ++j) begin
// k: thread index corresponding to lane j of packet i
localparam k = i * NUM_LANES + j;
assign per_packet_tmask[i][j] = dispatch_tmask[k];
assign per_packet_regs[i][0][j] = dispatch_rs1_data[k];
assign per_packet_regs[i][1][j] = dispatch_rs2_data[k];
assign per_packet_regs[i][2][j] = dispatch_rs3_data[k];
end
end
// a packet is worth sending only if at least one of its threads is active
wire [NUM_PACKETS-1:0] packet_valids;
wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
assign packet_valids[i] = (| per_packet_tmask[i]);
assign packet_ids[i] = PID_WIDTH'(i);
end
// next packet to send: selected among active packets that have not been sent yet
// (presumably the lowest-index one, given REVERSE=0 - confirm against VX_find_first)
VX_find_first #(
.N (NUM_PACKETS),
.DATAW (PID_WIDTH),
.REVERSE (0)
) find_first (
.valid_in (packet_valids & ~sent_mask_p),
.data_in (packet_ids),
.data_out (start_p_n),
`UNUSED_PIN (valid_out)
);
// last packet of the instruction: selected among all active packets
// (presumably the highest-index one, given REVERSE=1 - confirm against VX_find_first)
VX_find_first #(
.N (NUM_PACKETS),
.DATAW (PID_WIDTH),
.REVERSE (1)
) find_last (
.valid_in (packet_valids),
.data_in (packet_ids),
.data_out (end_p),
`UNUSED_PIN (valid_out)
);
// optional register stage (DEPTH is 1 only when FANOUT_ENABLE) on {valid, selected packet id}
VX_pipe_register #(
.DATAW (1 + PID_WIDTH),
.RESETW (1),
.DEPTH (FANOUT_ENABLE ? 1 : 0)
) pipe_reg (
.clk (clk),
.reset (reset || fire_p), // should flush on fire
.enable (1'b1),
.data_in ({dispatch_valid[issue_idx], start_p_n}),
.data_out ({dispatch_valid_r, start_p})
);
// payload of the currently selected packet
wire [NUM_LANES-1:0] tmask_p = per_packet_tmask[start_p];
wire [2:0][NUM_LANES-1:0][`XLEN-1:0] regs_p = per_packet_regs[start_p];
// with multiple batches, stop issuing once every packet bit is marked as sent
wire block_enable = (BATCH_COUNT == 1 || ~(& sent_mask_p));
assign valid_p = dispatch_valid_r && block_enable;
assign block_tmask[block_idx] = tmask_p;
assign block_regs[block_idx] = regs_p;
assign block_pid[block_idx] = start_p;
assign block_sop[block_idx] = is_first_p;
assign block_eop[block_idx] = is_last_p;
// when the select path is registered, readiness is additionally qualified by the registered valid
if (FANOUT_ENABLE) begin
assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable;
end else begin
assign block_ready[block_idx] = ready_p && block_enable;
end
// this block is done when its input slot is idle or its last packet has been accepted
assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop;
end else begin
// ---- pass-through path: one packet carries all threads ----
assign valid_p = dispatch_valid[issue_idx];
assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
assign block_pid[block_idx] = '0;
assign block_sop[block_idx] = 1'b1;
assign block_eop[block_idx] = 1'b1;
assign block_ready[block_idx] = ready_p;
assign block_done[block_idx] = ~valid_p || ready_p;
end
// wsi: issue-slice index passed to wis_to_wid, built from the batch index and the block index
wire [ISSUE_IDX_W-1:0] wsi;
if (BATCH_COUNT != 1) begin
if (BLOCK_SIZE != 1) begin
assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)};
end else begin
assign wsi = batch_idx;
end
end else begin
assign wsi = block_idx;
end
`RESET_RELAY(buf_out_reset, reset);
// convert the wis field (located just above the thread mask) into a warp id
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi);
// output buffer; data_in is the input payload with wis replaced by block_wid, the thread mask and
// operands replaced by the selected packet's slices, and {pid, sop, eop} appended
VX_elastic_buffer #(
.DATAW (OUT_DATAW),
.SIZE (`OUT_REG_TO_EB_SIZE(OUT_REG)),
.OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
) buf_out (
.clk (clk),
.reset (buf_out_reset),
.valid_in (valid_p),
.ready_in (ready_p),
.data_in ({
dispatch_data[issue_idx][IN_DATAW-1 : DATA_TMASK_OFF+`NUM_THREADS+ISSUE_WIS_W],
block_wid,
block_tmask[block_idx],
dispatch_data[issue_idx][DATA_TMASK_OFF-1 : DATA_REGS_OFF + 3 * `NUM_THREADS * `XLEN],
block_regs[block_idx][0],
block_regs[block_idx][1],
block_regs[block_idx][2],
block_pid[block_idx],
block_sop[block_idx],
block_eop[block_idx]}),
.data_out (execute_if[block_idx].data),
.valid_out (execute_if[block_idx].valid),
.ready_out (execute_if[block_idx].ready)
);
end
// input-side ready: only the slots served in the current batch can be acknowledged,
// and only on the end-of-packet beat of their block
reg [`ISSUE_WIDTH-1:0] ready_in;
always @(*) begin
ready_in = 0;
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
ready_in[issue_indices[i]] = block_ready[i] && block_eop[i];
end
end
assign dispatch_ready = ready_in;
endmodule

137
hw/rtl/core/VX_execute.sv Normal file
View File

@@ -0,0 +1,137 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_execute
//
// Structural wrapper for the execute stage. Instantiates the ALU, LSU, SFU and
// (when EXT_F_ENABLE is defined) FPU units, gives each one its own reset signal
// through the RESET_RELAY macro, and derives the sim_ebreak helper output.
//
// Parameters:
//   CORE_ID - forwarded unchanged to every sub-unit
module VX_execute import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
`SCOPE_IO_DECL
input wire clk,
input wire reset,
input base_dcrs_t base_dcrs,
// Dcache interface
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
// commit interface
VX_commit_csr_if.slave commit_csr_if,
// fetch interface
VX_sched_csr_if.slave sched_csr_if,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
`endif
`ifdef EXT_F_ENABLE
VX_dispatch_if.slave fpu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master fpu_commit_if [`ISSUE_WIDTH],
`endif
VX_dispatch_if.slave alu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master alu_commit_if [`ISSUE_WIDTH],
VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS],
VX_dispatch_if.slave lsu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master lsu_commit_if [`ISSUE_WIDTH],
VX_dispatch_if.slave sfu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master sfu_commit_if [`ISSUE_WIDTH],
VX_warp_ctl_if.master warp_ctl_if,
// simulation helper signals
output wire sim_ebreak
);
`ifdef EXT_F_ENABLE
// link between the FPU and the SFU unit (both connect to fpu_to_csr_if below)
VX_fpu_to_csr_if fpu_to_csr_if[`NUM_FPU_BLOCKS]();
`endif
// one relayed reset per execution unit
`RESET_RELAY (alu_reset, reset);
`RESET_RELAY (lsu_reset, reset);
`RESET_RELAY (sfu_reset, reset);
// integer ALU / branch unit
VX_alu_unit #(
.CORE_ID (CORE_ID)
) alu_unit (
.clk (clk),
.reset (alu_reset),
.dispatch_if (alu_dispatch_if),
.branch_ctl_if (branch_ctl_if),
.commit_if (alu_commit_if)
);
// scope plumbing for the LSU; SCOPE_IO_SWITCH must precede the SCOPE_IO_BIND used in the instance below
`SCOPE_IO_SWITCH (1)
// load/store unit, connected to the data cache bus
VX_lsu_unit #(
.CORE_ID (CORE_ID)
) lsu_unit (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (lsu_reset),
.cache_bus_if (dcache_bus_if),
.dispatch_if (lsu_dispatch_if),
.commit_if (lsu_commit_if)
);
`ifdef EXT_F_ENABLE
`RESET_RELAY (fpu_reset, reset);
// floating-point unit (only built when EXT_F_ENABLE is defined)
VX_fpu_unit #(
.CORE_ID (CORE_ID)
) fpu_unit (
.clk (clk),
.reset (fpu_reset),
.dispatch_if (fpu_dispatch_if),
.fpu_to_csr_if (fpu_to_csr_if),
.commit_if (fpu_commit_if)
);
`endif
// special-function unit: also receives the CSR, scheduler and warp-control interfaces
VX_sfu_unit #(
.CORE_ID (CORE_ID)
) sfu_unit (
.clk (clk),
.reset (sfu_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if (pipeline_perf_if),
`endif
.base_dcrs (base_dcrs),
.dispatch_if (sfu_dispatch_if),
`ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if),
`endif
.commit_csr_if (commit_csr_if),
.sched_csr_if (sched_csr_if),
.warp_ctl_if (warp_ctl_if),
.commit_if (sfu_commit_if)
);
// simulation helper signal to get RISC-V tests Pass/Fail status
// asserted when ALU dispatch slot 0 hands over (valid && ready) a branch-class
// instruction with wis == 0 whose op type is EBREAK or ECALL
assign sim_ebreak = alu_dispatch_if[0].valid && alu_dispatch_if[0].ready
&& alu_dispatch_if[0].data.wis == 0
&& `INST_ALU_IS_BR(alu_dispatch_if[0].data.op_mod)
&& (`INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_EBREAK
|| `INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_ECALL);
endmodule

184
hw/rtl/core/VX_fetch.sv Normal file
View File

@@ -0,0 +1,184 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_fetch
//
// Turns scheduled warps (schedule_if) into instruction-cache requests and
// forwards cache responses to the decode stage (fetch_if).
//  - The request tag carries {uuid, wid}; the PC and thread mask of each
//    in-flight request are kept in a small RAM indexed by wid and read back
//    with the wid returned in the response tag.
//  - A per-issue-slice pending counter blocks new requests while the
//    corresponding instruction buffer could fill up.
module VX_fetch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
`SCOPE_IO_DECL
input wire clk,
input wire reset,
// Icache interface
VX_mem_bus_if.master icache_bus_if,
// inputs
VX_schedule_if.slave schedule_if,
// outputs
VX_fetch_if.master fetch_if
);
`UNUSED_PARAM (CORE_ID)
// NOTE(review): reset is marked unused here although it drives pending_reads and req_buf below
`UNUSED_VAR (reset)
localparam ISW_WIDTH = `LOG2UP(`ISSUE_WIDTH);
wire icache_req_valid;
wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr;
wire [ICACHE_TAG_WIDTH-1:0] icache_req_tag;
wire icache_req_ready;
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [`NW_WIDTH-1:0] req_tag, rsp_tag;
// a request is accepted by the request buffer this cycle
wire icache_req_fire = icache_req_valid && icache_req_ready;
// issue slice that the scheduled warp maps to
wire [ISW_WIDTH-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid);
// the warp id doubles as the request tag and as the tag_store address
assign req_tag = schedule_if.data.wid;
assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
wire [`XLEN-1:0] rsp_PC;
wire [`NUM_THREADS-1:0] rsp_tmask;
// per-warp storage of {PC, tmask}: written when a request fires, read with the response's wid
VX_dp_ram #(
.DATAW (`XLEN + `NUM_THREADS),
.SIZE (`NUM_WARPS),
.LUTRAM (1)
) tag_store (
.clk (clk),
.read (1'b1),
.write (icache_req_fire),
`UNUSED_PIN (wren),
.waddr (req_tag),
.wdata ({schedule_if.data.PC, schedule_if.data.tmask}),
.raddr (rsp_tag),
.rdata ({rsp_PC, rsp_tmask})
);
// Ensure that the ibuffer doesn't fill up.
// This resolves a potential deadlock if the ibuffer fills and the LSU stalls the execute stage due to a pending dcache request.
// This issue is particularly prevalent when the icache and dcache are disabled and both requests share the same bus.
wire [`ISSUE_WIDTH-1:0] pending_ibuf_full;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
// counts requests issued for slice i that have not yet been popped from its ibuffer
VX_pending_size #(
.SIZE (`IBUF_SIZE)
) pending_reads (
.clk (clk),
.reset (reset),
.incr (icache_req_fire && schedule_isw == i),
.decr (fetch_if.ibuf_pop[i]),
.full (pending_ibuf_full[i]),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
);
end
// a valid schedule entry must never carry PC == 0
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
// Icache Request
// requests are held back while the target slice's pending counter is full
wire ibuf_ready = ~pending_ibuf_full[schedule_isw];
assign icache_req_valid = schedule_if.valid && ibuf_ready;
// the two low PC bits are dropped to form the request address
assign icache_req_addr = schedule_if.data.PC[`MEM_ADDR_WIDTH-1:2];
assign icache_req_tag = {schedule_if.data.uuid, req_tag};
assign schedule_if.ready = icache_req_ready && ibuf_ready;
VX_elastic_buffer #(
.DATAW (ICACHE_ADDR_WIDTH + ICACHE_TAG_WIDTH),
.SIZE (2),
.OUT_REG (1) // external bus should be registered
) req_buf (
.clk (clk),
.reset (reset),
.valid_in (icache_req_valid),
.ready_in (icache_req_ready),
.data_in ({icache_req_addr, icache_req_tag}),
.data_out ({icache_bus_if.req_data.addr, icache_bus_if.req_data.tag}),
.valid_out (icache_bus_if.req_valid),
.ready_out (icache_bus_if.req_ready)
);
// instruction fetches are always reads with all four byte enables set
assign icache_bus_if.req_data.rw = 0;
assign icache_bus_if.req_data.byteen = 4'b1111;
assign icache_bus_if.req_data.data = '0;
// Icache Response
// the response is forwarded unbuffered; PC and tmask come from tag_store
assign fetch_if.valid = icache_bus_if.rsp_valid;
assign fetch_if.data.tmask = rsp_tmask;
assign fetch_if.data.wid = rsp_tag;
assign fetch_if.data.PC = rsp_PC;
assign fetch_if.data.instr = icache_bus_if.rsp_data.data;
assign fetch_if.data.uuid = rsp_uuid;
assign icache_bus_if.rsp_ready = fetch_if.ready;
// optional debug instrumentation, only instantiated for core 0
`ifdef DBG_SCOPE_FETCH
if (CORE_ID == 0) begin
`ifdef SCOPE
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
VX_scope_tap #(
.SCOPE_ID (1),
.TRIGGERW (4),
.PROBEW (3*`UUID_WIDTH + 108)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({
reset,
schedule_fire,
icache_req_fire,
icache_rsp_fire
}),
.probes({
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag
}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
ila_fetch ila_fetch_inst (
.clk (clk),
.probe0 ({reset, schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, schedule_if.ready, schedule_if.valid}),
.probe1 ({icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, icache_bus_if.req_ready, icache_bus_if.req_valid}),
.probe2 ({icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag, icache_bus_if.rsp_ready, icache_bus_if.rsp_valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif
// optional text trace of accepted requests and delivered responses
`ifdef DBG_TRACE_CORE_ICACHE
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire fetch_fire = fetch_if.valid && fetch_if.ready;
always @(posedge clk) begin
if (schedule_fire) begin
`TRACE(1, ("%d: I$%0d req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, schedule_if.data.wid, schedule_if.data.PC, schedule_if.data.tmask, schedule_if.data.uuid));
end
if (fetch_fire) begin
`TRACE(1, ("%d: I$%0d rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
end
end
`endif
endmodule

View File

@@ -0,0 +1,129 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_gather_unit
//
// Routes BLOCK_SIZE commit inputs onto `ISSUE_WIDTH commit outputs.
//  - Each input is steered to the output selected by an issue-slice index
//    (commit_in_wsi) derived from its warp-id field and/or its block index.
//  - Every output goes through an elastic buffer.
//  - NUM_LANES-wide packets are expanded to `NUM_THREADS-wide thread mask and
//    data using the packet id (pid) as the lane-group position.
//
// Parameters:
//   BLOCK_SIZE - number of commit inputs
//   NUM_LANES  - number of threads per incoming packet
//   OUT_REG    - forwarded to the output elastic buffer sizing macros
module VX_gather_unit import VX_gpu_pkg::*; #(
parameter BLOCK_SIZE = 1,
parameter NUM_LANES = 1,
parameter OUT_REG = 0
) (
input wire clk,
input wire reset,
// inputs
VX_commit_if.slave commit_in_if [BLOCK_SIZE],
// outputs
VX_commit_if.master commit_out_if [`ISSUE_WIDTH]
);
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
// flattened width of one NUM_LANES-wide commit payload
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
// bit offset of the field that follows the uuid at the top of the payload
// (presumably the warp id, matching the {uuid, wid, ...} order used for the output below - confirm against VX_commit_if)
localparam DATA_WIS_OFF = DATAW - (`UUID_WIDTH + `NW_WIDTH);
// flatten the input interface array so it can be indexed at runtime
wire [BLOCK_SIZE-1:0] commit_in_valid;
wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data;
wire [BLOCK_SIZE-1:0] commit_in_ready;
// destination output index for each input
wire [BLOCK_SIZE-1:0][ISSUE_IDX_W-1:0] commit_in_wsi;
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
assign commit_in_valid[i] = commit_in_if[i].valid;
assign commit_in_data[i] = commit_in_if[i].data;
assign commit_in_if[i].ready = commit_in_ready[i];
if (BLOCK_SIZE != `ISSUE_WIDTH) begin
if (BLOCK_SIZE != 1) begin
// upper index bits come from the payload, lower bits are the block index
assign commit_in_wsi[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_IDX_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
end else begin
// single block: the whole index comes from the payload
assign commit_in_wsi[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_IDX_W];
end
end else begin
// one block per output: the index is simply the block index
assign commit_in_wsi[i] = BLOCK_SIZE_W'(i);
end
end
reg [`ISSUE_WIDTH-1:0] commit_out_valid;
reg [`ISSUE_WIDTH-1:0][DATAW-1:0] commit_out_data;
wire [`ISSUE_WIDTH-1:0] commit_out_ready;
// steer each input to its destination output; outputs that are not targeted stay invalid with don't-care data
always @(*) begin
commit_out_valid = '0;
for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
commit_out_data[i] = 'x;
end
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
commit_out_valid[commit_in_wsi[i]] = commit_in_valid[i];
commit_out_data[commit_in_wsi[i]] = commit_in_data[i];
end
end
// each input sees the ready of the output it is steered to
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
assign commit_in_ready[i] = commit_out_ready[commit_in_wsi[i]];
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
// NUM_LANES-wide view of the buffered payload
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_tmp_if();
`RESET_RELAY(commit_out_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`OUT_REG_TO_EB_SIZE(OUT_REG)),
.OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
) out_buf (
.clk (clk),
.reset (commit_out_reset),
.valid_in (commit_out_valid[i]),
.ready_in (commit_out_ready[i]),
.data_in (commit_out_data[i]),
.data_out (commit_tmp_if.data),
.valid_out (commit_tmp_if.valid),
.ready_out (commit_tmp_if.ready)
);
// expand the packet to full thread width
logic [`NUM_THREADS-1:0] commit_tmask_r;
logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_r;
if (PID_BITS != 0) begin
// place lane j of the packet at thread position pid * NUM_LANES + j;
// all other threads get a zero mask bit and don't-care data
always @(*) begin
commit_tmask_r = '0;
commit_data_r = 'x;
for (integer j = 0; j < NUM_LANES; ++j) begin
commit_tmask_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.tmask[j];
commit_data_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.data[j];
end
end
end else begin
// a single packet already covers every thread
assign commit_tmask_r = commit_tmp_if.data.tmask;
assign commit_data_r = commit_tmp_if.data.data;
end
assign commit_out_if[i].valid = commit_tmp_if.valid;
// rebuild the full-width payload; the pid field is driven to zero on the output side
assign commit_out_if[i].data = {
commit_tmp_if.data.uuid,
commit_tmp_if.data.wid,
commit_tmask_r,
commit_tmp_if.data.PC,
commit_tmp_if.data.wb,
commit_tmp_if.data.rd,
commit_data_r,
1'b0, // PID
commit_tmp_if.data.sop,
commit_tmp_if.data.eop
};
assign commit_tmp_if.ready = commit_out_if[i].ready;
end
endmodule

73
hw/rtl/core/VX_ibuffer.sv Normal file
View File

@@ -0,0 +1,73 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_ibuffer
//
// Per-issue-slice instruction buffers. A decoded instruction is pushed into
// the buffer of the slice its warp id maps to (wid_to_isw); the warp id is
// stored in its per-slice form (wid_to_wis). Each buffer drives one entry of
// ibuffer_if, and a pop strobe per slice is reported back on
// decode_if.ibuf_pop.
module VX_ibuffer import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0
) (
    input wire clk,
    input wire reset,
    // inputs
    VX_decode_if.slave decode_if,
    // outputs
    VX_ibuffer_if.master ibuffer_if [`ISSUE_WIDTH]
);
    `UNUSED_PARAM (CORE_ID)

    // width of the slice selector
    localparam SLICE_SEL_W = `LOG2UP(`ISSUE_WIDTH);

    // width of one buffered entry (sum of all fields packed into entry_in)
    localparam ENTRY_W = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4);

    // slice targeted by the incoming instruction, and its warp index within that slice
    wire [SLICE_SEL_W-1:0] target_slice;
    wire [ISSUE_WIS_W-1:0] slice_wis;
    assign target_slice = wid_to_isw(decode_if.data.wid);
    assign slice_wis    = wid_to_wis(decode_if.data.wid);

    // the entry written to a buffer is the same for every slice; only the push strobe differs
    wire [ENTRY_W-1:0] entry_in;
    assign entry_in = {
        decode_if.data.uuid,
        slice_wis,
        decode_if.data.tmask,
        decode_if.data.ex_type,
        decode_if.data.op_type,
        decode_if.data.op_mod,
        decode_if.data.wb,
        decode_if.data.use_PC,
        decode_if.data.use_imm,
        decode_if.data.PC,
        decode_if.data.imm,
        decode_if.data.rd,
        decode_if.data.rs1,
        decode_if.data.rs2,
        decode_if.data.rs3
    };

    // decode sees the ready of the buffer it is currently targeting
    wire [`ISSUE_WIDTH-1:0] slice_ready;
    assign decode_if.ready = slice_ready[target_slice];

    for (genvar s = 0; s < `ISSUE_WIDTH; ++s) begin
        // push only into the slice selected by the incoming warp id
        wire slice_push = decode_if.valid && target_slice == s;

        // a pop is a completed handshake on the slice's output
        assign decode_if.ibuf_pop[s] = ibuffer_if[s].valid && ibuffer_if[s].ready;

        VX_elastic_buffer #(
            .DATAW   (ENTRY_W),
            .SIZE    (`IBUF_SIZE),
            .OUT_REG (1)
        ) instr_buf (
            .clk       (clk),
            .reset     (reset),
            .valid_in  (slice_push),
            .ready_in  (slice_ready[s]),
            .data_in   (entry_in),
            .data_out  (ibuffer_if[s].data),
            .valid_out (ibuffer_if[s].valid),
            .ready_out (ibuffer_if[s].ready)
        );
    end

endmodule

191
hw/rtl/core/VX_int_unit.sv Normal file
View File

@@ -0,0 +1,191 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_int_unit
//
// Integer ALU with branch resolution for one execute block.
//  - Computes add, subtract/compare, shift-right and misc (AND/OR/XOR/SLL)
//    results for every lane, plus 32-bit "W" variants, and selects one of
//    them per lane from {is_alu_w, op_class}.
//  - Results and branch metadata pass through an elastic buffer to commit_if.
//  - Branch outcome is resolved after the buffer from the result of one
//    selected lane (tid) and sent out on branch_ctl_if.
//
// Parameters:
//   CORE_ID   - only used in trace output
//   BLOCK_IDX - passed to ASSIGN_BLOCKED_WID when computing the branch warp id
//   NUM_LANES - number of threads processed per packet
module VX_int_unit #(
parameter CORE_ID = 0,
parameter BLOCK_IDX = 0,
parameter NUM_LANES = 1
) (
input wire clk,
input wire reset,
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_commit_if.master commit_if,
VX_branch_ctl_if.master branch_ctl_if
);
`UNUSED_PARAM (CORE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam LANE_WIDTH = `UP(LANE_BITS);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
// number of shift-amount bits for full-width shifts
localparam SHIFT_IMM_BITS = `CLOG2(`XLEN);
`UNUSED_VAR (execute_if.data.rs3_data)
// per-lane candidate results (full XLEN width)
wire [NUM_LANES-1:0][`XLEN-1:0] add_result;
wire [NUM_LANES-1:0][`XLEN:0] sub_result; // +1 bit for branch compare
wire [NUM_LANES-1:0][`XLEN-1:0] shr_result;
reg [NUM_LANES-1:0][`XLEN-1:0] msc_result;
// per-lane candidate results for the 32-bit "W" operations
wire [NUM_LANES-1:0][`XLEN-1:0] add_result_w;
wire [NUM_LANES-1:0][`XLEN-1:0] sub_result_w;
wire [NUM_LANES-1:0][`XLEN-1:0] shr_result_w;
reg [NUM_LANES-1:0][`XLEN-1:0] msc_result_w;
// selected result before (alu_result) and after (alu_result_r) the response buffer
reg [NUM_LANES-1:0][`XLEN-1:0] alu_result;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_result_r;
// "W" operations only exist when XLEN_64 is defined
`ifdef XLEN_64
wire is_alu_w = `INST_ALU_IS_W(execute_if.data.op_mod);
`else
wire is_alu_w = 0;
`endif
`UNUSED_VAR (execute_if.data.op_mod)
// the same op_type field is interpreted either as an ALU op or as a branch op
wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_BITS'(execute_if.data.op_type);
wire [`INST_BR_BITS-1:0] br_op = `INST_BR_BITS'(execute_if.data.op_type);
wire is_br_op = `INST_ALU_IS_BR(execute_if.data.op_mod);
wire is_sub_op = `INST_ALU_IS_SUB(alu_op);
wire is_signed = `INST_ALU_SIGNED(alu_op);
// 2-bit class selecting which candidate result is used (see the case statement below)
wire [1:0] op_class = is_br_op ? `INST_BR_CLASS(alu_op) : `INST_ALU_CLASS(alu_op);
// operand selection: PC may replace rs1, the immediate may replace rs2;
// for branches the compare always uses rs2 (alu_in2_br)
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1 = execute_if.data.rs1_data;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2 = execute_if.data.rs2_data;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1_PC = execute_if.data.use_PC ? {NUM_LANES{execute_if.data.PC}} : alu_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.use_imm ? {NUM_LANES{execute_if.data.imm}} : alu_in2;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.use_imm && ~is_br_op) ? {NUM_LANES{execute_if.data.imm}} : alu_in2;
// addition; the W variant adds the low 32 bits and sign-extends the 32-bit sum to XLEN
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
assign add_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] + alu_in2_imm[i][31:0]));
end
// subtraction on operands extended by one bit (sign bit when is_signed, else zero),
// so that bit XLEN of the difference can be used as the less-than flag
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] sub_in1 = {is_signed & alu_in1[i][`XLEN-1], alu_in1[i]};
wire [`XLEN:0] sub_in2 = {is_signed & alu_in2_br[i][`XLEN-1], alu_in2_br[i]};
assign sub_result[i] = sub_in1 - sub_in2;
assign sub_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] - alu_in2_imm[i][31:0]));
end
// shift right: the operand is extended by one bit (sign bit when is_signed, else zero)
// and shifted arithmetically, which covers both logical and arithmetic shifts
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] shr_in1 = {is_signed && alu_in1[i][`XLEN-1], alu_in1[i]};
assign shr_result[i] = `XLEN'($signed(shr_in1) >>> alu_in2_imm[i][SHIFT_IMM_BITS-1:0]);
wire [32:0] shr_in1_w = {is_signed && alu_in1[i][31], alu_in1[i][31:0]};
wire [31:0] shr_res_w = 32'($signed(shr_in1_w) >>> alu_in2_imm[i][4:0]);
assign shr_result_w[i] = `XLEN'($signed(shr_res_w));
end
// misc operations selected by the two low bits of alu_op; the W variant is a 32-bit shift-left
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
case (alu_op[1:0])
2'b00: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; // AND
2'b01: msc_result[i] = alu_in1[i] | alu_in2_imm[i]; // OR
2'b10: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i]; // XOR
2'b11: msc_result[i] = alu_in1[i] << alu_in2_imm[i][SHIFT_IMM_BITS-1:0]; // SLL
endcase
end
assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0]));
end
// final per-lane result selection
for (genvar i = 0; i < NUM_LANES; ++i) begin
// compare result: bit 0 = less-than flag (extra bit of the difference),
// bit 1 = equal flag (difference is zero; only set for branch ops)
wire [`XLEN-1:0] slt_br_result = `XLEN'({is_br_op && ~(| sub_result[i][`XLEN-1:0]), sub_result[i][`XLEN]});
// plain subtract returns the difference; set-less-than and branches return the flags
wire [`XLEN-1:0] sub_slt_br_result = (is_sub_op && ~is_br_op) ? sub_result[i][`XLEN-1:0] : slt_br_result;
always @(*) begin
case ({is_alu_w, op_class})
3'b000: alu_result[i] = add_result[i]; // ADD, LUI, AUIPC
3'b001: alu_result[i] = sub_slt_br_result; // SUB, SLTU, SLTI, BR*
3'b010: alu_result[i] = shr_result[i]; // SRL, SRA, SRLI, SRAI
3'b011: alu_result[i] = msc_result[i]; // AND, OR, XOR, SLL, SLLI
3'b100: alu_result[i] = add_result_w[i]; // ADDIW, ADDW
3'b101: alu_result[i] = sub_result_w[i]; // SUBW
3'b110: alu_result[i] = shr_result_w[i]; // SRLW, SRAW, SRLIW, SRAIW
3'b111: alu_result[i] = msc_result_w[i]; // SLLW
endcase
end
end
// branch
// values with the _r suffix are the buffered copies coming out of rsp_buf
wire [`XLEN-1:0] PC_r, imm_r;
wire [`INST_BR_BITS-1:0] br_op_r;
// tid: lane whose result is used to resolve the branch (low LANE_BITS of the incoming tid)
wire [LANE_WIDTH-1:0] tid, tid_r;
wire is_br_op_r;
if (LANE_BITS != 0) begin
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin
assign tid = 0;
end
// response buffer: carries the commit fields, the selected results and the branch metadata
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + (NUM_LANES * `XLEN) + `XLEN + `XLEN + 1 + `INST_BR_BITS + LANE_WIDTH)
) rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (execute_if.valid),
.ready_in (execute_if.ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, alu_result, execute_if.data.PC, execute_if.data.imm, is_br_op, br_op, tid}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, alu_result_r, PC_r, imm_r, is_br_op_r, br_op_r, tid_r}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
`UNUSED_VAR (br_op_r)
wire is_br_neg = `INST_BR_IS_NEG(br_op_r);
wire is_br_less = `INST_BR_IS_LESS(br_op_r);
wire is_br_static = `INST_BR_IS_STATIC(br_op_r);
// result of the lane selected by tid_r; bits [1:0] hold the compare flags produced above
wire [`XLEN-1:0] br_result = alu_result_r[tid_r];
wire is_less = br_result[0];
wire is_equal = br_result[1];
// a branch is reported only when the end-of-packet beat of a branch op is committed
wire br_enable = is_br_op_r && commit_if.valid && commit_if.ready && commit_if.data.eop;
// static branches are always taken; otherwise the selected flag is optionally inverted
wire br_taken = ((is_br_less ? is_less : is_equal) ^ is_br_neg) | is_br_static;
// static branches use the computed ALU result as destination, conditional ones use PC + imm
wire [`XLEN-1:0] br_dest = is_br_static ? br_result : (PC_r + imm_r);
wire [`NW_WIDTH-1:0] br_wid;
`ASSIGN_BLOCKED_WID (br_wid, commit_if.data.wid, BLOCK_IDX, `NUM_ALU_BLOCKS)
// branch control output goes through a pipe register
VX_pipe_register #(
.DATAW (1 + `NW_WIDTH + 1 + `XLEN)
) branch_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({br_enable, br_wid, br_taken, br_dest}),
.data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest})
);
// committed data: static branches write back PC + 4, everything else writes the ALU result
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? (PC_r + 4) : alu_result_r[i];
end
assign commit_if.data.PC = PC_r;
`ifdef DBG_TRACE_CORE_PIPELINE
// NOTE(review): branch_ctl_if comes out of branch_reg while commit_if.data does not, so the
// PC/uuid printed here may not belong to the reported branch - confirm against VX_pipe_register's default depth
always @(posedge clk) begin
if (branch_ctl_if.valid) begin
`TRACE(1, ("%d: core%0d-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, CORE_ID, branch_ctl_if.wid, commit_if.data.PC, branch_ctl_if.taken, branch_ctl_if.dest, commit_if.data.uuid));
end
end
`endif
endmodule

View File

@@ -0,0 +1,108 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
// VX_ipdom_stack
//
// Stack whose entries each hold a pair of values (q0, q1) and are popped
// twice before being released:
//  - push stores {q1, q0} at wr_ptr and marks the entry as "not yet visited".
//  - the first pop of an entry returns q1 (d_set = 1) and only marks the
//    entry as visited; the pointers do not move.
//  - the second pop returns q0 (d_set = 0) and releases the entry.
//
// Parameters:
//   WIDTH   - width of each stored value
//   DEPTH   - number of entries
//   OUT_REG - when non-zero, the RAM output is registered and the "visited"
//             flag is delayed by the same number of cycles
//   ADDRW   - pointer width (derived from DEPTH)
//
// Ports:
//   q0, q1  - values stored on push
//   d       - value returned for the entry on top of the stack
//   d_set   - 1 when d is the first (q1) value of the entry, 0 for the second
//   push    - store a new entry (must not be asserted together with pop)
//   pop     - consume one value from the top entry
//   empty   - no entry is stored
//   full    - all DEPTH entries are in use
module VX_ipdom_stack #(
    parameter WIDTH   = 1,
    parameter DEPTH   = 1,
    parameter OUT_REG = 0,
    parameter ADDRW   = `LOG2UP(DEPTH)
) (
    input  wire             clk,
    input  wire             reset,
    input  wire [WIDTH-1:0] q0,
    input  wire [WIDTH-1:0] q1,
    output wire [WIDTH-1:0] d,
    output wire             d_set,
    input  wire             push,
    input  wire             pop,
    output wire             empty,
    output wire             full
);
    // per-entry "visited" flag: 0 after push, 1 after the first pop
    reg slot_set [DEPTH-1:0];

    reg [ADDRW-1:0] rd_ptr, wr_ptr;

    reg empty_r, full_r;

    wire [WIDTH-1:0] d0, d1;

    // visited flag of the entry on top of the stack
    wire d_set_n = slot_set[rd_ptr];

    always @(posedge clk) begin
        if (reset) begin
            rd_ptr  <= '0;
            wr_ptr  <= '0;
            empty_r <= 1;
            full_r  <= 0;
        end else begin
            `ASSERT(~push || ~full, ("runtime error: writing to a full stack!"));
            `ASSERT(~pop || ~empty, ("runtime error: reading an empty stack!"));
            `ASSERT(~push || ~pop,  ("runtime error: push and pop in same cycle not supported!"));
            if (push) begin
                rd_ptr  <= wr_ptr;
                wr_ptr  <= wr_ptr + ADDRW'(1);
                empty_r <= 0;
                full_r  <= (ADDRW'(DEPTH-1) == wr_ptr);
            end else if (pop) begin
                // pointers only move on the second pop of an entry (d_set_n == 1)
                wr_ptr  <= wr_ptr - ADDRW'(d_set_n);
                rd_ptr  <= rd_ptr - ADDRW'(d_set_n);
                empty_r <= (rd_ptr == 0) && (d_set_n == 1);
                // The first pop of an entry frees no slot, so the stack is
                // still full afterwards. Clear the flag only when the entry
                // is actually released; otherwise a following push would
                // overwrite a live entry without tripping the assertion above.
                if (d_set_n) begin
                    full_r <= 0;
                end
            end
        end
    end

    // storage for the value pairs; always reads the entry on top of the stack
    VX_dp_ram #(
        .DATAW   (WIDTH * 2),
        .SIZE    (DEPTH),
        .OUT_REG (OUT_REG ? 1 : 0),
        .LUTRAM  (OUT_REG ? 0 : 1)
    ) store (
        .clk   (clk),
        .read  (1'b1),
        .write (push),
        `UNUSED_PIN (wren),
        .waddr (wr_ptr),
        .wdata ({q1, q0}),
        .raddr (rd_ptr),
        .rdata ({d1, d0})
    );

    // maintain the visited flags: cleared for a freshly pushed entry, set by a pop
    always @(posedge clk) begin
        if (push) begin
            slot_set[wr_ptr] <= 0;
        end else if (pop) begin
            slot_set[rd_ptr] <= 1;
        end
    end

    // delay the visited flag by OUT_REG stages so it lines up with the RAM output
    wire d_set_r;

    VX_pipe_register #(
        .DATAW (1),
        .DEPTH (OUT_REG)
    ) pipe_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (1'b1),
        .data_in  (d_set_n),
        .data_out (d_set_r)
    );

    // first visit returns q1 (d1), second visit returns q0 (d0)
    assign d     = d_set_r ? d0 : d1;
    assign d_set = ~d_set_r;
    assign empty = empty_r;
    assign full  = full_r;

endmodule

180
hw/rtl/core/VX_issue.sv Normal file
View File

@@ -0,0 +1,180 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
// Issue stage wrapper: chains the instruction buffer, scoreboard, operand
// stage and dispatcher, and optionally exposes debug probes and stall
// counters.
//
//   decode_if -> ibuffer -> scoreboard -> operands -> dispatch -> *_dispatch_if
//
// writeback_if is fed to both the scoreboard and the operand stage.
// Each sub-module gets its own relayed copy of `reset`.
module VX_issue #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_DECL

    input wire              clk,
    input wire              reset,

`ifdef PERF_ENABLE
    VX_pipeline_perf_if.issue perf_issue_if,
`endif

    VX_decode_if.slave      decode_if,
    VX_writeback_if.slave   writeback_if [`ISSUE_WIDTH],

    VX_dispatch_if.master   alu_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.master   lsu_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
    VX_dispatch_if.master   fpu_dispatch_if [`ISSUE_WIDTH],
`endif
    VX_dispatch_if.master   sfu_dispatch_if [`ISSUE_WIDTH]
);
    // stage-to-stage links, one per issue slot
    VX_ibuffer_if  ibuffer_if [`ISSUE_WIDTH]();
    VX_ibuffer_if  scoreboard_if [`ISSUE_WIDTH]();
    VX_operands_if operands_if [`ISSUE_WIDTH]();

    `RESET_RELAY (ibuf_reset, reset);
    `RESET_RELAY (scoreboard_reset, reset);
    `RESET_RELAY (operands_reset, reset);
    `RESET_RELAY (dispatch_reset, reset);

    VX_ibuffer #(
        .CORE_ID (CORE_ID)
    ) ibuffer (
        .clk        (clk),
        .reset      (ibuf_reset),
        .decode_if  (decode_if),
        .ibuffer_if (ibuffer_if)
    );

    VX_scoreboard #(
        .CORE_ID (CORE_ID)
    ) scoreboard (
        .clk           (clk),
        .reset         (scoreboard_reset),
        .writeback_if  (writeback_if),
        .ibuffer_if    (ibuffer_if),
        .scoreboard_if (scoreboard_if)
    );

    VX_operands #(
        .CORE_ID (CORE_ID)
    ) operands (
        .clk           (clk),
        .reset         (operands_reset),
        .writeback_if  (writeback_if),
        .scoreboard_if (scoreboard_if),
        .operands_if   (operands_if)
    );

    VX_dispatch #(
        .CORE_ID (CORE_ID)
    ) dispatch (
        .clk            (clk),
        .reset          (dispatch_reset),
    `ifdef PERF_ENABLE
        .perf_stalls    (perf_issue_if.dsp_stalls),
    `endif
        .operands_if    (operands_if),
        .alu_dispatch_if(alu_dispatch_if),
        .lsu_dispatch_if(lsu_dispatch_if),
    `ifdef EXT_F_ENABLE
        .fpu_dispatch_if(fpu_dispatch_if),
    `endif
        .sfu_dispatch_if(sfu_dispatch_if)
    );

`ifdef DBG_SCOPE_ISSUE
    // debug probes, only instantiated for core 0
    if (CORE_ID == 0) begin
    `ifdef SCOPE
        wire operands_if_fire      = operands_if[0].valid && operands_if[0].ready;
        wire operands_if_not_ready = ~operands_if[0].ready;
        wire writeback_if_valid    = writeback_if[0].valid;
        // PROBEW is the sum of the widths of the fields listed in .probes,
        // in the same order; keep the two in sync when editing either one.
        VX_scope_tap #(
            .SCOPE_ID (2),
            .TRIGGERW (4),
            .PROBEW   (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS +
                       1 + `NR_BITS + `XLEN + 1 + 1 + (`NUM_THREADS * 3 * `XLEN) +
                       `UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
        ) scope_tap (
            .clk(clk),
            .reset(scope_reset),
            .start(1'b0),
            .stop(1'b0),
            .triggers({
                reset,
                operands_if_fire,
                operands_if_not_ready,
                writeback_if_valid
            }),
            .probes({
                operands_if[0].data.uuid,
                operands_if[0].data.tmask,
                operands_if[0].data.ex_type,
                operands_if[0].data.op_type,
                operands_if[0].data.op_mod,
                operands_if[0].data.wb,
                operands_if[0].data.rd,
                operands_if[0].data.imm,
                operands_if[0].data.use_PC,
                operands_if[0].data.use_imm,
                operands_if[0].data.rs1_data,
                operands_if[0].data.rs2_data,
                operands_if[0].data.rs3_data,
                writeback_if[0].data.uuid,
                writeback_if[0].data.tmask,
                writeback_if[0].data.rd,
                writeback_if[0].data.data,
                writeback_if[0].data.eop
            }),
            .bus_in(scope_bus_in),
            .bus_out(scope_bus_out)
        );
    `endif
    `ifdef CHIPSCOPE
        // NOTE(review): operands_if and writeback_if are interface arrays in
        // this module but are referenced below without an index or `.data`,
        // and `ibuffer.rs*` names the ibuffer instance - this branch looks
        // stale; confirm it still elaborates before enabling CHIPSCOPE.
        ila_issue ila_issue_inst (
            .clk    (clk),
            .probe0 ({operands_if.uuid, ibuffer.rs3, ibuffer.rs2, ibuffer.rs1, operands_if.PC, operands_if.tmask, operands_if.wid, operands_if.ex_type, operands_if.op_type, operands_if.ready, operands_if.valid}),
            .probe1 ({writeback_if.uuid, writeback_if.data[0], writeback_if.PC, writeback_if.tmask, writeback_if.wid, writeback_if.eop, writeback_if.valid})
        );
    `endif
    end
`else
    `SCOPE_IO_UNUSED()
`endif

`ifdef PERF_ENABLE
    reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
    reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;

    wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
    // driven by continuous assignments below, hence a net rather than a reg
    wire [`ISSUE_WIDTH-1:0] scoreboard_stalls;

    // a slot counts as stalled when the ibuffer offers an instruction that
    // the scoreboard does not accept in this cycle
    for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
        assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
    end

    `POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);

    always @(posedge clk) begin
        if (reset) begin
            perf_ibf_stalls <= '0;
            perf_scb_stalls <= '0;
        end else begin
            // decode has an instruction that the ibuffer cannot take
            if (decode_if.valid && ~decode_if.ready) begin
                perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
            end
            // accumulate the number of stalled issue slots every cycle
            perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
        end
    end

    assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
    assign perf_issue_if.scb_stalls = perf_scb_stalls;
`endif

endmodule

647
hw/rtl/core/VX_lsu_unit.sv Normal file
View File

@@ -0,0 +1,647 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Load/store unit.
//
// Data flow:
//   dispatch_if -> dispatch_unit -> execute_if[0]
//     -> address / byte-enable / store-data formatting
//     -> mem_scheduler -> cache_bus_if
//   loads : scheduler response -> load formatting -> ld_rsp_buf -+
//   stores: committed when the request is issued   -> st_rsp_buf -+-> rsp_arb
//     -> gather_unit -> commit_if
//
// The per-request bookkeeping (uuid, wid, tmask, PC, rd, op_type, alignment,
// pid, packet slot, optional is_dup flag) travels through the scheduler in
// the request tag and is unpacked again from the response tag.
module VX_lsu_unit import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0
) (
    `SCOPE_IO_DECL

    input wire              clk,
    input wire              reset,

    // Dcache interface
    VX_mem_bus_if.master    cache_bus_if [DCACHE_NUM_REQS],

    // inputs
    VX_dispatch_if.slave    dispatch_if [`ISSUE_WIDTH],

    // outputs
    VX_commit_if.master     commit_if [`ISSUE_WIDTH]
);
    localparam BLOCK_SIZE   = 1;
    localparam NUM_LANES    = `NUM_LSU_LANES;
    localparam PID_BITS     = `CLOG2(`NUM_THREADS / NUM_LANES);
    localparam PID_WIDTH    = `UP(PID_BITS);
    localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
    localparam LSUQ_SIZEW   = `LOG2UP(`LSUQ_SIZE);
    localparam MEM_ASHIFT   = `CLOG2(`MEM_BLOCK_SIZE);
    localparam MEM_ADDRW    = `XLEN - MEM_ASHIFT;
    localparam REQ_ASHIFT   = `CLOG2(DCACHE_WORD_SIZE);
    localparam CACHE_TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + LSUQ_TAG_BITS;

    VX_execute_if #(
        .NUM_LANES (NUM_LANES)
    ) execute_if[BLOCK_SIZE]();

    `RESET_RELAY (dispatch_reset, reset);

    VX_dispatch_unit #(
        .BLOCK_SIZE (BLOCK_SIZE),
        .NUM_LANES  (NUM_LANES),
        .OUT_REG    (1)
    ) dispatch_unit (
        .clk        (clk),
        .reset      (dispatch_reset),
        .dispatch_if(dispatch_if),
        .execute_if (execute_if)
    );

    // separate commit streams for stores and loads, merged by rsp_arb below
    VX_commit_if #(
        .NUM_LANES (NUM_LANES)
    ) commit_st_if();

    VX_commit_if #(
        .NUM_LANES (NUM_LANES)
    ) commit_ld_if();

    `UNUSED_VAR (execute_if[0].data.op_mod)
    `UNUSED_VAR (execute_if[0].data.use_PC)
    `UNUSED_VAR (execute_if[0].data.use_imm)
    `UNUSED_VAR (execute_if[0].data.rs3_data)
    `UNUSED_VAR (execute_if[0].data.tid)

`ifdef SM_ENABLE
    `STATIC_ASSERT((1 << `SMEM_LOG_SIZE) == `MEM_BLOCK_SIZE * ((1 << `SMEM_LOG_SIZE) / `MEM_BLOCK_SIZE), ("invalid parameter"))
    `STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % (1 << `SMEM_LOG_SIZE)), ("invalid parameter"))
    // shared-memory window expressed in memory-block units
    localparam SMEM_START_B = MEM_ADDRW'(`XLEN'(`SMEM_BASE_ADDR) >> MEM_ASHIFT);
    localparam SMEM_END_B   = MEM_ADDRW'((`XLEN'(`SMEM_BASE_ADDR) + (1 << `SMEM_LOG_SIZE)) >> MEM_ASHIFT);
`endif

    // tag = uuid + addr_type + wid + tmask + PC + rd + op_type + align + pid + pkt_addr + is_dup
    // (same order as the mem_req_tag concatenation and the mem_rsp_tag unpacking below)
    localparam TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + `NW_WIDTH + `XLEN + NUM_LANES + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * (REQ_ASHIFT)) + `LSU_DUP_ENABLED + PID_WIDTH + LSUQ_SIZEW;

    `STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))

    wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type;

    // full address calculation
    wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        assign full_addr[i] = execute_if[0].data.rs1_data[i][`XLEN-1:0] + execute_if[0].data.imm;
    end

    // detect duplicate addresses
    // (all active lanes use the same base address as lane 0, and lane 0 is active)
    wire lsu_is_dup;
`ifdef LSU_DUP
    if (NUM_LANES > 1) begin
        wire [NUM_LANES-2:0] addr_matches;
        for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
            assign addr_matches[i] = (execute_if[0].data.rs1_data[i+1] == execute_if[0].data.rs1_data[0]) || ~execute_if[0].data.tmask[i+1];
        end
        assign lsu_is_dup = execute_if[0].data.tmask[0] && (& addr_matches);
    end else begin
        assign lsu_is_dup = 0;
    end
`else
    assign lsu_is_dup = 0;
`endif

    // detect address type
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        wire [MEM_ADDRW-1:0] full_addr_b = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
        // is non-cacheable I/O address
        wire is_addr_io = (full_addr_b >= MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT));
    `ifdef SM_ENABLE
        // is shared memory address
        wire is_addr_sm = (full_addr_b >= SMEM_START_B) && (full_addr_b < SMEM_END_B);
        assign lsu_addr_type[i] = {is_addr_io, is_addr_sm};
    `else
        assign lsu_addr_type[i] = is_addr_io;
    `endif
    end

    wire mem_req_empty;
    wire st_rsp_ready;
    wire lsu_valid, lsu_ready;

    // fence: stall the pipeline until all pending requests are sent
    wire is_fence = `INST_LSU_IS_FENCE(execute_if[0].data.op_type);
    wire fence_wait = is_fence && ~mem_req_empty;

    assign lsu_valid = execute_if[0].valid && ~fence_wait;
    assign execute_if[0].ready = lsu_ready && ~fence_wait;

    // schedule memory request

    wire                            mem_req_valid;
    wire [NUM_LANES-1:0]            mem_req_mask;
    wire                            mem_req_rw;
    wire [NUM_LANES-1:0][`MEM_ADDR_WIDTH-REQ_ASHIFT-1:0] mem_req_addr;
    reg  [NUM_LANES-1:0][DCACHE_WORD_SIZE-1:0] mem_req_byteen;
    reg  [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
    wire [TAG_WIDTH-1:0]            mem_req_tag;
    wire                            mem_req_ready;

    wire                            mem_rsp_valid;
    wire [NUM_LANES-1:0]            mem_rsp_mask;
    wire [NUM_LANES-1:0][`XLEN-1:0] mem_rsp_data;
    wire [TAG_WIDTH-1:0]            mem_rsp_tag;
    wire                            mem_rsp_sop;
    wire                            mem_rsp_eop;
    wire                            mem_rsp_ready;

    // A store is committed through st_rsp_buf in the same cycle its request
    // is issued, so it may only be offered to the scheduler when that buffer
    // can accept the commit entry. Without this gating the scheduler could
    // accept the store while execute_if[0] is still held (lsu_ready low),
    // and the same store would then be issued a second time.
    assign mem_req_valid = lsu_valid
                        && (~mem_req_rw || st_rsp_ready);

    assign lsu_ready = mem_req_ready
                    && (~mem_req_rw || st_rsp_ready); // writes commit directly

    // with duplicate addresses only lane 0 issues a request
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        assign mem_req_mask[i] = execute_if[0].data.tmask[i] && (~lsu_is_dup || (i == 0));
    end

    // instructions without a register writeback are treated as writes
    assign mem_req_rw = ~execute_if[0].data.wb;

    wire mem_req_fire = mem_req_valid && mem_req_ready;
    wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
    `UNUSED_VAR (mem_req_fire)
    `UNUSED_VAR (mem_rsp_fire)

    // address formatting

    wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;

    for (genvar i = 0; i < NUM_LANES; ++i) begin
        assign req_align[i]    = full_addr[i][REQ_ASHIFT-1:0];
        assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
    end

    // byte enable formatting
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        always @(*) begin
            mem_req_byteen[i] = '0;
            case (`INST_LSU_WSIZE(execute_if[0].data.op_type))
                0: begin // 8-bit
                    mem_req_byteen[i][req_align[i]] = 1'b1;
                end
                1: begin // 16 bit
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
                end
            `ifdef XLEN_64
                2: begin // 32 bit
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
                    mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
                end
            `endif
                default : mem_req_byteen[i] = {DCACHE_WORD_SIZE{1'b1}};
            endcase
        end
    end

    // memory misalignment not supported!
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        wire lsu_req_fire = execute_if[0].valid && execute_if[0].ready;
        `RUNTIME_ASSERT((~lsu_req_fire || ~execute_if[0].data.tmask[i] || is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if[0].data.op_type))) == 0),
            ("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
                execute_if[0].data.wid, execute_if[0].data.PC, full_addr[i], `INST_LSU_WSIZE(execute_if[0].data.op_type), execute_if[0].data.uuid));
    end

    // store data formatting
    // (shift the source register left by the byte offset within the word)
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        always @(*) begin
            mem_req_data[i] = execute_if[0].data.rs2_data[i];
            case (req_align[i])
                1: mem_req_data[i][`XLEN-1:8]  = execute_if[0].data.rs2_data[i][`XLEN-9:0];
                2: mem_req_data[i][`XLEN-1:16] = execute_if[0].data.rs2_data[i][`XLEN-17:0];
                3: mem_req_data[i][`XLEN-1:24] = execute_if[0].data.rs2_data[i][`XLEN-25:0];
            `ifdef XLEN_64
                4: mem_req_data[i][`XLEN-1:32] = execute_if[0].data.rs2_data[i][`XLEN-33:0];
                5: mem_req_data[i][`XLEN-1:40] = execute_if[0].data.rs2_data[i][`XLEN-41:0];
                6: mem_req_data[i][`XLEN-1:48] = execute_if[0].data.rs2_data[i][`XLEN-49:0];
                7: mem_req_data[i][`XLEN-1:56] = execute_if[0].data.rs2_data[i][`XLEN-57:0];
            `endif
                default:;
            endcase
        end
    end

    // track SOP/EOP for out-of-order memory responses

    wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
    wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;

    if (PID_BITS != 0) begin
        // per packet slot: outstanding read requests and pending sop/eop flags
        reg [`LSUQ_SIZE-1:0][PID_BITS:0] pkt_ctr;
        reg [`LSUQ_SIZE-1:0] pkt_sop, pkt_eop;

        wire mem_req_rd_fire     = mem_req_fire && execute_if[0].data.wb;
        wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if[0].data.sop;
        wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if[0].data.eop;
        wire mem_rsp_eop_fire    = mem_rsp_fire && mem_rsp_eop;
        wire full;

        // a slot is acquired when the last read request of a packet is
        // issued and released when its last response is delivered
        VX_allocator #(
            .SIZE (`LSUQ_SIZE)
        ) pkt_allocator (
            .clk         (clk),
            .reset       (reset),
            .acquire_en  (mem_req_rd_eop_fire),
            .acquire_addr(pkt_waddr),
            .release_en  (mem_rsp_eop_pkt),
            .release_addr(pkt_raddr),
            `UNUSED_PIN (empty),
            .full        (full)
        );

        // request and response touch the same slot in the same cycle:
        // the increment and decrement cancel, so the counter is left alone
        wire rd_during_wr = mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr);

        always @(posedge clk) begin
            if (reset) begin
                pkt_ctr <= '0;
                pkt_sop <= '0;
                pkt_eop <= '0;
            end else begin
                if (mem_req_rd_sop_fire) begin
                    pkt_sop[pkt_waddr] <= 1;
                end
                if (mem_req_rd_eop_fire) begin
                    pkt_eop[pkt_waddr] <= 1;
                end
                if (mem_rsp_fire) begin
                    pkt_sop[pkt_raddr] <= 0;
                end
                if (mem_rsp_eop_pkt) begin
                    pkt_eop[pkt_raddr] <= 0;
                end
                if (~rd_during_wr) begin
                    if (mem_req_rd_fire) begin
                        pkt_ctr[pkt_waddr] <= pkt_ctr[pkt_waddr] + PID_BITS'(1);
                    end
                    if (mem_rsp_eop_fire) begin
                        pkt_ctr[pkt_raddr] <= pkt_ctr[pkt_raddr] - PID_BITS'(1);
                    end
                end
            end
        end

        assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
        assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
        `RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
        `RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
        `UNUSED_VAR (mem_rsp_sop)
    end else begin
        // single-packet case: sop/eop come straight from the scheduler
        assign pkt_waddr = 0;
        assign mem_rsp_sop_pkt = mem_rsp_sop;
        assign mem_rsp_eop_pkt = mem_rsp_eop;
        `UNUSED_VAR (pkt_raddr)
    end

    // request tag; unpacked in the same order from mem_rsp_tag further down
    assign mem_req_tag = {
        execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
    `ifdef LSU_DUP
        , lsu_is_dup
    `endif
    };

    wire [DCACHE_NUM_REQS-1:0]                  cache_req_valid;
    wire [DCACHE_NUM_REQS-1:0]                  cache_req_rw;
    wire [DCACHE_NUM_REQS-1:0][(`XLEN/8)-1:0]   cache_req_byteen;
    wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] cache_req_addr;
    wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0]       cache_req_data;
    wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_req_tag;
    wire [DCACHE_NUM_REQS-1:0]                  cache_req_ready;
    wire [DCACHE_NUM_REQS-1:0]                  cache_rsp_valid;
    wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0]       cache_rsp_data;
    wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_rsp_tag;
    wire [DCACHE_NUM_REQS-1:0]                  cache_rsp_ready;

    `RESET_RELAY (mem_scheduler_reset, reset);

    VX_mem_scheduler #(
        .INSTANCE_ID ($sformatf("core%0d-lsu-memsched", CORE_ID)),
        .NUM_REQS    (LSU_MEM_REQS),
        .NUM_BANKS   (DCACHE_NUM_REQS),
        .ADDR_WIDTH  (DCACHE_ADDR_WIDTH),
        .DATA_WIDTH  (`XLEN),
        .QUEUE_SIZE  (`LSUQ_SIZE),
        .TAG_WIDTH   (TAG_WIDTH),
        .MEM_TAG_ID  (`UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS)),
        .UUID_WIDTH  (`UUID_WIDTH),
        .RSP_PARTIAL (1),
        .MEM_OUT_REG (2)
    ) mem_scheduler (
        .clk            (clk),
        .reset          (mem_scheduler_reset),

        // Input request
        .req_valid      (mem_req_valid),
        .req_rw         (mem_req_rw),
        .req_mask       (mem_req_mask),
        .req_byteen     (mem_req_byteen),
        .req_addr       (mem_req_addr),
        .req_data       (mem_req_data),
        .req_tag        (mem_req_tag),
        .req_empty      (mem_req_empty),
        .req_ready      (mem_req_ready),
        `UNUSED_PIN (write_notify),

        // Output response
        .rsp_valid      (mem_rsp_valid),
        .rsp_mask       (mem_rsp_mask),
        .rsp_data       (mem_rsp_data),
        .rsp_tag        (mem_rsp_tag),
        .rsp_sop        (mem_rsp_sop),
        .rsp_eop        (mem_rsp_eop),
        .rsp_ready      (mem_rsp_ready),

        // Memory request
        .mem_req_valid  (cache_req_valid),
        .mem_req_rw     (cache_req_rw),
        .mem_req_byteen (cache_req_byteen),
        .mem_req_addr   (cache_req_addr),
        .mem_req_data   (cache_req_data),
        .mem_req_tag    (cache_req_tag),
        .mem_req_ready  (cache_req_ready),

        // Memory response
        .mem_rsp_valid  (cache_rsp_valid),
        .mem_rsp_data   (cache_rsp_data),
        .mem_rsp_tag    (cache_rsp_tag),
        .mem_rsp_ready  (cache_rsp_ready)
    );

    // connect the scheduler's memory side to the dcache bus (tags are
    // handled separately below)
    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
        assign cache_bus_if[i].req_valid = cache_req_valid[i];
        assign cache_bus_if[i].req_data.rw = cache_req_rw[i];
        assign cache_bus_if[i].req_data.byteen = cache_req_byteen[i];
        assign cache_bus_if[i].req_data.addr = cache_req_addr[i];
        assign cache_bus_if[i].req_data.data = cache_req_data[i];
        assign cache_req_ready[i] = cache_bus_if[i].req_ready;

        assign cache_rsp_valid[i] = cache_bus_if[i].rsp_valid;
        assign cache_rsp_data[i] = cache_bus_if[i].rsp_data.data;
        assign cache_bus_if[i].rsp_ready = cache_rsp_ready[i];
    end

    // cache tag formatting: <uuid, tag, type>
    // The scheduler tag carries the address type of every lane; the cache
    // tag carries only the type of the lane served by this request port.
    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
        wire [`UUID_WIDTH-1:0] cache_req_uuid, cache_rsp_uuid;
        wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type, cache_rsp_type;
        wire [`CLOG2(`LSUQ_SIZE)-1:0] cache_req_tag_x, cache_rsp_tag_x;
        if (DCACHE_NUM_BATCHES > 1) begin
            wire [DCACHE_NUM_BATCHES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_b, cache_rsp_type_b;
            wire [`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_bi, cache_rsp_type_bi;
            wire [DCACHE_BATCH_SEL_BITS-1:0] cache_req_bid, cache_rsp_bid;
            assign {cache_req_uuid, cache_req_type, cache_req_bid, cache_req_tag_x} = cache_req_tag[i];
            assign cache_req_type_bi = cache_req_type_b[cache_req_bid];
            assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_bid, cache_req_tag_x, cache_req_type_bi};
            assign {cache_rsp_uuid, cache_rsp_bid, cache_rsp_tag_x, cache_rsp_type_bi} = cache_bus_if[i].rsp_data.tag;
            assign cache_rsp_type_b = {DCACHE_NUM_BATCHES{cache_rsp_type_bi}};
            assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_bid, cache_rsp_tag_x};
            for (genvar j = 0; j < DCACHE_NUM_BATCHES; ++j) begin
                // lane served by port i in batch j
                localparam k = j * DCACHE_NUM_REQS + i;
                if (k < NUM_LANES) begin
                    assign cache_req_type_b[j] = cache_req_type[k];
                    assign cache_rsp_type[k] = cache_rsp_type_b[j];
                end else begin
                    assign cache_req_type_b[j] = '0;
                    `UNUSED_VAR (cache_rsp_type_b[j])
                end
            end
        end else begin
            assign {cache_req_uuid, cache_req_type, cache_req_tag_x} = cache_req_tag[i];
            assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_tag_x, cache_req_type[i]};
            assign {cache_rsp_uuid, cache_rsp_tag_x, cache_rsp_type[i]} = cache_bus_if[i].rsp_data.tag;
            assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_tag_x};
            for (genvar j = 0; j < DCACHE_NUM_REQS; ++j) begin
                if (i != j) begin
                    `UNUSED_VAR (cache_req_type[j])
                    assign cache_rsp_type[j] = '0;
                end
            end
        end
    end

    // response tag fields (mirror of mem_req_tag)
    wire [`UUID_WIDTH-1:0] rsp_uuid;
    wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] rsp_addr_type;
    wire [`NW_WIDTH-1:0] rsp_wid;
    wire [NUM_LANES-1:0] rsp_tmask_uq;
    wire [`XLEN-1:0] rsp_pc;
    wire [`NR_BITS-1:0] rsp_rd;
    wire [`INST_LSU_BITS-1:0] rsp_op_type;
    wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] rsp_align;
    wire [PID_WIDTH-1:0] rsp_pid;
    wire rsp_is_dup;

`ifndef LSU_DUP
    assign rsp_is_dup = 0;
`endif

    assign {
        rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
    `ifdef LSU_DUP
        , rsp_is_dup
    `endif
    } = mem_rsp_tag;

    `UNUSED_VAR (rsp_addr_type)
    `UNUSED_VAR (rsp_op_type)

    // load response formatting

    reg [NUM_LANES-1:0][`XLEN-1:0] rsp_data;
    wire [NUM_LANES-1:0] rsp_tmask;

`ifdef XLEN_64
`ifdef EXT_F_ENABLE
    // apply nan-boxing to flw outputs
    wire rsp_is_float = rsp_rd[5];
`else
    wire rsp_is_float = 0;
`endif
`endif

    // select the addressed 32/16/8-bit portion of each lane's word, then
    // extend it according to the load format; duplicate-address responses
    // take their data from lane 0
    for (genvar i = 0; i < NUM_LANES; i++) begin
    `ifdef XLEN_64
        wire [63:0] rsp_data64 = (i == 0 || rsp_is_dup) ? mem_rsp_data[0] : mem_rsp_data[i];
        wire [31:0] rsp_data32 = (i == 0 || rsp_is_dup) ? (rsp_align[0][2] ? mem_rsp_data[0][63:32] : mem_rsp_data[0][31:0]) :
                                                         (rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
    `else
        wire [31:0] rsp_data32 = (i == 0 || rsp_is_dup) ? mem_rsp_data[0] : mem_rsp_data[i];
    `endif
        wire [15:0] rsp_data16 = rsp_align[i][1] ? rsp_data32[31:16] : rsp_data32[15:0];
        wire [7:0]  rsp_data8  = rsp_align[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];

        always @(*) begin
            case (`INST_LSU_FMT(rsp_op_type))
            `INST_FMT_B:  rsp_data[i] = `XLEN'(signed'(rsp_data8));
            `INST_FMT_H:  rsp_data[i] = `XLEN'(signed'(rsp_data16));
            `INST_FMT_BU: rsp_data[i] = `XLEN'(unsigned'(rsp_data8));
            `INST_FMT_HU: rsp_data[i] = `XLEN'(unsigned'(rsp_data16));
        `ifdef XLEN_64
            `INST_FMT_W:  rsp_data[i] = rsp_is_float ? (`XLEN'(rsp_data32) | 64'hffffffff00000000) : `XLEN'(signed'(rsp_data32));
            `INST_FMT_WU: rsp_data[i] = `XLEN'(unsigned'(rsp_data32));
            `INST_FMT_D:  rsp_data[i] = `XLEN'(signed'(rsp_data64));
        `else
            `INST_FMT_W:  rsp_data[i] = `XLEN'(signed'(rsp_data32));
        `endif
            default: rsp_data[i] = 'x;
            endcase
        end
    end

    // a duplicate-address request was issued on lane 0 only, so the original
    // thread mask is restored from the tag
    assign rsp_tmask = rsp_is_dup ? rsp_tmask_uq : mem_rsp_mask;

    // load commit

    VX_elastic_buffer #(
        .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
        .SIZE  (2)
    ) ld_rsp_buf (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (mem_rsp_valid),
        .ready_in  (mem_rsp_ready),
        .data_in   ({rsp_uuid, rsp_wid, rsp_tmask, rsp_pc, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
        .data_out  ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
        .valid_out (commit_ld_if.valid),
        .ready_out (commit_ld_if.ready)
    );

    assign commit_ld_if.data.wb = 1'b1;

    // store commit
    // (entered in the cycle the store request is accepted by the scheduler)

    VX_elastic_buffer #(
        .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + PID_WIDTH + 1 + 1),
        .SIZE  (2)
    ) st_rsp_buf (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (mem_req_fire && mem_req_rw),
        .ready_in  (st_rsp_ready),
        .data_in   ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop}),
        .data_out  ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
        .valid_out (commit_st_if.valid),
        .ready_out (commit_st_if.ready)
    );
    assign commit_st_if.data.rd   = '0;
    assign commit_st_if.data.wb   = 1'b0;
    assign commit_st_if.data.data = commit_ld_if.data.data; // force arbiter passthru

    // lsu commit

    `RESET_RELAY (commit_reset, reset);

    VX_commit_if #(
        .NUM_LANES (NUM_LANES)
    ) commit_arb_if[1]();

    VX_stream_arb #(
        .NUM_INPUTS (2),
        .DATAW      (RSP_ARB_DATAW),
        .OUT_REG    (1)
    ) rsp_arb (
        .clk       (clk),
        .reset     (commit_reset),
        .valid_in  ({commit_st_if.valid, commit_ld_if.valid}),
        .ready_in  ({commit_st_if.ready, commit_ld_if.ready}),
        .data_in   ({commit_st_if.data, commit_ld_if.data}),
        .data_out  (commit_arb_if[0].data),
        .valid_out (commit_arb_if[0].valid),
        .ready_out (commit_arb_if[0].ready),
        `UNUSED_PIN (sel_out)
    );

    VX_gather_unit #(
        .BLOCK_SIZE (BLOCK_SIZE),
        .NUM_LANES  (NUM_LANES),
        .OUT_REG    (3)
    ) gather_unit (
        .clk           (clk),
        .reset         (commit_reset),
        .commit_in_if  (commit_arb_if),
        .commit_out_if (commit_if)
    );

`ifdef DBG_SCOPE_LSU
    // debug probes, only instantiated for core 0
    if (CORE_ID == 0) begin
    `ifdef SCOPE
        // PROBEW must equal the total width of .probes:
        // uuid + per lane (full_addr + byteen + req_data) + rw + rsp_uuid + rsp_data.
        // The byte-enable term uses DCACHE_WORD_SIZE to match mem_req_byteen.
        VX_scope_tap #(
            .SCOPE_ID (3),
            .TRIGGERW (3),
            .PROBEW   (`UUID_WIDTH+NUM_LANES*(`XLEN+DCACHE_WORD_SIZE+`XLEN)+1+`UUID_WIDTH+NUM_LANES*`XLEN)
        ) scope_tap (
            .clk(clk),
            .reset(scope_reset),
            .start(1'b0),
            .stop(1'b0),
            .triggers({reset, mem_req_fire, mem_rsp_fire}),
            .probes({execute_if[0].data.uuid, full_addr, mem_req_rw, mem_req_byteen, mem_req_data, rsp_uuid, rsp_data}),
            .bus_in(scope_bus_in),
            .bus_out(scope_bus_out)
        );
    `endif
    `ifdef CHIPSCOPE
        // NOTE(review): the 32-bit probes below truncate when XLEN is 64, and
        // cache_bus_if is an interface array referenced without an index -
        // confirm this branch still elaborates before enabling CHIPSCOPE.
        wire [31:0] full_addr_0 = full_addr[0];
        wire [31:0] mem_req_data_0 = mem_req_data[0];
        wire [31:0] rsp_data_0 = rsp_data[0];
        ila_lsu ila_lsu_inst (
            .clk    (clk),
            .probe0 ({mem_req_data_0, execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask, full_addr_0, mem_req_byteen, mem_req_rw, mem_req_ready, mem_req_valid}),
            .probe1 ({rsp_data_0, rsp_uuid, mem_rsp_eop, rsp_pc, rsp_rd, rsp_tmask, rsp_wid, mem_rsp_ready, mem_rsp_valid}),
            .probe2 ({cache_bus_if.req_data.data, cache_bus_if.req_data.tag, cache_bus_if.req_data.byteen, cache_bus_if.req_data.addr, cache_bus_if.req_data.rw, cache_bus_if.req_ready, cache_bus_if.req_valid}),
            .probe3 ({cache_bus_if.rsp_data.data, cache_bus_if.rsp_data.tag, cache_bus_if.rsp_ready, cache_bus_if.rsp_valid})
        );
    `endif
    end
`else
    `SCOPE_IO_UNUSED()
`endif

`ifdef DBG_TRACE_CORE_DCACHE
    // simulation trace of issued requests and received responses
    always @(posedge clk) begin
        if (execute_if[0].valid && fence_wait) begin
            `TRACE(1, ("%d: *** D$%0d fence wait\n", $time, CORE_ID));
        end
        if (mem_req_fire) begin
            if (mem_req_rw) begin
                `TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
                `TRACE_ARRAY1D(1, full_addr, NUM_LANES);
                `TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
                `TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
                `TRACE(1, (", data="));
                `TRACE_ARRAY1D(1, mem_req_data, NUM_LANES);
                `TRACE(1, (", is_dup=%b (#%0d)\n", lsu_is_dup, execute_if[0].data.uuid));
            end else begin
                `TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
                `TRACE_ARRAY1D(1, full_addr, NUM_LANES);
                `TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
                `TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
                `TRACE(1, (", rd=%0d, is_dup=%b (#%0d)\n", execute_if[0].data.rd, lsu_is_dup, execute_if[0].data.uuid));
            end
        end
        if (mem_rsp_fire) begin
            `TRACE(1, ("%d: D$%0d Rsp: wid=%0d, PC=0x%0h, tmask=%b, tag=0x%0h, rd=%0d, sop=%b, eop=%b, data=",
                $time, CORE_ID, rsp_wid, rsp_pc, mem_rsp_mask, mem_rsp_tag, rsp_rd, mem_rsp_sop, mem_rsp_eop));
            `TRACE_ARRAY1D(1, mem_rsp_data, NUM_LANES);
            `TRACE(1, (", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid));
        end
    end
`endif

endmodule

View File

@@ -0,0 +1,336 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_muldiv_unit #(
parameter CORE_ID = 0,
parameter NUM_LANES = 1
) (
input wire clk,
input wire reset,
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam TAGW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + PID_WIDTH + 1 + 1;
`UNUSED_VAR (execute_if.data.rs3_data)
wire [`INST_M_BITS-1:0] muldiv_op = `INST_M_BITS'(execute_if.data.op_type);
wire is_mulx_op = `INST_M_IS_MULX(muldiv_op);
wire is_signed_op = `INST_M_SIGNED(muldiv_op);
`ifdef XLEN_64
wire is_alu_w = `INST_ALU_IS_W(execute_if.data.op_mod);
`else
wire is_alu_w = 0;
`endif
wire [NUM_LANES-1:0][`XLEN-1:0] mul_result_out;
wire [`UUID_WIDTH-1:0] mul_uuid_out;
wire [`NW_WIDTH-1:0] mul_wid_out;
wire [NUM_LANES-1:0] mul_tmask_out;
wire [`XLEN-1:0] mul_PC_out;
wire [`NR_BITS-1:0] mul_rd_out;
wire mul_wb_out;
wire [PID_WIDTH-1:0] mul_pid_out;
wire mul_sop_out, mul_eop_out;
wire mul_valid_in = execute_if.valid && is_mulx_op;
wire mul_ready_in;
wire mul_valid_out;
wire mul_ready_out;
wire is_mulh_in = `INST_M_IS_MULH(muldiv_op);
wire is_signed_mul_a = `INST_M_SIGNED_A(muldiv_op);
wire is_signed_mul_b = is_signed_op;
`ifdef IMUL_DPI
wire [NUM_LANES-1:0][`XLEN-1:0] mul_result_tmp;
wire mul_fire_in = mul_valid_in && mul_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] mul_resultl, mul_resulth;
wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i];
wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i];
always @(*) begin
dpi_imul (mul_fire_in, is_signed_mul_a, is_signed_mul_b, mul_in1, mul_in2, mul_resultl, mul_resulth);
end
assign mul_result_tmp[i] = is_mulh_in ? mul_resulth : (is_alu_w ? `XLEN'($signed(mul_resultl[31:0])) : mul_resultl);
end
VX_shift_register #(
.DATAW (1 + TAGW + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
.clk(clk),
.reset (reset),
.enable (mul_ready_in),
.data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, mul_result_tmp}),
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out})
);
assign mul_ready_in = mul_ready_out || ~mul_valid_out;
`else
wire [NUM_LANES-1:0][2*(`XLEN+1)-1:0] mul_result_tmp;
wire is_mulh_out;
wire is_mul_w_out;
`ifdef XLEN_64
wire [NUM_LANES-1:0][`XLEN:0] mul_in1;
wire [NUM_LANES-1:0][`XLEN:0] mul_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign mul_in1[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]} : {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
assign mul_in2[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]} : {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
end
wire mul_strode;
wire mul_busy;
VX_elastic_adapter mul_elastic_adapter (
.clk (clk),
.reset (reset),
.valid_in (mul_valid_in),
.ready_in (mul_ready_in),
.valid_out (mul_valid_out),
.ready_out (mul_ready_out),
.strobe (mul_strode),
.busy (mul_busy)
);
VX_serial_mul #(
.A_WIDTH (`XLEN+1),
.LANES (NUM_LANES),
.SIGNED (1)
) serial_mul (
.clk (clk),
.reset (reset),
.strobe (mul_strode),
.busy (mul_busy),
.dataa (mul_in1),
.datab (mul_in2),
.result (mul_result_tmp)
);
reg [TAGW+2-1:0] mul_tag_r;
always @(posedge clk) begin
if (mul_valid_in && mul_ready_in) begin
mul_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_mulh_in, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop};
end
end
assign {mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out, is_mul_w_out, mul_pid_out, mul_sop_out, mul_eop_out} = mul_tag_r;
`else
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] mul_in1 = {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
wire [`XLEN:0] mul_in2 = {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
VX_multiplier #(
.A_WIDTH (`XLEN+1),
.B_WIDTH (`XLEN+1),
.R_WIDTH (2*(`XLEN+1)),
.SIGNED (1),
.LATENCY (`LATENCY_IMUL)
) multiplier (
.clk (clk),
.enable (mul_ready_in),
.dataa (mul_in1),
.datab (mul_in2),
.result (mul_result_tmp[i])
);
end
VX_shift_register #(
.DATAW (1 + TAGW + 1 + 1),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
.clk(clk),
.reset (reset),
.enable (mul_ready_in),
.data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, is_mulh_in, is_alu_w}),
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, is_mulh_out, is_mul_w_out})
);
assign mul_ready_in = mul_ready_out || ~mul_valid_out;
`endif
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] :
(is_mul_w_out ? `XLEN'($signed(mul_result_tmp[i][31:0])) :
mul_result_tmp[i][`XLEN-1:0]);
`else
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] : mul_result_tmp[i][`XLEN-1:0];
`UNUSED_VAR (is_mul_w_out)
`endif
end
`endif
///////////////////////////////////////////////////////////////////////////
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_out;
wire [`UUID_WIDTH-1:0] div_uuid_out;
wire [`NW_WIDTH-1:0] div_wid_out;
wire [NUM_LANES-1:0] div_tmask_out;
wire [`XLEN-1:0] div_PC_out;
wire [`NR_BITS-1:0] div_rd_out;
wire div_wb_out;
wire [PID_WIDTH-1:0] div_pid_out;
wire div_sop_out, div_eop_out;
wire is_rem_op = `INST_M_IS_REM(muldiv_op);
wire div_valid_in = execute_if.valid && ~is_mulx_op;
wire div_ready_in;
wire div_valid_out;
wire div_ready_out;
wire [NUM_LANES-1:0][`XLEN-1:0] div_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
end
`ifdef IDIV_DPI
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_in;
wire div_fire_in = div_valid_in && div_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] div_quotient, div_remainder;
always @(*) begin
dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder);
end
assign div_result_in[i] = is_rem_op ? (is_alu_w ? `XLEN'($signed(div_remainder[31:0])) : div_remainder) :
(is_alu_w ? `XLEN'($signed(div_quotient[31:0])) : div_quotient);
end
VX_shift_register #(
.DATAW (1 + TAGW + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) div_shift_reg (
.clk(clk),
.reset (reset),
.enable (div_ready_in),
.data_in ({div_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, div_result_in}),
.data_out ({div_valid_out, div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_pid_out, div_sop_out, div_eop_out, div_result_out})
);
assign div_ready_in = div_ready_out || ~div_valid_out;
`else
wire [NUM_LANES-1:0][`XLEN-1:0] div_quotient, div_remainder;
wire is_rem_op_out;
wire is_div_w_out;
wire div_strode;
wire div_busy;
VX_elastic_adapter div_elastic_adapter (
.clk (clk),
.reset (reset),
.valid_in (div_valid_in),
.ready_in (div_ready_in),
.valid_out (div_valid_out),
.ready_out (div_ready_out),
.strobe (div_strode),
.busy (div_busy)
);
VX_serial_div #(
.WIDTHN (`XLEN),
.WIDTHD (`XLEN),
.WIDTHQ (`XLEN),
.WIDTHR (`XLEN),
.LANES (NUM_LANES)
) serial_div (
.clk (clk),
.reset (reset),
.strobe (div_strode),
.busy (div_busy),
.is_signed (is_signed_op),
.numer (div_in1),
.denom (div_in2),
.quotient (div_quotient),
.remainder (div_remainder)
);
reg [TAGW+2-1:0] div_tag_r;
always @(posedge clk) begin
if (div_valid_in && div_ready_in) begin
div_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_rem_op, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop};
end
end
assign {div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out, is_div_w_out, div_pid_out, div_sop_out, div_eop_out} = div_tag_r;
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign div_result_out[i] = is_rem_op_out ? (is_div_w_out ? `XLEN'($signed(div_remainder[i][31:0])) : div_remainder[i]) :
(is_div_w_out ? `XLEN'($signed(div_quotient[i][31:0])) : div_quotient[i]);
`else
assign div_result_out[i] = is_rem_op_out ? div_remainder[i] : div_quotient[i];
`UNUSED_VAR (is_div_w_out)
`endif
end
`endif
// can accept new request?
assign execute_if.ready = is_mulx_op ? mul_ready_in : div_ready_in;
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (TAGW + (NUM_LANES * `XLEN)),
.OUT_REG (1)
) rsp_buf (
.clk (clk),
.reset (reset),
.valid_in ({div_valid_out, mul_valid_out}),
.ready_in ({div_ready_out, mul_ready_out}),
.data_in ({{div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_pid_out, div_sop_out, div_eop_out, div_result_out},
{mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out}}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.data}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready),
`UNUSED_PIN (sel_out)
);
endmodule

302
hw/rtl/core/VX_operands.sv Normal file
View File

@@ -0,0 +1,302 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Operand collection stage.
//
// For each of the `ISSUE_WIDTH issue slots this module:
//   * holds the GPR storage (one VX_dp_ram instance per thread lane),
//   * runs a small FSM that gathers the rs1/rs2/rs3 operands of the instruction
//     presented on scoreboard_if[i], reading the GPR RAM one register at a time,
//   * optionally (CACHE_ENABLE != 0) keeps, per warp slot, a copy of the most
//     recently written-back register so that matching operands skip the RAM read,
//   * forwards the instruction metadata through a staging buffer and joins it
//     with the collected operand data in an output buffer driving operands_if[i].
//
// Parameters:
//   CORE_ID      - not used by the logic in this module (marked `UNUSED_PARAM).
//   CACHE_ENABLE - non-zero enables the write-back operand cache described above.
module VX_operands import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter CACHE_ENABLE = 0
) (
input wire clk,
input wire reset,
// write-back port: GPR write data, also feeds the operand cache
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
// incoming instructions (one stream per issue slot)
VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
// outgoing instructions with rs1/rs2/rs3 data attached
VX_operands_if.master operands_if [`ISSUE_WIDTH]
);
`UNUSED_PARAM (CORE_ID)
// Width of the metadata bundle carried by the staging buffer. The terms follow the
// field order of the stg_buf concatenation below:
// uuid, wis, tmask, PC, wb, ex_type, op_type, op_mod, use_PC, use_imm, imm, rd.
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
// Operand-fetch FSM encoding: FETCHn means "the GPR read for rsN is in flight".
localparam STATE_IDLE = 2'd0;
localparam STATE_FETCH1 = 2'd1;
localparam STATE_FETCH2 = 2'd2;
localparam STATE_FETCH3 = 2'd3;
localparam STATE_BITS = 2;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
// GPR read data, one XLEN word per thread lane
wire [`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data;
// registered GPR read address: register id + warp slot (wis)
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
// write-back operand cache: one entry per warp slot (data, register id,
// mask of threads whose data is present, eop flag of the last write-back)
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n;
reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n;
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n;
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
// collected operand values
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
reg [STATE_BITS-1:0] state, state_n;
// rs2/rs3 register ids latched while idle, used by the FETCH states
reg [`NR_BITS-1:0] rs2, rs2_n;
reg [`NR_BITS-1:0] rs3, rs3_n;
// set when rs2/rs3 were resolved without a RAM read (zero register or cache hit)
reg rs2_ready, rs2_ready_n;
reg rs3_ready, rs3_ready_n;
// all operands of the current instruction have been collected
reg data_ready, data_ready_n;
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0);
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0);
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
VX_operands_if staging_if();
// Next-state logic. Every *_n signal defaults to its registered value and is
// selectively overridden below; statement order matters.
always @(*) begin
state_n = state;
rs2_n = rs2;
rs3_n = rs3;
rs2_ready_n = rs2_ready;
rs3_ready_n = rs3_ready;
rs1_data_n = rs1_data;
rs2_data_n = rs2_data;
rs3_data_n = rs3_data;
cache_data_n = cache_data;
cache_reg_n = cache_reg;
cache_tmask_n= cache_tmask;
cache_eop_n = cache_eop;
gpr_rd_rid_n = gpr_rd_rid;
gpr_rd_wis_n = gpr_rd_wis;
data_ready_n = data_ready;
case (state)
STATE_IDLE: begin
// the previously collected operands were consumed this cycle
if (staging_if.valid && staging_if.ready) begin
data_ready_n = 0;
end
// Start collecting for the instruction on scoreboard_if. Assume all
// operands resolve immediately, then clear data_ready_n for each one
// that needs a RAM read. The checks run rs3 -> rs2 -> rs1 and each miss
// overwrites gpr_rd_rid_n/state_n, so the lowest-numbered missing
// operand is the one fetched first.
if (scoreboard_if[i].valid && data_ready_n == 0) begin
data_ready_n = 1;
// rs3: zero register, or cache hit (same register id for this warp
// slot and every requested thread present in the cached thread mask)
if (is_rs3_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs3 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs3_data_n = (is_rs3_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
rs3_ready_n = 1;
end else begin
rs3_ready_n = 0;
gpr_rd_rid_n = scoreboard_if[i].data.rs3;
data_ready_n = 0;
state_n = STATE_FETCH3;
end
// rs2: same test as rs3
if (is_rs2_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs2 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs2_data_n = (is_rs2_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
rs2_ready_n = 1;
end else begin
rs2_ready_n = 0;
gpr_rd_rid_n = scoreboard_if[i].data.rs2;
data_ready_n = 0;
state_n = STATE_FETCH2;
end
// rs1: same test; no ready flag is kept because STATE_FETCH1 is only
// entered when rs1 has to be read
if (is_rs1_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs1 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs1_data_n = (is_rs1_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
end else begin
gpr_rd_rid_n = scoreboard_if[i].data.rs1;
data_ready_n = 0;
state_n = STATE_FETCH1;
end
end
// while idle, always track the warp slot and rs2/rs3 ids of the
// instruction currently on scoreboard_if (used by the FETCH states)
gpr_rd_wis_n = scoreboard_if[i].data.wis;
rs2_n = scoreboard_if[i].data.rs2;
rs3_n = scoreboard_if[i].data.rs3;
end
STATE_FETCH1: begin
// capture rs1, then continue with the next operand still missing
rs1_data_n = gpr_rd_data;
if (~rs2_ready) begin
gpr_rd_rid_n = rs2;
state_n = STATE_FETCH2;
end else if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH2: begin
// capture rs2, then fetch rs3 if it is still missing
rs2_data_n = gpr_rd_data;
if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH3: begin
// capture rs3; it is always the last operand
rs3_data_n = gpr_rd_data;
data_ready_n = 1;
state_n = STATE_IDLE;
end
endcase
// Operand cache update on write-back. The entry of the written warp slot is
// updated when the write-back targets the cached register, or when the
// previously cached write-back had eop set and this one has sop set.
if (CACHE_ENABLE != 0 && writeback_if[i].valid) begin
if ((cache_reg[writeback_if[i].data.wis] == writeback_if[i].data.rd)
|| (cache_eop[writeback_if[i].data.wis] && writeback_if[i].data.sop)) begin
// only lanes enabled in the write-back thread mask are overwritten
for (integer j = 0; j < `NUM_THREADS; ++j) begin
if (writeback_if[i].data.tmask[j]) begin
cache_data_n[writeback_if[i].data.wis][j] = writeback_if[i].data.data[j];
end
end
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
// sop restarts the valid-thread mask; otherwise lanes accumulate
if (writeback_if[i].data.sop) begin
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask;
end else begin
cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask;
end
end
end
end
// State registers. Only the FSM state, the read address, cache_eop, cache_reg
// and data_ready are reset; the remaining registers are written before use.
always @(posedge clk) begin
if (reset) begin
state <= STATE_IDLE;
gpr_rd_rid <= '0;
gpr_rd_wis <= '0;
cache_eop <= {ISSUE_RATIO{1'b1}};
cache_reg <= '0;
data_ready <= 0;
end else begin
state <= state_n;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
cache_eop <= cache_eop_n;
data_ready <= data_ready_n;
end
end
// GPR banks
// With GPR_RESET defined the RAM is instantiated with INIT_ENABLE/INIT_VALUE and
// writes stay disabled (wr_enabled initialised to 0) until reset has been seen.
`ifdef GPR_RESET
reg wr_enabled = 0;
always @(posedge clk) begin
if (reset) begin
wr_enabled <= 1;
end
end
`else
wire wr_enabled = 1;
`endif
// one RAM per thread lane, addressed by wis_to_addr(register id, warp slot);
// a lane is written only when its bit is set in the write-back thread mask
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
VX_dp_ram #(
.DATAW (`XLEN),
.SIZE (`NUM_REGS * ISSUE_RATIO),
`ifdef GPR_RESET
.INIT_ENABLE (1),
.INIT_VALUE (0),
`endif
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
.read (1'b1),
`UNUSED_PIN (wren),
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.waddr (wis_to_addr(writeback_if[i].data.rd, writeback_if[i].data.wis)),
.wdata (writeback_if[i].data.data[j]),
.raddr (wis_to_addr(gpr_rd_rid, gpr_rd_wis)),
.rdata (gpr_rd_data[j])
);
end
// staging buffer
// holds the instruction metadata (DATAW bits) while the FSM collects operands
`RESET_RELAY (stg_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW)
) stg_buf (
.clk (clk),
.reset (stg_buf_reset),
.valid_in (scoreboard_if[i].valid),
.ready_in (scoreboard_if[i].ready),
.data_in ({
scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd}),
.data_out ({
staging_if.data.uuid,
staging_if.data.wis,
staging_if.data.tmask,
staging_if.data.PC,
staging_if.data.wb,
staging_if.data.ex_type,
staging_if.data.op_type,
staging_if.data.op_mod,
staging_if.data.use_PC,
staging_if.data.use_imm,
staging_if.data.imm,
staging_if.data.rd}),
.valid_out (staging_if.valid),
.ready_out (staging_if.ready)
);
// operand data collected by the FSM is attached to the staged metadata
assign staging_if.data.rs1_data = rs1_data;
assign staging_if.data.rs2_data = rs2_data;
assign staging_if.data.rs3_data = rs3_data;
// output buffer
// the staged instruction is only handed over (and only dequeued from the staging
// buffer) once data_ready indicates that all operands have been collected
wire valid_stg, ready_stg;
assign valid_stg = staging_if.valid && data_ready;
assign staging_if.ready = ready_stg && data_ready;
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)),
.SIZE (2),
.OUT_REG (2)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_stg),
.ready_in (ready_stg),
.data_in (staging_if.data),
.data_out (operands_if[i].data),
.valid_out (operands_if[i].valid),
.ready_out (operands_if[i].ready)
);
end
endmodule

View File

@@ -0,0 +1,79 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Per-warp counter of in-flight (issued but not yet committed) instructions.
//
// Each cycle at most one warp is incremented (incr / incr_wid) and up to
// DECR_COUNT decrements are applied (decr[k] / decr_wid[k]). The decoded
// increment and the per-warp decrement count are registered first and applied
// to the counter on the following cycle.
//
// Parameters:
//   CTR_WIDTH  - width of each per-warp counter.
//   ALM_EMPTY  - counter value reported as "almost empty".
//   DECR_COUNT - number of decrement ports.
// Outputs:
//   empty      - every warp's registered empty flag is set.
//   alm_empty  - registered almost-empty flag of the warp selected by alm_empty_wid.
module VX_pending_instr #(
    parameter CTR_WIDTH  = 12,
    parameter ALM_EMPTY  = 1,
    parameter DECR_COUNT = 1
) (
    input wire                                 clk,
    input wire                                 reset,
    input wire                                 incr,
    input wire [`NW_WIDTH-1:0]                 incr_wid,
    input wire [DECR_COUNT-1:0]                decr,
    input wire [DECR_COUNT-1:0][`NW_WIDTH-1:0] decr_wid,
    input wire [`NW_WIDTH-1:0]                 alm_empty_wid,
    output wire                                empty,
    output wire                                alm_empty
);
    // enough bits to count 0..DECR_COUNT simultaneous decrements
    localparam COUNTW = `CLOG2(DECR_COUNT+1);

    reg [`NUM_WARPS-1:0][CTR_WIDTH-1:0] pending_instrs;
    reg [`NUM_WARPS-1:0][COUNTW-1:0]    decr_cnt;
    reg [`NUM_WARPS-1:0]                incr_cnt;
    reg [`NUM_WARPS-1:0]                alm_empty_r;
    reg [`NUM_WARPS-1:0]                empty_r;

    for (genvar w = 0; w < `NUM_WARPS; ++w) begin
        // this warp is the target of the increment request
        wire incr_hit = incr && (incr_wid == `NW_WIDTH'(w));

        // decrement ports that target this warp
        wire [DECR_COUNT-1:0] decr_hits;
        for (genvar p = 0; p < DECR_COUNT; ++p) begin
            assign decr_hits[p] = decr[p] && (decr_wid[p] == `NW_WIDTH'(w));
        end

        // number of decrements aimed at this warp in the current cycle
        wire [COUNTW-1:0] decr_cnt_n;
        `POP_COUNT(decr_cnt_n, decr_hits);

        // counter update uses the requests registered on the previous cycle
        wire [CTR_WIDTH-1:0] count_n = pending_instrs[w] + CTR_WIDTH'(incr_cnt[w]) - CTR_WIDTH'(decr_cnt[w]);

        always @(posedge clk) begin
            if (reset) begin
                pending_instrs[w] <= '0;
                incr_cnt[w]       <= '0;
                decr_cnt[w]       <= '0;
                empty_r[w]        <= 1;
                alm_empty_r[w]    <= 0;
            end else begin
                pending_instrs[w] <= count_n;
                incr_cnt[w]       <= incr_hit;
                decr_cnt[w]       <= decr_cnt_n;
                empty_r[w]        <= (count_n == 0);
                alm_empty_r[w]    <= (count_n == ALM_EMPTY);
            end
        end
    end

    assign empty     = (& empty_r);
    assign alm_empty = alm_empty_r[alm_empty_wid];
endmodule

379
hw/rtl/core/VX_schedule.sv Normal file
View File

@@ -0,0 +1,379 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Warp scheduler.
//
// Tracks, per warp, whether it is active, stalled, or blocked on a barrier,
// together with its thread mask and next PC. Each cycle one ready warp is selected
// and its {tmask, PC, wid} is pushed to schedule_if through an elastic buffer.
// Warp-control events (wspawn, tmc, split, join, barrier), branch results,
// decode/CSR unlocks and the scheduler's own issue activity update the state.
//
// Parameters:
//   CORE_ID - forwarded to VX_split_join, used in the global-barrier request,
//             the debug UUID generation and the timeout assertion message.
module VX_schedule import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// configuration
input base_dcrs_t base_dcrs,
// inputs
VX_warp_ctl_if.slave warp_ctl_if,
VX_branch_ctl_if.slave branch_ctl_if [`NUM_ALU_BLOCKS],
VX_decode_sched_if.slave decode_sched_if,
VX_commit_sched_if.slave commit_sched_if,
// outputs
VX_schedule_if.master schedule_if,
`ifdef GBAR_ENABLE
VX_gbar_bus_if.master gbar_bus_if,
`endif
VX_sched_csr_if.master sched_csr_if,
// status
output wire busy
);
`UNUSED_PARAM (CORE_ID)
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // updated when a warp is activated or disabled
reg [`NUM_WARPS-1:0] stalled_warps, stalled_warps_n; // set when branch/gpgpu instructions are issued
reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks, thread_masks_n;
reg [`NUM_WARPS-1:0][`XLEN-1:0] warp_pcs, warp_pcs_n;
// warp selected this cycle (input side of the output buffer)
wire [`NW_WIDTH-1:0] schedule_wid;
wire [`NUM_THREADS-1:0] schedule_tmask;
wire [`XLEN-1:0] schedule_pc;
wire schedule_valid;
wire schedule_ready;
// split/join
wire join_valid;
wire join_is_dvg;
wire join_is_else;
wire [`NW_WIDTH-1:0] join_wid;
wire [`NUM_THREADS-1:0] join_tmask;
wire [`XLEN-1:0] join_pc;
// cycle counter (counts while busy) and per-warp count of issued instructions
reg [`PERF_CTR_BITS-1:0] cycles;
reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs;
// schedule_fire: a warp enters the output buffer; schedule_if_fire: it leaves it
wire schedule_fire = schedule_valid && schedule_ready;
wire schedule_if_fire = schedule_if.valid && schedule_if.ready;
// branch
// flatten the branch_ctl_if interface array into vectors usable in procedural loops
wire [`NUM_ALU_BLOCKS-1:0] branch_valid;
wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid;
wire [`NUM_ALU_BLOCKS-1:0] branch_taken;
wire [`NUM_ALU_BLOCKS-1:0][`XLEN-1:0] branch_dest;
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
assign branch_valid[i] = branch_ctl_if[i].valid;
assign branch_wid[i] = branch_ctl_if[i].wid;
assign branch_taken[i] = branch_ctl_if[i].taken;
assign branch_dest[i] = branch_ctl_if[i].dest;
end
// barriers
// barrier_masks[b] holds the warps currently waiting on barrier b;
// barrier_stalls marks warps that are blocked on any barrier
reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks, barrier_masks_n;
reg [`NUM_WARPS-1:0] barrier_stalls, barrier_stalls_n;
wire [`CLOG2(`NUM_WARPS+1)-1:0] active_barrier_count;
wire [`NUM_WARPS-1:0] curr_barrier_mask;
`ifdef GBAR_ENABLE
reg [`NUM_WARPS-1:0] curr_barrier_mask_n;
reg gbar_req_valid;
reg [`NB_WIDTH-1:0] gbar_req_id;
reg [`NC_WIDTH-1:0] gbar_req_size_m1;
`endif
// number of warps already waiting on the barrier addressed by warp_ctl_if
assign curr_barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
`POP_COUNT(active_barrier_count, curr_barrier_mask);
`UNUSED_VAR (active_barrier_count)
// Next-state logic. All *_n signals start from the registered state and are
// overridden in sequence, so a later statement takes priority over an earlier
// one when both touch the same warp in the same cycle.
always @(*) begin
active_warps_n = active_warps;
stalled_warps_n = stalled_warps;
thread_masks_n = thread_masks;
barrier_masks_n = barrier_masks;
barrier_stalls_n= barrier_stalls;
warp_pcs_n = warp_pcs;
// wspawn handling
// every warp in wmask is activated with thread 0 enabled at the spawn PC
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
active_warps_n |= warp_ctl_if.wspawn.wmask;
for (integer i = 0; i < `NUM_WARPS; ++i) begin
if (warp_ctl_if.wspawn.wmask[i]) begin
thread_masks_n[i][0] = 1;
warp_pcs_n[i] = warp_ctl_if.wspawn.pc;
end
end
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
// TMC handling
// a new thread mask of zero deactivates the warp
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
active_warps_n[warp_ctl_if.wid] = (warp_ctl_if.tmc.tmask != 0);
thread_masks_n[warp_ctl_if.wid] = warp_ctl_if.tmc.tmask;
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
// split handling
// on divergence the warp continues with the "then" thread mask
if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
if (warp_ctl_if.split.is_dvg) begin
thread_masks_n[warp_ctl_if.wid] = warp_ctl_if.split.then_tmask;
end
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
// join handling
// restore the thread mask from the split/join unit; the PC is only
// redirected when the popped entry is the "else" path
if (join_valid) begin
if (join_is_dvg) begin
if (join_is_else) begin
warp_pcs_n[join_wid] = join_pc;
end
thread_masks_n[join_wid] = join_tmask;
end
stalled_warps_n[join_wid] = 0; // unlock warp
end
// barrier handling
`ifdef GBAR_ENABLE
// barrier mask as it will look once the requesting warp has arrived
curr_barrier_mask_n = curr_barrier_mask;
curr_barrier_mask_n[warp_ctl_if.wid] = 1;
`endif
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
// Local barrier: released when the number of warps already waiting equals
// size_m1, i.e. the arriving warp completes the group (presumably size_m1 is
// the barrier size minus one - confirm against the barrier instruction spec).
if (~warp_ctl_if.barrier.is_global
&& (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin
barrier_masks_n[warp_ctl_if.barrier.id] = '0;
barrier_stalls_n &= ~barrier_masks[warp_ctl_if.barrier.id];
end else begin
// otherwise the arriving warp joins the waiters and is blocked
barrier_masks_n[warp_ctl_if.barrier.id][warp_ctl_if.wid] = 1;
barrier_stalls_n[warp_ctl_if.wid] = 1;
end
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
`ifdef GBAR_ENABLE
// global barrier response matching the outstanding request id
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
barrier_masks_n[gbar_bus_if.rsp_id] = '0;
barrier_stalls_n = '0; // unlock all warps
end
`endif
// Branch handling
for (integer i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
if (branch_valid[i]) begin
if (branch_taken[i]) begin
warp_pcs_n[branch_wid[i]] = branch_dest[i];
end
stalled_warps_n[branch_wid[i]] = 0; // unlock warp
end
end
// decode unlock
// decode reports that the instruction does not require the warp to stay stalled
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
stalled_warps_n[decode_sched_if.wid] = 0;
end
// CSR unlock
if (sched_csr_if.unlock_warp) begin
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
end
// stall the warp until decode stage
// placed after all unlocks, so it wins if the same warp is unlocked this cycle
if (schedule_fire) begin
stalled_warps_n[schedule_wid] = 1;
end
// advance PC
// placed last, so it overrides any other PC update for the same warp this cycle
if (schedule_if_fire) begin
warp_pcs_n[schedule_if.data.wid] = schedule_if.data.PC + 4;
end
end
// only base_dcrs.startup_addr is read in this module
`UNUSED_VAR (base_dcrs)
// State registers, counters and global-barrier request generation.
always @(posedge clk) begin
if (reset) begin
barrier_masks <= '0;
`ifdef GBAR_ENABLE
gbar_req_valid <= 0;
`endif
stalled_warps <= '0;
warp_pcs <= '0;
active_warps <= '0;
thread_masks <= '0;
barrier_stalls <= '0;
issued_instrs <= '0;
cycles <= '0;
// activate first warp
// (these assignments override the blanket clears above for warp 0)
warp_pcs[0] <= base_dcrs.startup_addr;
active_warps[0] <= 1;
thread_masks[0][0] <= 1;
end else begin
active_warps <= active_warps_n;
stalled_warps <= stalled_warps_n;
thread_masks <= thread_masks_n;
warp_pcs <= warp_pcs_n;
barrier_masks <= barrier_masks_n;
barrier_stalls <= barrier_stalls_n;
// global barrier scheduling
`ifdef GBAR_ENABLE
// issue a request once every active warp has reached the global barrier
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid
&& warp_ctl_if.barrier.is_global
&& (curr_barrier_mask_n == active_warps)) begin
gbar_req_valid <= 1;
gbar_req_id <= warp_ctl_if.barrier.id;
gbar_req_size_m1 <= warp_ctl_if.barrier.size_m1[`NC_WIDTH-1:0];
end
// request accepted by the bus
if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin
gbar_req_valid <= 0;
end
`endif
if (schedule_if_fire) begin
issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1);
end
if (busy) begin
cycles <= cycles + 1;
end
end
end
// barrier handling
`ifdef GBAR_ENABLE
assign gbar_bus_if.req_valid = gbar_req_valid;
assign gbar_bus_if.req_id = gbar_req_id;
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
`endif
// split/join handling
`RESET_RELAY (split_join_reset, reset);
VX_split_join #(
.CORE_ID (CORE_ID)
) split_join (
.clk (clk),
.reset (split_join_reset),
.valid (warp_ctl_if.valid),
.wid (warp_ctl_if.wid),
.split (warp_ctl_if.split),
.sjoin (warp_ctl_if.sjoin),
.join_valid (join_valid),
.join_is_dvg (join_is_dvg),
.join_is_else (join_is_else),
.join_wid (join_wid),
.join_tmask (join_tmask),
.join_pc (join_pc)
);
// schedule the next ready warp
// a warp is ready when it is active and neither stalled nor blocked on a barrier
// NOTE(review): presumably VX_lzc with REVERSE=1 picks the lowest-indexed set bit - confirm against VX_lzc
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
VX_lzc #(
.N (`NUM_WARPS),
.REVERSE (1)
) wid_select (
.data_in (ready_warps),
.data_out (schedule_wid),
.valid_out (schedule_valid)
);
// per-warp {thread mask, PC} word, indexed below with the selected warp id
wire [`NUM_WARPS-1:0][(`NUM_THREADS + `XLEN)-1:0] schedule_data;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
end
// NOTE(review): the two slices (top 4 bits, remaining bits) concatenate back to the
// complete selected word, so the hard-coded 4/5 offsets do not restrict NUM_THREADS
// to 4; the reason for splitting the select this way is not visible here.
assign {schedule_tmask, schedule_pc} = {
schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-1:(`NUM_THREADS + `XLEN)-4],
schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-5:0]
};
// Instruction UUID: generated through DPI and registered on schedule_fire in debug
// builds, tied to zero when NDEBUG is defined.
`ifndef NDEBUG
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
reg [`UUID_WIDTH-1:0] instr_uuid;
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
always @(posedge clk) begin
if (reset) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
end else if (schedule_fire) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
end
end
`else
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
`endif
// output buffer for {tmask, PC, wid}; the uuid is driven directly from instr_uuid
VX_elastic_buffer #(
.DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH)
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (schedule_valid),
.ready_in (schedule_ready),
.data_in ({schedule_tmask, schedule_pc, schedule_wid}),
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid}),
.valid_out (schedule_if.valid),
.ready_out (schedule_if.ready)
);
assign schedule_if.data.uuid = instr_uuid;
// in-flight instruction tracking: incremented when an instruction leaves the
// scheduler, decremented on commit (one decrement port per issue slot)
`RESET_RELAY (pending_instr_reset, reset);
wire no_pending_instr;
VX_pending_instr #(
.CTR_WIDTH (12),
.DECR_COUNT (`ISSUE_WIDTH),
.ALM_EMPTY (1)
) pending_instr(
.clk (clk),
.reset (pending_instr_reset),
.incr (schedule_if_fire),
.incr_wid (schedule_if.data.wid),
.decr (commit_sched_if.committed),
.decr_wid (commit_sched_if.committed_wid),
.alm_empty_wid (sched_csr_if.alm_empty_wid),
.alm_empty (sched_csr_if.alm_empty),
.empty (no_pending_instr)
);
// busy while any warp is active or any instruction is still in flight
// NOTE(review): meaning of the last `BUFFER_BUSY argument is defined by the macro - confirm
`BUFFER_BUSY (busy, (active_warps != 0 || ~no_pending_instr), 1);
// export CSRs
assign sched_csr_if.cycles = cycles;
assign sched_csr_if.active_warps = active_warps;
assign sched_csr_if.thread_masks = thread_masks;
// timeout handling
// Armed after the first decode response that does not keep the warp stalled. The
// counter advances while every active warp is stalled, and clears when no warp is
// active or at least one active warp is not stalled.
reg [31:0] timeout_ctr;
reg timeout_enable;
always @(posedge clk) begin
if (reset) begin
timeout_ctr <= '0;
timeout_enable <= 0;
end else begin
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
timeout_enable <= 1;
end
if (timeout_enable && active_warps !=0 && active_warps == stalled_warps) begin
timeout_ctr <= timeout_ctr + 1;
end else if (active_warps == 0 || active_warps != stalled_warps) begin
timeout_ctr <= '0;
end
end
end
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
endmodule

Some files were not shown because too many files have changed in this diff Show More