constant integration updates, cache bank incoming_fill optimization

This commit is contained in:
Blaise Tine
2020-11-15 23:01:24 -08:00
parent 2904f6441d
commit 1bc4b8e7a8
3 changed files with 135 additions and 51 deletions

View File

@@ -1,7 +1,7 @@
language: cpp language: cpp
dist: bionic dist: bionic
os: linux os: linux
compiler: clang compiler: gcc
addons: addons:
apt: apt:
sources: sources:
@@ -20,12 +20,19 @@ install:
- export PATH=$VERILATOR_ROOT/bin:$PATH - export PATH=$VERILATOR_ROOT/bin:$PATH
script: script:
- make -j > /dev/null 2>&1 - make -j
- ci/test_runtime.sh - ci/test_runtime.sh
- ci/test_driver.sh - ci/test_driver.sh
- ci/test_riscv_isa.sh - ci/test_riscv_isa.sh
- ci/test_opencl.sh - ci/test_opencl.sh
- ci/blackbox.sh - ci/blackbox.sh -run_1c
- ci/blackbox.sh -run_2c
- ci/blackbox.sh -run_4c
- ci/blackbox.sh -run_4c_l2
- ci/blackbox.sh -run_8c_2l2
- ci/blackbox.sh -run_16c_4l2_l3
- ci/blackbox.sh -run_debug
- ci/blackbox.sh -run_scope
after_success: after_success:
# Gather code coverage # Gather code coverage

View File

@@ -1,41 +1,112 @@
#!/bin/sh #!/bin/sh
# test single core run_1c()
make -C driver/opae/vlsim clean {
CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae/vlsim > /dev/null 2>&1 # test single core
make -C driver/tests/dogfood run-vlsim make -C driver/opae/vlsim clean
make -C benchmarks/opencl/sgemm run-vlsim CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae/vlsim
make -C driver/tests/dogfood run-vlsim
make -C benchmarks/opencl/sgemm run-vlsim
}
# test 2 cores run_2c()
make -C driver/opae/vlsim clean {
CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0" make -C driver/opae/vlsim > /dev/null 2>&1 # test 2 cores
make -C driver/tests/dogfood run-vlsim make -C driver/opae/vlsim clean
make -C benchmarks/opencl/sgemm run-vlsim CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0" make -C driver/opae/vlsim
make -C driver/tests/dogfood run-vlsim
make -C benchmarks/opencl/sgemm run-vlsim
}
# test 4 cores with L2 run_4c()
make -C driver/opae/vlsim clean {
CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1" make -C driver/opae/vlsim > /dev/null 2>&1 # test 4 cores
make -C driver/tests/dogfood run-vlsim make -C driver/opae/vlsim clean
make -C benchmarks/opencl/sgemm run-vlsim CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=0" make -C driver/opae/vlsim
make -C driver/tests/dogfood run-vlsim
make -C benchmarks/opencl/sgemm run-vlsim
}
# test 8 cores with L2 run_4c_l2()
make -C driver/opae/vlsim clean {
CONFIGS="-DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1" make -C driver/opae/vlsim > /dev/null 2>&1 # test 4 cores with L2
make -C driver/tests/dogfood run-vlsim make -C driver/opae/vlsim clean
make -C benchmarks/opencl/sgemm run-vlsim CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1" make -C driver/opae/vlsim
make -C driver/tests/dogfood run-vlsim
make -C benchmarks/opencl/sgemm run-vlsim
}
# test 16 cores with L2 and L3 run_8c_2l2()
make -C driver/opae/vlsim clean {
CONFIGS="-DNUM_CLUSTERS=4 -DNUM_CORES=4 -DL2_ENABLE=1 -DL3_ENABLE=1" make -C driver/opae/vlsim > /dev/null 2>&1 # test 8 cores with 2xL2
make -C driver/tests/dogfood run-vlsim make -C driver/opae/vlsim clean
make -C benchmarks/opencl/sgemm run-vlsim CONFIGS="-DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1" make -C driver/opae/vlsim
make -C driver/tests/dogfood run-vlsim
make -C benchmarks/opencl/sgemm run-vlsim
}
# test debug build run_16c_4l2_l3()
make -C driver/opae/vlsim clean {
DEBUG=1 CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae/vlsim > /dev/null 2>&1 # test 16 cores with L2 and L3
make -C driver/tests/demo run-vlsim make -C driver/opae/vlsim clean
CONFIGS="-DNUM_CLUSTERS=4 -DNUM_CORES=4 -DL2_ENABLE=1 -DL3_ENABLE=1" make -C driver/opae/vlsim
make -C driver/tests/dogfood run-vlsim
make -C benchmarks/opencl/sgemm run-vlsim
}
# test build with scope analyzer run_debug()
make -C driver/opae clean {
SCOPE=1 CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae > /dev/null 2>&1 # test debug build
make -C driver/tests/demo run-vlsim make -C driver/opae/vlsim clean
DEBUG=1 CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae/vlsim
make -C driver/tests/demo run-vlsim
}
run_scope()
{
# test build with scope analyzer
make -C driver/opae clean
SCOPE=1 CONFIGS="-DNUM_CLUSTERS=1 -DNUM_CORES=1" make -C driver/opae
make -C driver/tests/demo run-vlsim
}
usage()
{
echo "usage: blackbox [[-run_1c] [-run_2c] [-run_4c] [-run_4c_l2] [-run_8c_2l2] [-run_16c_4l2_l3] [-run_debug] [-run_scope] [-all] [-h|--help]]"
}
while [ "$1" != "" ]; do
case $1 in
-run_1c ) run_1c
;;
-run_2c ) run_2c
;;
-run_4c ) run_4c
;;
-run_4c_l2 ) run_4c_l2
;;
-run_8c_2l2 ) run_8c_2l2
;;
-run_16c_4l2_l3 ) run_16c_4l2_l3
;;
-run_debug ) run_debug
;;
-run_scope ) run_scope
;;
-all ) run_1c
run_2c
run_4c
run_4c_l2
run_8c_2l2
run_16c_4l2_l3
run_debug
run_scope
;;
-h | --help ) usage
exit
;;
* ) usage
exit 1
esac
shift
done

View File

@@ -168,11 +168,10 @@ module VX_bank #(
wire [`LINE_ADDR_WIDTH-1:0] dfpq_addr_st0; wire [`LINE_ADDR_WIDTH-1:0] dfpq_addr_st0;
wire [`BANK_LINE_WIDTH-1:0] dfpq_filldata_st0; wire [`BANK_LINE_WIDTH-1:0] dfpq_filldata_st0;
wire dram_rsp_fire = dram_rsp_valid && dram_rsp_ready;
assign dram_rsp_ready = !dfpq_full; assign dram_rsp_ready = !dfpq_full;
if (DRAM_ENABLE) begin if (DRAM_ENABLE) begin
wire dram_rsp_fire = dram_rsp_valid && dram_rsp_ready;
VX_generic_queue #( VX_generic_queue #(
.DATAW(`LINE_ADDR_WIDTH + $bits(dram_rsp_data)), .DATAW(`LINE_ADDR_WIDTH + $bits(dram_rsp_data)),
.SIZE(DRFQ_SIZE) .SIZE(DRFQ_SIZE)
@@ -535,9 +534,10 @@ module VX_bank #(
wire snp_invalidate_st3; wire snp_invalidate_st3;
wire is_msrq_st3; wire is_msrq_st3;
wire send_core_rsp_st3; wire send_core_rsp_st3;
wire send_fill_req_st3; wire send_dwb_req_st3;
wire do_writeback_st3; wire do_writeback_st3;
wire send_snp_rsp_st3; wire send_snp_rsp_st3;
wire incoming_fill_st3;
wire send_core_rsp_st2 = valid_st2 && !is_fill_st2 && !is_snp_st2 && !miss_st2 && !force_miss_st2; wire send_core_rsp_st2 = valid_st2 && !is_fill_st2 && !is_snp_st2 && !miss_st2 && !force_miss_st2;
@@ -549,17 +549,27 @@ module VX_bank #(
&& (is_fill_st2 && (is_fill_st2
|| (!force_miss_st2 && is_snp_st2)); || (!force_miss_st2 && is_snp_st2));
wire send_dwb_req_st2 = send_fill_req_st2 || do_writeback_st2;
wire send_snp_rsp_st2 = valid_st2 && is_snp_st2 && !force_miss_st2; wire send_snp_rsp_st2 = valid_st2 && is_snp_st2 && !force_miss_st2;
// check if a matching fill request is comming
wire incoming_fill_dfp_st2 = dram_rsp_fire && (addr_st2 == dram_rsp_addr);
wire incoming_fill_st0_st2 = !dfpq_empty && (addr_st2 == dfpq_addr_st0);
wire incoming_fill_st1_st2 = is_fill_st1 && (addr_st2 == addr_st1);
wire incoming_fill_st2 = incoming_fill_dfp_st2
|| incoming_fill_st0_st2
|| incoming_fill_st1_st2;
VX_generic_register #( VX_generic_register #(
.N(1+ 1+ 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `UP(`WORD_SELECT_WIDTH) + `WORD_WIDTH + `WORD_WIDTH + `BANK_LINE_WIDTH + `TAG_SELECT_BITS + 1 + 1 + BANK_LINE_SIZE + `REQ_INST_META_WIDTH) .N(1+ 1+ 1 + 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `UP(`WORD_SELECT_WIDTH) + `WORD_WIDTH + `WORD_WIDTH + `BANK_LINE_WIDTH + `TAG_SELECT_BITS + 1 + 1 + BANK_LINE_SIZE + `REQ_INST_META_WIDTH)
) pipe_reg2 ( ) pipe_reg2 (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.stall (pipeline_stall), .stall (pipeline_stall),
.flush (1'b0), .flush (1'b0),
.in ({is_msrq_st2, send_core_rsp_st2, send_fill_req_st2, do_writeback_st2, send_snp_rsp_st2, force_miss_st2, is_snp_st2, snp_invalidate_st2, valid_st2, addr_st2, wsel_st2, writeword_st2, readword_st2, readdata_st2, readtag_st2, miss_st2, dirtyb_st2, inst_meta_st2}), .in ({is_msrq_st2, incoming_fill_st2, send_core_rsp_st2, send_dwb_req_st2, do_writeback_st2, send_snp_rsp_st2, force_miss_st2, is_snp_st2, snp_invalidate_st2, valid_st2, addr_st2, wsel_st2, writeword_st2, readword_st2, readdata_st2, readtag_st2, miss_st2, dirtyb_st2, inst_meta_st2}),
.out ({is_msrq_st3, send_core_rsp_st3, send_fill_req_st3, do_writeback_st3, send_snp_rsp_st3, force_miss_st3, is_snp_st3, snp_invalidate_st3, valid_st3, addr_st3, wsel_st3, writeword_st3, readword_st3, readdata_st3, readtag_st3, miss_st3, dirtyb_st3, inst_meta_st3}) .out ({is_msrq_st3, incoming_fill_st3, send_core_rsp_st3, send_dwb_req_st3, do_writeback_st3, send_snp_rsp_st3, force_miss_st3, is_snp_st3, snp_invalidate_st3, valid_st3, addr_st3, wsel_st3, writeword_st3, readword_st3, readdata_st3, readtag_st3, miss_st3, dirtyb_st3, inst_meta_st3})
); );
`ifdef DBG_CACHE_REQ_INFO `ifdef DBG_CACHE_REQ_INFO
@@ -590,13 +600,9 @@ module VX_bank #(
assign {req_tag_st3, req_rw_st3, req_byteen_st3, req_tid_st3} = inst_meta_st3; assign {req_tag_st3, req_rw_st3, req_byteen_st3, req_tid_st3} = inst_meta_st3;
// check if a matching fill request is comming wire incoming_fill_dfp_st3 = dram_rsp_fire && (addr_st3 == dram_rsp_addr);
wire incoming_st0_fill_st3 = is_fill_st0 && (addr_st3 == dfpq_addr_st0); wire incoming_fill = incoming_fill_dfp_st3
wire incoming_st1_fill_st3 = is_fill_st1 && (addr_st3 == addr_st1); || incoming_fill_st3;
wire incoming_st2_fill_st3 = is_fill_st2 && (addr_st3 == addr_st2);
wire incoming_fill = incoming_st2_fill_st3
|| incoming_st1_fill_st3
|| incoming_st0_fill_st3;
if (DRAM_ENABLE) begin if (DRAM_ENABLE) begin
wire msrq_dequeue_st3 = valid_st3 && is_msrq_st3 && !msrq_push_unqual && !pipeline_stall; wire msrq_dequeue_st3 = valid_st3 && is_msrq_st3 && !msrq_push_unqual && !pipeline_stall;
@@ -732,12 +738,12 @@ module VX_bank #(
wire dwbq_empty, dwbq_full; wire dwbq_empty, dwbq_full;
wire dwbq_push_unqual = send_fill_req_st3 || do_writeback_st3; wire dwbq_push_unqual = send_dwb_req_st3;
assign dwbq_push_stall = dwbq_push_unqual && dwbq_full; assign dwbq_push_stall = dwbq_push_unqual && dwbq_full;
wire dwbq_push = dwbq_push_unqual wire dwbq_push = dwbq_push_unqual
&& !(send_fill_req_st3 && incoming_fill) // not in 'dwbq_push_stall' to reduce clock delay && (do_writeback_st3 || !incoming_fill) // not in 'dwbq_push_stall' to reduce clock delay
&& !dwbq_full && !dwbq_full
&& !msrq_push_stall && !msrq_push_stall
&& !cwbq_push_stall && !cwbq_push_stall