prefetch test fixes
This commit is contained in:
@@ -195,6 +195,7 @@ static const char* op_string(const Instr &instr) {
|
|||||||
case 2: return "SPLIT";
|
case 2: return "SPLIT";
|
||||||
case 3: return "JOIN";
|
case 3: return "JOIN";
|
||||||
case 4: return "BAR";
|
case 4: return "BAR";
|
||||||
|
case 5: return "PREFETCH";
|
||||||
default:
|
default:
|
||||||
std::abort();
|
std::abort();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -425,11 +425,11 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||||||
for (int t = 0; t < num_threads; ++t) {
|
for (int t = 0; t < num_threads; ++t) {
|
||||||
if (!tmask_.test(t))
|
if (!tmask_.test(t))
|
||||||
continue;
|
continue;
|
||||||
Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned
|
Word mem_addr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned
|
||||||
Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
|
Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
|
||||||
Word data_read = core_->dcache_read(memAddr, 4);
|
Word data_read = core_->dcache_read(mem_addr, 4);
|
||||||
trace->mem_addrs.at(t).push_back({memAddr, 4});
|
trace->mem_addrs.at(t).push_back({mem_addr, 4});
|
||||||
DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
|
DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << data_read);
|
||||||
switch (func3) {
|
switch (func3) {
|
||||||
case 0:
|
case 0:
|
||||||
// LBI
|
// LBI
|
||||||
@@ -465,10 +465,10 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||||||
case 6: {
|
case 6: {
|
||||||
// load word and unit strided (not checking for unit stride)
|
// load word and unit strided (not checking for unit stride)
|
||||||
for (int i = 0; i < vl_; i++) {
|
for (int i = 0; i < vl_; i++) {
|
||||||
Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
|
Word mem_addr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
|
||||||
DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr);
|
DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << mem_addr);
|
||||||
Word data_read = core_->dcache_read(memAddr, 4);
|
Word data_read = core_->dcache_read(mem_addr, 4);
|
||||||
DP(4, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
|
DP(4, "Mem addr: " << std::hex << mem_addr << " Data read " << data_read);
|
||||||
int *result_ptr = (int *)(vd.data() + i);
|
int *result_ptr = (int *)(vd.data() + i);
|
||||||
*result_ptr = data_read;
|
*result_ptr = data_read;
|
||||||
}
|
}
|
||||||
@@ -490,21 +490,21 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||||||
for (int t = 0; t < num_threads; ++t) {
|
for (int t = 0; t < num_threads; ++t) {
|
||||||
if (!tmask_.test(t))
|
if (!tmask_.test(t))
|
||||||
continue;
|
continue;
|
||||||
Word memAddr = rsdata[t][0] + immsrc;
|
Word mem_addr = rsdata[t][0] + immsrc;
|
||||||
trace->mem_addrs.at(t).push_back({memAddr, (1u << func3)});
|
trace->mem_addrs.at(t).push_back({mem_addr, (1u << func3)});
|
||||||
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr);
|
||||||
switch (func3) {
|
switch (func3) {
|
||||||
case 0:
|
case 0:
|
||||||
// SB
|
// SB
|
||||||
core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1);
|
core_->dcache_write(mem_addr, rsdata[t][1] & 0x000000FF, 1);
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
// SH
|
// SH
|
||||||
core_->dcache_write(memAddr, rsdata[t][1], 2);
|
core_->dcache_write(mem_addr, rsdata[t][1], 2);
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
// SW
|
// SW
|
||||||
core_->dcache_write(memAddr, rsdata[t][1], 4);
|
core_->dcache_write(mem_addr, rsdata[t][1], 4);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
std::abort();
|
std::abort();
|
||||||
@@ -512,14 +512,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < vl_; i++) {
|
for (int i = 0; i < vl_; i++) {
|
||||||
Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
|
Word mem_addr = rsdata[i][0] + (i * vtype_.vsew / 8);
|
||||||
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr);
|
||||||
switch (instr.getVlsWidth()) {
|
switch (instr.getVlsWidth()) {
|
||||||
case 6: {
|
case 6: {
|
||||||
// store word and unit strided (not checking for unit stride)
|
// store word and unit strided (not checking for unit stride)
|
||||||
uint32_t value = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i);
|
uint32_t value = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i);
|
||||||
core_->dcache_write(memAddr, value, 4);
|
core_->dcache_write(mem_addr, value, 4);
|
||||||
DP(4, "store: " << memAddr << " value:" << value);
|
DP(4, "store: " << mem_addr << " value:" << value);
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
std::abort();
|
std::abort();
|
||||||
@@ -888,8 +888,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||||||
for (int t = 0; t < num_threads; ++t) {
|
for (int t = 0; t < num_threads; ++t) {
|
||||||
if (!tmask_.test(t))
|
if (!tmask_.test(t))
|
||||||
continue;
|
continue;
|
||||||
int addr = rsdata[t][0];
|
auto mem_addr = rsdata[t][0];
|
||||||
printf("*** PREFETCHED %d ***\n", addr);
|
trace->mem_addrs.at(t).push_back({mem_addr, 4});
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
|||||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||||
|
|
||||||
OPTS ?= -n64
|
OPTS ?= -n32
|
||||||
|
|
||||||
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
|
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
|
||||||
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
|
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
|
||||||
|
|||||||
@@ -1,24 +1,43 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <vx_intrinsics.h>
|
#include <vx_intrinsics.h>
|
||||||
#include <vx_spawn.h>
|
#include <vx_spawn.h>
|
||||||
|
#include <vx_print.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
void kernel_body(int task_id, const kernel_arg_t* arg) {
|
#define BLOCK_SIZE 64
|
||||||
uint32_t count = arg->task_size;
|
|
||||||
int32_t* src0_ptr = (int32_t*)arg->src0_ptr;
|
|
||||||
int32_t* src1_ptr = (int32_t*)arg->src1_ptr;
|
|
||||||
int32_t* dst_ptr = (int32_t*)arg->dst_ptr;
|
|
||||||
|
|
||||||
|
void kernel_body(int task_id, kernel_arg_t* arg) {
|
||||||
|
uint32_t count = arg->task_size;
|
||||||
uint32_t offset = task_id * count;
|
uint32_t offset = task_id * count;
|
||||||
|
uint32_t num_blocks = (count * 4 + BLOCK_SIZE-1) / BLOCK_SIZE;
|
||||||
|
|
||||||
|
int32_t* src0_ptr = (int32_t*)arg->src0_ptr + offset;
|
||||||
|
int32_t* src1_ptr = (int32_t*)arg->src1_ptr + offset;
|
||||||
|
int32_t* dst_ptr = (int32_t*)arg->dst_ptr + offset;
|
||||||
|
|
||||||
|
uint32_t src0_end = (uint32_t)(src0_ptr + count);
|
||||||
|
uint32_t src1_end = (uint32_t)(src1_ptr + count);
|
||||||
|
|
||||||
for (uint32_t i = 0; i < count; ++i) {
|
for (uint32_t i = 0; i < count; ++i) {
|
||||||
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
|
dst_ptr[i] = src0_ptr[i] + src1_ptr[i];
|
||||||
vx_prefetch((uint32_t)(src0_ptr) + offset + i);
|
|
||||||
vx_prefetch((uint32_t)(src1_ptr) + offset + i);
|
uint32_t src0_mask = ((uint32_t)(src0_ptr + i)) % BLOCK_SIZE;
|
||||||
|
uint32_t src0_next = (uint32_t)(src0_ptr + i + BLOCK_SIZE/4);
|
||||||
|
if (src0_mask == 0 && src0_next < src0_end) {
|
||||||
|
//vx_printf("src0_next=%d\n", src0_next);
|
||||||
|
vx_prefetch(src0_next);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t src1_mask = ((uint32_t)(src1_ptr + i)) % BLOCK_SIZE;
|
||||||
|
uint32_t src1_next = (uint32_t)(src1_ptr + i + BLOCK_SIZE/4);
|
||||||
|
if (src1_mask == 0 && src1_next < src1_end) {
|
||||||
|
//vx_printf("src1_next=%d\n", src1_next);
|
||||||
|
vx_prefetch(src1_next);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
||||||
vx_spawn_tasks(arg->num_tasks, kernel_body, arg);
|
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||||
}
|
}
|
||||||
@@ -112,7 +112,7 @@ int main(int argc, char *argv[]) {
|
|||||||
std::cout << "open device connection" << std::endl;
|
std::cout << "open device connection" << std::endl;
|
||||||
RT_CHECK(vx_dev_open(&device));
|
RT_CHECK(vx_dev_open(&device));
|
||||||
|
|
||||||
unsigned max_cores, max_warps, max_threads;
|
uint64_t max_cores, max_warps, max_threads;
|
||||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
|
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
|
||||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
|
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
|
||||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
|
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
|
||||||
|
|||||||
Reference in New Issue
Block a user