Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactoring minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
This commit is contained in:
@@ -3,6 +3,7 @@
|
||||
#include <string.h>
|
||||
#include <vortex.h>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
#include "common.h"
|
||||
|
||||
#define RT_CHECK(_expr) \
|
||||
@@ -22,8 +23,8 @@ int test = -1;
|
||||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
vx_buffer_h staging_buf = nullptr;
|
||||
kernel_arg_t kernel_arg;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
std::cout << "Vortex Test." << std::endl;
|
||||
@@ -56,9 +57,6 @@ static void parse_args(int argc, char **argv) {
|
||||
}
|
||||
|
||||
void cleanup() {
|
||||
if (staging_buf) {
|
||||
vx_buf_free(staging_buf);
|
||||
}
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
@@ -77,15 +75,15 @@ int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
|
||||
|
||||
int num_blocks_8 = (64 * num_blocks) / 8;
|
||||
|
||||
// update source buffer
|
||||
// update source buffer
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
((uint64_t*)vx_host_ptr(staging_buf))[i] = shuffle(i, value);
|
||||
((uint64_t*)staging_buf.data())[i] = shuffle(i, value);
|
||||
}
|
||||
|
||||
/*for (int i = 0; i < num_blocks; ++i) {
|
||||
std::cout << "data[" << i << "]=0x";
|
||||
for (int j = 7; j >= 0; --j) {
|
||||
std::cout << std::hex << ((uint64_t*)vx_host_ptr(staging_buf))[i * 8 +j];
|
||||
std::cout << std::hex << ((uint64_t*)staging_buf.data())[i * 8 +j];
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}*/
|
||||
@@ -93,24 +91,24 @@ int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
|
||||
// write source buffer to local memory
|
||||
std::cout << "write source buffer to local memory" << std::endl;
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
|
||||
RT_CHECK(vx_copy_to_dev(device, dev_addr, staging_buf.data(), 64 * num_blocks));
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear destination buffer
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
((uint64_t*)vx_host_ptr(staging_buf))[i] = 0;
|
||||
((uint64_t*)staging_buf.data())[i] = 0;
|
||||
}
|
||||
|
||||
// read destination buffer from local memory
|
||||
std::cout << "read destination buffer from local memory" << std::endl;
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_from_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), dev_addr, 64 * num_blocks));
|
||||
auto t3 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
auto curr = ((uint64_t*)vx_host_ptr(staging_buf))[i];
|
||||
auto curr = ((uint64_t*)staging_buf.data())[i];
|
||||
auto ref = shuffle(i, value);
|
||||
if (curr != ref) {
|
||||
std::cout << "error at 0x" << std::hex << (dev_addr + 8 * i)
|
||||
@@ -147,44 +145,44 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
|
||||
|
||||
// update source buffer
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i;
|
||||
}
|
||||
}
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
}
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, buf_size, 0));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
}
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// start device
|
||||
std::cout << "start execution" << std::endl;
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_start(device));
|
||||
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
auto t3 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// read destination buffer from local memory
|
||||
std::cout << "read destination buffer from local memory" << std::endl;
|
||||
auto t4 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
|
||||
auto t5 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int32_t curr = ((int32_t*)vx_host_ptr(staging_buf))[i];
|
||||
int32_t curr = ((int32_t*)staging_buf.data())[i];
|
||||
int32_t ref = i;
|
||||
if (curr != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
@@ -215,9 +213,6 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
size_t value;
|
||||
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
@@ -228,10 +223,11 @@ int main(int argc, char *argv[]) {
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t max_cores;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
|
||||
uint32_t num_points = count;
|
||||
|
||||
uint64_t num_cores;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
|
||||
|
||||
uint32_t num_points = count * num_cores;
|
||||
uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64;
|
||||
uint32_t buf_size = num_blocks * 64;
|
||||
|
||||
@@ -239,20 +235,19 @@ int main(int argc, char *argv[]) {
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
// allocate device memory
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
|
||||
kernel_arg.src_addr = value;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
|
||||
kernel_arg.dst_addr = value;
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.count = num_points;
|
||||
|
||||
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
|
||||
staging_buf.resize(alloc_size);
|
||||
|
||||
// run tests
|
||||
if (0 == test || -1 == test) {
|
||||
@@ -268,9 +263,9 @@ int main(int argc, char *argv[]) {
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (void*)vx_host_ptr(staging_buf);
|
||||
auto buf_ptr = (void*)staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
}
|
||||
|
||||
std::cout << "run kernel test" << std::endl;
|
||||
|
||||
Reference in New Issue
Block a user