merging with master

This commit is contained in:
fares
2019-11-23 20:40:19 -05:00
16 changed files with 29131 additions and 415 deletions

View File

@@ -131,6 +131,7 @@ void _clCmdParams(int argc, char* argv[]){
// devices have no relationship with context // devices have no relationship with context
void _clInit() void _clInit()
{ {
printf("_clInit()\n");
int DEVICE_ID_INUSED = device_id_inused; int DEVICE_ID_INUSED = device_id_inused;
cl_int resultCL; cl_int resultCL;
@@ -225,15 +226,18 @@ void _clInit()
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)")); throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
//----------------------------------------------- //-----------------------------------------------
//--cambine-5: Load CL file, build CL program object, create CL kernel object //--cambine-5: Load CL file, build CL program object, create CL kernel object
std::string source_str = FileToString(kernel_file); /*std::string source_str = FileToString(kernel_file);
const char * source = source_str.c_str(); const char * source = source_str.c_str();
size_t sourceSize[] = { source_str.length() }; size_t sourceSize[] = { source_str.length() };*/
oclHandles.program = clCreateProgramWithSource(oclHandles.context, oclHandles.program =
clCreateProgramWithBuiltInKernels(oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], "BFS_1, BFS_2", &resultCL);
/*oclHandles.program = clCreateProgramWithSource(oclHandles.context,
1, 1,
&source, &source,
sourceSize, sourceSize,
&resultCL); &resultCL);*/
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)")); throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)"));

View File

@@ -1,33 +1,35 @@
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_INC_PATH = $(wildcard ../include)
POCL_LIB_PATH = $(wildcard ../lib)
VX_RT_PATH = $(wildcard ../../../runtime)
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)
RISCV_TOOL_PATH=$(wildcard ~/dev/riscv-gnu-toolchain/drops) CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
POCL_CC_PATH=$(wildcard ~/dev/pocl/drops_riscv_cc) VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
POCL_RT_PATH=$(wildcard ~/dev/pocl/drops_riscv_rt) VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_RT_PATH=$(wildcard ../../../runtime) VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
VX_SIMX_PATH=$(wildcard ../../../simX/obj_dir)
CC=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
NEWLIB_PATH=$(RISCV_TOOL_PATH)/riscv32-unknown-elf/lib
VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
VX_STR = $(VX_RT_PATH)/startup/vx_start.s
VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_FIO = $(VX_RT_PATH)/fileio/fileio.s
VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS = $(VX_STR) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS = -g -O0 -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main() CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
LIBS = -lOpenCL VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
PROJECT=bfs PROJECT=bfs
@@ -37,7 +39,10 @@ lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a $(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc timer.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf $(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf $(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
@@ -45,8 +50,17 @@ $(PROJECT).hex: $(PROJECT).elf
$(PROJECT).dump: $(PROJECT).elf $(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump $(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: run: $(PROJECT).hex
$(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

28677
benchmarks/opencl/bfs/graph4096.txt Executable file

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -1,10 +1,12 @@
//--by Jianbin Fang //--by Jianbin Fang
#define __CL_ENABLE_EXCEPTIONS
#include <cstdlib> #include <cstdlib>
#include <cstring>
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <cstring> #include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#ifdef PROFILING #ifdef PROFILING
#include "timer.h" #include "timer.h"
@@ -15,95 +17,96 @@
#define MAX_THREADS_PER_BLOCK 256 #define MAX_THREADS_PER_BLOCK 256
//Structure to hold a node information // Structure to hold a node information
struct Node struct Node {
{
int starting; int starting;
int no_of_edges; int no_of_edges;
}; };
//---------------------------------------------------------- //----------------------------------------------------------
//--bfs on cpu //--bfs on cpu
//--programmer: jianbin //--programmer: jianbin
//--date: 26/01/2011 //--date: 26/01/2011
//--note: width is changed to the new_width //--note: width is changed to the new_width
//---------------------------------------------------------- //----------------------------------------------------------
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ int *h_graph_edges, char *h_graph_mask,
char *h_graph_visited, int *h_cost_ref){ char *h_updating_graph_mask, char *h_graph_visited,
int *h_cost_ref) {
char stop; char stop;
int k = 0; int k = 0;
do{ do {
//if no thread changes this value then the loop stops // if no thread changes this value then the loop stops
stop=false; stop = false;
for(int tid = 0; tid < no_of_nodes; tid++ ) for (int tid = 0; tid < no_of_nodes; tid++) {
{ if (h_graph_mask[tid] == true) {
if (h_graph_mask[tid] == true){ h_graph_mask[tid] = false;
h_graph_mask[tid]=false; for (int i = h_graph_nodes[tid].starting;
for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++){ i < (h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting);
int id = h_graph_edges[i]; //--cambine: node id is connected with node tid i++) {
if(!h_graph_visited[id]){ //--cambine: if node id has not been visited, enter the body below int id =
h_cost_ref[id]=h_cost_ref[tid]+1; h_graph_edges[i]; //--cambine: node id is connected with node tid
h_updating_graph_mask[id]=true; if (!h_graph_visited[id]) { //--cambine: if node id has not been
//visited, enter the body below
h_cost_ref[id] = h_cost_ref[tid] + 1;
h_updating_graph_mask[id] = true;
} }
} }
} }
} }
for(int tid=0; tid< no_of_nodes ; tid++ ) for (int tid = 0; tid < no_of_nodes; tid++) {
{ if (h_updating_graph_mask[tid] == true) {
if (h_updating_graph_mask[tid] == true){ h_graph_mask[tid] = true;
h_graph_mask[tid]=true; h_graph_visited[tid] = true;
h_graph_visited[tid]=true; stop = true;
stop=true; h_updating_graph_mask[tid] = false;
h_updating_graph_mask[tid]=false;
} }
} }
k++; k++;
} } while (stop);
while(stop);
} }
//---------------------------------------------------------- //----------------------------------------------------------
//--breadth first search on GPUs //--breadth first search on GPUs
//---------------------------------------------------------- //----------------------------------------------------------
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ int *h_graph_edges, char *h_graph_mask,
char *h_graph_visited, int *h_cost) char *h_updating_graph_mask, char *h_graph_visited,
throw(std::string){ int *h_cost) throw(std::string) {
//int number_elements = height*width; // int number_elements = height*width;
char h_over; char h_over;
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask,
d_graph_visited, d_cost, d_over; d_graph_visited, d_cost, d_over;
try{ try {
//--1 transfer data from host to device //--1 transfer data from host to device
_clInit(); _clInit();
d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes);
d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges);
d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask); d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask);
d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask); d_updating_graph_mask =
d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited); _clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited);
d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost);
d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
d_over = _clMallocRW(sizeof(char), &h_over); d_over = _clMallocRW(sizeof(char), &h_over);
_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes);
_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges);
_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask); _clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask);
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char),
_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited); h_updating_graph_mask);
_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); _clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited);
_clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost);
//--2 invoke kernel //--2 invoke kernel
#ifdef PROFILING #ifdef PROFILING
timer kernel_timer; timer kernel_timer;
double kernel_time = 0.0; double kernel_time = 0.0;
kernel_timer.reset(); kernel_timer.reset();
kernel_timer.start(); kernel_timer.start();
#endif #endif
do{ do {
h_over = false; h_over = false;
_clMemcpyH2D(d_over, sizeof(char), &h_over); _clMemcpyH2D(d_over, sizeof(char), &h_over);
//--kernel 0 //--kernel 0
@@ -117,7 +120,7 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
_clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, d_cost);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//int work_items = no_of_nodes; // int work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
//--kernel 1 //--kernel 1
@@ -129,11 +132,11 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
_clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, d_over);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//work_items = no_of_nodes; // work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
_clMemcpyD2H(d_over,sizeof(char), &h_over); _clMemcpyD2H(d_over, sizeof(char), &h_over);
}while(h_over); } while (h_over);
_clFinish(); _clFinish();
#ifdef PROFILING #ifdef PROFILING
@@ -141,10 +144,10 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
kernel_time = kernel_timer.getTimeInSeconds(); kernel_time = kernel_timer.getTimeInSeconds();
#endif #endif
//--3 transfer data from device to host //--3 transfer data from device to host
_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); _clMemcpyD2H(d_cost, no_of_nodes * sizeof(int), h_cost);
//--statistics //--statistics
#ifdef PROFILING #ifdef PROFILING
std::cout<<"kernel time(s):"<<kernel_time<<std::endl; std::cout << "kernel time(s):" << kernel_time << std::endl;
#endif #endif
//--4 release cl resources. //--4 release cl resources.
_clFree(d_graph_nodes); _clFree(d_graph_nodes);
@@ -155,8 +158,7 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
_clFree(d_cost); _clFree(d_cost);
_clFree(d_over); _clFree(d_over);
_clRelease(); _clRelease();
} } catch (std::string msg) {
catch(std::string msg){
_clFree(d_graph_nodes); _clFree(d_graph_nodes);
_clFree(d_graph_edges); _clFree(d_graph_edges);
_clFree(d_graph_mask); _clFree(d_graph_mask);
@@ -169,126 +171,122 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
e_str += msg; e_str += msg;
throw(e_str); throw(e_str);
} }
return ; return;
} }
void Usage(int argc, char**argv){
fprintf(stderr,"Usage: %s <input_file>\n", argv[0]);
}
//---------------------------------------------------------- //----------------------------------------------------------
//--cambine: main function //--cambine: main function
//--author: created by Jianbin Fang //--author: created by Jianbin Fang
//--date: 25/01/2011 //--date: 25/01/2011
//---------------------------------------------------------- //----------------------------------------------------------
int main(int argc, char * argv[]) int main(int argc, char *argv[]) {
{ printf("enter demo main\n");
int no_of_nodes; int no_of_nodes;
int edge_list_size; int edge_list_size;
FILE *fp; FILE *fp;
Node* h_graph_nodes; Node *h_graph_nodes;
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited; char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
try{
char *input_f;
if(argc!=2){
Usage(argc, argv);
exit(0);
}
input_f = argv[1]; try {
char *input_f = "graph4096.txt";
printf("Reading File\n"); printf("Reading File\n");
//Read in Graph from a file // Read in Graph from a file
fp = fopen(input_f,"r"); fp = fopen(input_f, "r");
if(!fp){ if (!fp) {
printf("Error Reading graph file\n"); printf("Error Reading graph file\n");
return 0; return 0;
} }
printf("Reading File completed!\n");
int source = 0; int source = 0;
fscanf(fp,"%d",&no_of_nodes); fscanf(fp, "%d", &no_of_nodes);
int num_of_blocks = 1; int num_of_blocks = 1;
int num_of_threads_per_block = no_of_nodes; int num_of_threads_per_block = no_of_nodes;
//Make execution Parameters according to the number of nodes // Make execution Parameters according to the number of nodes
//Distribute threads across multiple Blocks if necessary // Distribute threads across multiple Blocks if necessary
if(no_of_nodes>MAX_THREADS_PER_BLOCK){ if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK); num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
num_of_threads_per_block = MAX_THREADS_PER_BLOCK; num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
} }
work_group_size = num_of_threads_per_block; work_group_size = num_of_threads_per_block;
// allocate host memory // allocate host memory
h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes); h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes); h_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes); h_updating_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes); h_graph_visited = (char *)malloc(sizeof(char) * no_of_nodes);
int start, edgeno; int start, edgeno;
// initalize the memory // initalize the memory
for(int i = 0; i < no_of_nodes; i++){ for (int i = 0; i < no_of_nodes; i++) {
fscanf(fp,"%d %d",&start,&edgeno); fscanf(fp, "%d %d", &start, &edgeno);
h_graph_nodes[i].starting = start; h_graph_nodes[i].starting = start;
h_graph_nodes[i].no_of_edges = edgeno; h_graph_nodes[i].no_of_edges = edgeno;
h_graph_mask[i]=false; h_graph_mask[i] = false;
h_updating_graph_mask[i]=false; h_updating_graph_mask[i] = false;
h_graph_visited[i]=false; h_graph_visited[i] = false;
} }
//read the source node from the file // read the source node from the file
fscanf(fp,"%d",&source); fscanf(fp, "%d", &source);
source=0; source = 0;
//set the source node as true in the mask // set the source node as true in the mask
h_graph_mask[source]=true; h_graph_mask[source] = true;
h_graph_visited[source]=true; h_graph_visited[source] = true;
fscanf(fp,"%d",&edge_list_size); fscanf(fp, "%d", &edge_list_size);
int id,cost; int id, cost;
int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size); int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
for(int i=0; i < edge_list_size ; i++){ for (int i = 0; i < edge_list_size; i++) {
fscanf(fp,"%d",&id); fscanf(fp, "%d", &id);
fscanf(fp,"%d",&cost); fscanf(fp, "%d", &cost);
h_graph_edges[i] = id; h_graph_edges[i] = id;
} }
if(fp) if (fp)
fclose(fp); fclose(fp);
// allocate mem for the result on host side // allocate mem for the result on host side
int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes); int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes); int *h_cost_ref = (int *)malloc(sizeof(int) * no_of_nodes);
for(int i=0;i<no_of_nodes;i++){ for (int i = 0; i < no_of_nodes; i++) {
h_cost[i]=-1; h_cost[i] = -1;
h_cost_ref[i] = -1; h_cost_ref[i] = -1;
} }
h_cost[source]=0; h_cost[source] = 0;
h_cost_ref[source]=0; h_cost_ref[source] = 0;
//--------------------------------------------------------- //---------------------------------------------------------
//--gpu entry //--gpu entry
run_bfs_gpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost); run_bfs_gpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
//--------------------------------------------------------- //---------------------------------------------------------
//--cpu entry //--cpu entry
// initalize the memory again // initalize the memory again
for(int i = 0; i < no_of_nodes; i++){ for (int i = 0; i < no_of_nodes; i++) {
h_graph_mask[i]=false; h_graph_mask[i] = false;
h_updating_graph_mask[i]=false; h_updating_graph_mask[i] = false;
h_graph_visited[i]=false; h_graph_visited[i] = false;
} }
//set the source node as true in the mask // set the source node as true in the mask
source=0; source = 0;
h_graph_mask[source]=true; h_graph_mask[source] = true;
h_graph_visited[source]=true; h_graph_visited[source] = true;
run_bfs_cpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost_ref); run_bfs_cpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
h_graph_mask, h_updating_graph_mask, h_graph_visited,
h_cost_ref);
//--------------------------------------------------------- //---------------------------------------------------------
//--result varification //--result varification
compare_results<int>(h_cost_ref, h_cost, no_of_nodes); compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
//release host memory // release host memory
free(h_graph_nodes); free(h_graph_nodes);
free(h_graph_mask); free(h_graph_mask);
free(h_updating_graph_mask); free(h_updating_graph_mask);
free(h_graph_visited); free(h_graph_visited);
} } catch (std::string msg) {
catch(std::string msg){ std::cout << "--cambine: exception in main ->" << msg << std::endl;
std::cout<<"--cambine: exception in main ->"<<msg<<std::endl; // release host memory
//release host memory
free(h_graph_nodes); free(h_graph_nodes);
free(h_graph_mask); free(h_graph_mask);
free(h_updating_graph_mask); free(h_updating_graph_mask);

View File

@@ -3,9 +3,8 @@
#include <iostream> #include <iostream>
class timer { class timer {
public: public:
timer(const char *name = 0); timer(const char *name = 0);
timer(const char *name, std::ostream &write_on_exit); timer(const char *name, std::ostream &write_on_exit);
@@ -17,7 +16,7 @@ class timer {
double getTimeInSeconds(); double getTimeInSeconds();
private: private:
void print_time(std::ostream &, const char *which, double time) const; void print_time(std::ostream &, const char *which, double time) const;
union { union {
@@ -38,91 +37,65 @@ class timer {
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz(); static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
}; };
std::ostream &operator<<(std::ostream &, class timer &);
std::ostream &operator << (std::ostream &, class timer &); inline void timer::reset() {
inline void timer::reset()
{
total_time = 0; total_time = 0;
count = 0; count = 0;
} }
inline timer::timer(const char *name) : name(name), write_on_exit(0) {
inline timer::timer(const char *name)
:
name(name),
write_on_exit(0)
{
reset(); reset();
} }
inline timer::timer(const char *name, std::ostream &write_on_exit) inline timer::timer(const char *name, std::ostream &write_on_exit)
: : name(name), write_on_exit(&write_on_exit) {
name(name),
write_on_exit(&write_on_exit)
{
reset(); reset();
} }
inline timer::~timer() {
inline timer::~timer()
{
if (write_on_exit != 0) if (write_on_exit != 0)
print(*write_on_exit); print(*write_on_exit);
} }
inline void timer::start() {
inline void timer::start()
{
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) #if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx; unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time -= ((unsigned long long) edx << 32) + eax; total_time -= ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) #elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
asm volatile (defined __i386 || defined __x86_64)
( asm volatile("rdtsc\n\t"
"rdtsc\n\t"
"subl %%eax, %0\n\t" "subl %%eax, %0\n\t"
"sbbl %%edx, %1" "sbbl %%edx, %1"
: "+m"(low), "+m"(high)
: :
"+m" (low), "+m" (high) : "eax", "edx");
:
:
"eax", "edx"
);
#else #else
#error Compiler/Architecture not recognized #error Compiler/Architecture not recognized
#endif #endif
} }
inline void timer::stop() {
inline void timer::stop()
{
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) #if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx; unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time += ((unsigned long long) edx << 32) + eax; total_time += ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) #elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
asm volatile (defined __i386 || defined __x86_64)
( asm volatile("rdtsc\n\t"
"rdtsc\n\t"
"addl %%eax, %0\n\t" "addl %%eax, %0\n\t"
"adcl %%edx, %1" "adcl %%edx, %1"
: "+m"(low), "+m"(high)
: :
"+m" (low), "+m" (high) : "eax", "edx");
:
:
"eax", "edx"
);
#endif #endif
++ count; ++count;
} }
#endif #endif

View File

@@ -1,44 +1,60 @@
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_INC_PATH = $(wildcard ../include)
POCL_LIB_PATH = $(wildcard ../lib)
VX_RT_PATH = $(wildcard ../../../runtime)
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)
RISCV_TOOL_PATH=$(wildcard ~/dev/riscv-gnu-toolchain/drops) CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
POCL_CC_PATH=$(wildcard ~/dev/pocl/drops_riscv_cc) VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
POCL_RT_PATH=$(wildcard ~/dev/pocl/drops_riscv_rt) VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_RT_PATH=$(wildcard ../../../runtime) VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
VX_SIMX_PATH=$(wildcard ../../../simX/obj_dir)
CC=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
NEWLIB_PATH=$(RISCV_TOOL_PATH)/riscv32-unknown-elf/lib
VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
VX_STR = $(VX_RT_PATH)/startup/vx_start.s
VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_FIO = $(VX_RT_PATH)/fileio/fileio.s
VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS = $(VX_STR) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS = -g -O0 -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main() CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
LIBS = -lOpenCL VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
PROJECT=kmeans PROJECT=kmeans
PROJECT=saxpy
all: $(PROJECT).dump $(PROJECT).hex all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: kernel.cl lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a kmeans_clustering.o: kmeans_clustering.c
$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc rmse.c read_input.c cluster.c kmeans_clustering.c -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf $(CC) $(CXXFLAGS) -c kmeans_clustering.c
cluster.o: cluster.c
$(CC) $(CXXFLAGS) -c cluster.c
read_input.o: read_input.c
$(CC) $(CXXFLAGS) -c read_input.c
rmse.o: rmse.c
$(CC) $(CXXFLAGS) -c rmse.c
$(PROJECT).elf: main.cc lib$(PROJECT).a read_input.o rmse.o cluster.o kmeans_clustering.o
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc read_input.o rmse.o cluster.o kmeans_clustering.o $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a read_input.o rmse.o cluster.o kmeans_clustering.o
$(CXX) $(CXXFLAGS) main.cc read_input.o rmse.o cluster.o kmeans_clustering.o $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf $(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
@@ -46,8 +62,17 @@ $(PROJECT).hex: $(PROJECT).elf
$(PROJECT).dump: $(PROJECT).elf $(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump $(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: run: $(PROJECT).hex
$(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu $(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu $(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu $(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -34,8 +34,8 @@ HEX: ELF
$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex $(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex
ELF: ELF:
$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC1) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf # $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC1) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC2) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC2) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC3) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf # $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC3) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC4) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf # $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC4) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC5) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf~ # $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC5) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf~

View File

@@ -6,14 +6,17 @@
int main() int main()
{ {
vx_tmc(1); vx_tmc(1);
int n = 5; int n = 5;
int scalar = 10;
int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1}; int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1}; int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1}; int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 5; } for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 5; }
#if 1 #if 0
//--------------------------------------------------------------- //---------------------------------------------------------------
/* vvaddint32 /* vvaddint32
* # vector-vector add routine of 32-bit integers * # vector-vector add routine of 32-bit integers
@@ -43,7 +46,6 @@ int main()
/* # vector-scalar add /* # vector-scalar add
# for (i=0; i<N; i++) { C[i] = A[i] + B; } // 32-bit ints */ # for (i=0; i<N; i++) { C[i] = A[i] + B; } // 32-bit ints */
for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 1;} for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 1;}
int scalar = 10;
printf("vsadd...scalar:%d\na[%d]: ", scalar, n); printf("vsadd...scalar:%d\na[%d]: ", scalar, n);
for(int i = 0; i < n; ++i) printf("%d \n", a[i]); for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
printf("\nb: %d", scalar); printf("\nb: %d", scalar);
@@ -78,10 +80,12 @@ int main()
if(a[i] != b[i]) if(a[i] != b[i])
{ {
printf("\n<memcpy> failed at <index: %d>! \n", i); printf("\n<memcpy> failed at <index: %d>! \n", i);
return; return 1;
} }
} }
printf("\nPASSED.......................... <memcpy> \n"); printf("\nPASSED.......................... <memcpy> \n");
#endif
#if 1
//--------------------------------------------------------------- //---------------------------------------------------------------
/* # void saxpy(size_t n, const float a, const float *x, float *y) /* # void saxpy(size_t n, const float a, const float *x, float *y)
# ==> convert to int!! # ==> convert to int!!
@@ -99,16 +103,22 @@ int main()
vx_vec_saxpy(n, scalar, a, b); vx_vec_saxpy(n, scalar, a, b);
printf("saxpy\na[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
printf("\nb[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
for(int i = 0; i < n; ++i) for(int i = 0; i < n; ++i)
{ {
if(b[i] != ((a[i] * scalar) + c[i])) if(b[i] != ((a[i] * scalar) + c[i]))
{ {
printf("\n<saxpy> failed at <index: %d>! \n", i); printf("\n<saxpy> failed at <index: %d>! \n", i);
return; return 1;
} }
} }
printf("\nPASSED.......................... <saxpy> \n"); printf("\nPASSED.......................... <saxpy> \n");
#endif
#if 0
//--------------------------------------------------------------- //---------------------------------------------------------------
/* # void sgemm_nn(size_t n, size_t m, size_t k, const float*a, // m * k matrix /* # void sgemm_nn(size_t n, size_t m, size_t k, const float*a, // m * k matrix
# size_t lda, const float*b, // k * n matrix # size_t lda, const float*b, // k * n matrix

View File

@@ -5,10 +5,10 @@
extern "C" { extern "C" {
#endif #endif
void vx_vec_vvaddint32(int n, int* a, int* b, int *c); //void vx_vec_vvaddint32(int n, int* a, int* b, int *c);
//void vx_vec_vsadd(int n, int* a, int scalar); //void vx_vec_vsadd(int n, int* a, int scalar);
//void vx_vec_memcpy(int* a, int* b, int n); //void vx_vec_memcpy(int* a, int* b, int n);
//void vx_vec_saxpy(int n, int scalar, int* a, int* b); void vx_vec_saxpy(int n, int scalar, int* a, int* b);
//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc); //void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc);
#ifdef __cplusplus #ifdef __cplusplus

View File

@@ -13,16 +13,31 @@
# fa0 a # fa0 a
# a1 x # a1 x
# a2 y # a2 y
#vx_vec_saxpy:
# vsetvli a4, a0, e32, m8
#saxpy:
# vlw.v v0, (a1)
# sub a0, a0, a4
# slli a4, a4, 2
# add a1, a1, a4
# vlw.v v8, (a2)
# vfmacc.vf v8, fa0, v0
# vsw.v v8, (a2)
# add a2, a2, a4
# bnez a0, saxpy
# ret
# a0 n, rs1 a, a2 x, a3 y
vx_vec_saxpy: vx_vec_saxpy:
vsetvli a4, a0, e32, m8 vsetvli a4, a0, e32, m8
saxpy: saxpy:
vlw.v v0, (a1) vlw.v v0, (a2)
sub a0, a0, a4 sub a0, a0, a4
slli a4, a4, 2 slli a4, a4, 2
add a1, a1, a4
vlw.v v8, (a2)
vfmacc.vf v8, fa0, v0
vsw.v v8, (a2)
add a2, a2, a4 add a2, a2, a4
vlw.v v1, (a3)
vmacc.vx v1, rs1, v0
vsw.v v1, (a3)
add a3, a3, a4
bnez a0, saxpy bnez a0, saxpy
ret ret