merging with master

This commit is contained in:
fares
2019-11-23 20:40:19 -05:00
16 changed files with 29131 additions and 415 deletions

View File

@@ -131,6 +131,7 @@ void _clCmdParams(int argc, char* argv[]){
// devices have no relationship with context // devices have no relationship with context
void _clInit() void _clInit()
{ {
printf("_clInit()\n");
int DEVICE_ID_INUSED = device_id_inused; int DEVICE_ID_INUSED = device_id_inused;
cl_int resultCL; cl_int resultCL;
@@ -225,15 +226,18 @@ void _clInit()
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)")); throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
//----------------------------------------------- //-----------------------------------------------
//--cambine-5: Load CL file, build CL program object, create CL kernel object //--cambine-5: Load CL file, build CL program object, create CL kernel object
std::string source_str = FileToString(kernel_file); /*std::string source_str = FileToString(kernel_file);
const char * source = source_str.c_str(); const char * source = source_str.c_str();
size_t sourceSize[] = { source_str.length() }; size_t sourceSize[] = { source_str.length() };*/
oclHandles.program = clCreateProgramWithSource(oclHandles.context, oclHandles.program =
clCreateProgramWithBuiltInKernels(oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], "BFS_1, BFS_2", &resultCL);
/*oclHandles.program = clCreateProgramWithSource(oclHandles.context,
1, 1,
&source, &source,
sourceSize, sourceSize,
&resultCL); &resultCL);*/
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)")); throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)"));

View File

@@ -1,33 +1,35 @@
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_INC_PATH = $(wildcard ../include)
POCL_LIB_PATH = $(wildcard ../lib)
VX_RT_PATH = $(wildcard ../../../runtime)
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)
RISCV_TOOL_PATH=$(wildcard ~/dev/riscv-gnu-toolchain/drops) CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
POCL_CC_PATH=$(wildcard ~/dev/pocl/drops_riscv_cc) VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
POCL_RT_PATH=$(wildcard ~/dev/pocl/drops_riscv_rt) VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_RT_PATH=$(wildcard ../../../runtime) VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
VX_SIMX_PATH=$(wildcard ../../../simX/obj_dir)
CC=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
NEWLIB_PATH=$(RISCV_TOOL_PATH)/riscv32-unknown-elf/lib
VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
VX_STR = $(VX_RT_PATH)/startup/vx_start.s
VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_FIO = $(VX_RT_PATH)/fileio/fileio.s
VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS = $(VX_STR) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS = -g -O0 -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main() CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
LIBS = -lOpenCL VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
PROJECT=bfs PROJECT=bfs
@@ -37,7 +39,10 @@ lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a $(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc timer.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf $(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf $(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
@@ -45,8 +50,17 @@ $(PROJECT).hex: $(PROJECT).elf
$(PROJECT).dump: $(PROJECT).elf $(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump $(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: run: $(PROJECT).hex
$(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

28677
benchmarks/opencl/bfs/graph4096.txt Executable file

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -1,12 +1,14 @@
//--by Jianbin Fang //--by Jianbin Fang
#define __CL_ENABLE_EXCEPTIONS
#include <cstdlib> #include <cstdlib>
#include <cstring>
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <cstring> #include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#ifdef PROFILING #ifdef PROFILING
#include "timer.h" #include "timer.h"
#endif #endif
@@ -15,285 +17,281 @@
#define MAX_THREADS_PER_BLOCK 256 #define MAX_THREADS_PER_BLOCK 256
//Structure to hold a node information // Structure to hold a node information
struct Node struct Node {
{ int starting;
int starting; int no_of_edges;
int no_of_edges;
}; };
//---------------------------------------------------------- //----------------------------------------------------------
//--bfs on cpu //--bfs on cpu
//--programmer: jianbin //--programmer: jianbin
//--date: 26/01/2011 //--date: 26/01/2011
//--note: width is changed to the new_width //--note: width is changed to the new_width
//---------------------------------------------------------- //----------------------------------------------------------
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ int *h_graph_edges, char *h_graph_mask,
char *h_graph_visited, int *h_cost_ref){ char *h_updating_graph_mask, char *h_graph_visited,
char stop; int *h_cost_ref) {
int k = 0; char stop;
do{ int k = 0;
//if no thread changes this value then the loop stops do {
stop=false; // if no thread changes this value then the loop stops
for(int tid = 0; tid < no_of_nodes; tid++ ) stop = false;
{ for (int tid = 0; tid < no_of_nodes; tid++) {
if (h_graph_mask[tid] == true){ if (h_graph_mask[tid] == true) {
h_graph_mask[tid]=false; h_graph_mask[tid] = false;
for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++){ for (int i = h_graph_nodes[tid].starting;
int id = h_graph_edges[i]; //--cambine: node id is connected with node tid i < (h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting);
if(!h_graph_visited[id]){ //--cambine: if node id has not been visited, enter the body below i++) {
h_cost_ref[id]=h_cost_ref[tid]+1; int id =
h_updating_graph_mask[id]=true; h_graph_edges[i]; //--cambine: node id is connected with node tid
} if (!h_graph_visited[id]) { //--cambine: if node id has not been
} //visited, enter the body below
} h_cost_ref[id] = h_cost_ref[tid] + 1;
} h_updating_graph_mask[id] = true;
}
}
}
}
for(int tid=0; tid< no_of_nodes ; tid++ ) for (int tid = 0; tid < no_of_nodes; tid++) {
{ if (h_updating_graph_mask[tid] == true) {
if (h_updating_graph_mask[tid] == true){ h_graph_mask[tid] = true;
h_graph_mask[tid]=true; h_graph_visited[tid] = true;
h_graph_visited[tid]=true; stop = true;
stop=true; h_updating_graph_mask[tid] = false;
h_updating_graph_mask[tid]=false; }
} }
} k++;
k++; } while (stop);
}
while(stop);
} }
//---------------------------------------------------------- //----------------------------------------------------------
//--breadth first search on GPUs //--breadth first search on GPUs
//---------------------------------------------------------- //----------------------------------------------------------
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ int *h_graph_edges, char *h_graph_mask,
char *h_graph_visited, int *h_cost) char *h_updating_graph_mask, char *h_graph_visited,
throw(std::string){ int *h_cost) throw(std::string) {
//int number_elements = height*width; // int number_elements = height*width;
char h_over; char h_over;
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask,
d_graph_visited, d_cost, d_over; d_graph_visited, d_cost, d_over;
try{ try {
//--1 transfer data from host to device //--1 transfer data from host to device
_clInit(); _clInit();
d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes);
d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges);
d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask); d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask);
d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask); d_updating_graph_mask =
d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited); _clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited);
d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost);
d_over = _clMallocRW(sizeof(char), &h_over);
d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); _clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes);
d_over = _clMallocRW(sizeof(char), &h_over); _clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges);
_clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask);
_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char),
_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); h_updating_graph_mask);
_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited);
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask); _clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost);
_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited);
_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel
#ifdef PROFILING
//--2 invoke kernel timer kernel_timer;
#ifdef PROFILING double kernel_time = 0.0;
timer kernel_timer; kernel_timer.reset();
double kernel_time = 0.0; kernel_timer.start();
kernel_timer.reset();
kernel_timer.start();
#endif #endif
do{ do {
h_over = false; h_over = false;
_clMemcpyH2D(d_over, sizeof(char), &h_over); _clMemcpyH2D(d_over, sizeof(char), &h_over);
//--kernel 0 //--kernel 0
int kernel_id = 0; int kernel_id = 0;
int kernel_idx = 0; int kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
_clSetArgs(kernel_id, kernel_idx++, d_graph_edges); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, d_cost);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//int work_items = no_of_nodes; // int work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
//--kernel 1 //--kernel 1
kernel_id = 1; kernel_id = 1;
kernel_idx = 0; kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, d_over);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//work_items = no_of_nodes; // work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
_clMemcpyD2H(d_over,sizeof(char), &h_over); _clMemcpyD2H(d_over, sizeof(char), &h_over);
}while(h_over); } while (h_over);
_clFinish(); _clFinish();
#ifdef PROFILING #ifdef PROFILING
kernel_timer.stop(); kernel_timer.stop();
kernel_time = kernel_timer.getTimeInSeconds(); kernel_time = kernel_timer.getTimeInSeconds();
#endif #endif
//--3 transfer data from device to host //--3 transfer data from device to host
_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); _clMemcpyD2H(d_cost, no_of_nodes * sizeof(int), h_cost);
//--statistics //--statistics
#ifdef PROFILING #ifdef PROFILING
std::cout<<"kernel time(s):"<<kernel_time<<std::endl; std::cout << "kernel time(s):" << kernel_time << std::endl;
#endif #endif
//--4 release cl resources. //--4 release cl resources.
_clFree(d_graph_nodes); _clFree(d_graph_nodes);
_clFree(d_graph_edges); _clFree(d_graph_edges);
_clFree(d_graph_mask); _clFree(d_graph_mask);
_clFree(d_updating_graph_mask); _clFree(d_updating_graph_mask);
_clFree(d_graph_visited); _clFree(d_graph_visited);
_clFree(d_cost); _clFree(d_cost);
_clFree(d_over); _clFree(d_over);
_clRelease(); _clRelease();
} } catch (std::string msg) {
catch(std::string msg){ _clFree(d_graph_nodes);
_clFree(d_graph_nodes); _clFree(d_graph_edges);
_clFree(d_graph_edges); _clFree(d_graph_mask);
_clFree(d_graph_mask); _clFree(d_updating_graph_mask);
_clFree(d_updating_graph_mask); _clFree(d_graph_visited);
_clFree(d_graph_visited); _clFree(d_cost);
_clFree(d_cost); _clFree(d_over);
_clFree(d_over); _clRelease();
_clRelease(); std::string e_str = "in run_transpose_gpu -> ";
std::string e_str = "in run_transpose_gpu -> "; e_str += msg;
e_str += msg; throw(e_str);
throw(e_str); }
} return;
return ;
} }
void Usage(int argc, char**argv){
fprintf(stderr,"Usage: %s <input_file>\n", argv[0]);
}
//---------------------------------------------------------- //----------------------------------------------------------
//--cambine: main function //--cambine: main function
//--author: created by Jianbin Fang //--author: created by Jianbin Fang
//--date: 25/01/2011 //--date: 25/01/2011
//---------------------------------------------------------- //----------------------------------------------------------
int main(int argc, char * argv[]) int main(int argc, char *argv[]) {
{ printf("enter demo main\n");
int no_of_nodes;
int edge_list_size;
FILE *fp;
Node* h_graph_nodes;
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
try{
char *input_f;
if(argc!=2){
Usage(argc, argv);
exit(0);
}
input_f = argv[1];
printf("Reading File\n");
//Read in Graph from a file
fp = fopen(input_f,"r");
if(!fp){
printf("Error Reading graph file\n");
return 0;
}
int source = 0; int no_of_nodes;
int edge_list_size;
FILE *fp;
Node *h_graph_nodes;
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
fscanf(fp,"%d",&no_of_nodes); try {
char *input_f = "graph4096.txt";
printf("Reading File\n");
// Read in Graph from a file
fp = fopen(input_f, "r");
if (!fp) {
printf("Error Reading graph file\n");
return 0;
}
int num_of_blocks = 1; printf("Reading File completed!\n");
int num_of_threads_per_block = no_of_nodes;
//Make execution Parameters according to the number of nodes int source = 0;
//Distribute threads across multiple Blocks if necessary
if(no_of_nodes>MAX_THREADS_PER_BLOCK){
num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK);
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
}
work_group_size = num_of_threads_per_block;
// allocate host memory
h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes);
int start, edgeno;
// initalize the memory
for(int i = 0; i < no_of_nodes; i++){
fscanf(fp,"%d %d",&start,&edgeno);
h_graph_nodes[i].starting = start;
h_graph_nodes[i].no_of_edges = edgeno;
h_graph_mask[i]=false;
h_updating_graph_mask[i]=false;
h_graph_visited[i]=false;
}
//read the source node from the file
fscanf(fp,"%d",&source);
source=0;
//set the source node as true in the mask
h_graph_mask[source]=true;
h_graph_visited[source]=true;
fscanf(fp,"%d",&edge_list_size);
int id,cost;
int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);
for(int i=0; i < edge_list_size ; i++){
fscanf(fp,"%d",&id);
fscanf(fp,"%d",&cost);
h_graph_edges[i] = id;
}
if(fp) fscanf(fp, "%d", &no_of_nodes);
fclose(fp);
// allocate mem for the result on host side
int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes);
int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes);
for(int i=0;i<no_of_nodes;i++){
h_cost[i]=-1;
h_cost_ref[i] = -1;
}
h_cost[source]=0;
h_cost_ref[source]=0;
//---------------------------------------------------------
//--gpu entry
run_bfs_gpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
//---------------------------------------------------------
//--cpu entry
// initalize the memory again
for(int i = 0; i < no_of_nodes; i++){
h_graph_mask[i]=false;
h_updating_graph_mask[i]=false;
h_graph_visited[i]=false;
}
//set the source node as true in the mask
source=0;
h_graph_mask[source]=true;
h_graph_visited[source]=true;
run_bfs_cpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost_ref);
//---------------------------------------------------------
//--result varification
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
//release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
} int num_of_blocks = 1;
catch(std::string msg){ int num_of_threads_per_block = no_of_nodes;
std::cout<<"--cambine: exception in main ->"<<msg<<std::endl;
//release host memory // Make execution Parameters according to the number of nodes
free(h_graph_nodes); // Distribute threads across multiple Blocks if necessary
free(h_graph_mask); if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
free(h_updating_graph_mask); num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
free(h_graph_visited); num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
} }
work_group_size = num_of_threads_per_block;
return 0; // allocate host memory
h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
h_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
h_updating_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
h_graph_visited = (char *)malloc(sizeof(char) * no_of_nodes);
int start, edgeno;
// initalize the memory
for (int i = 0; i < no_of_nodes; i++) {
fscanf(fp, "%d %d", &start, &edgeno);
h_graph_nodes[i].starting = start;
h_graph_nodes[i].no_of_edges = edgeno;
h_graph_mask[i] = false;
h_updating_graph_mask[i] = false;
h_graph_visited[i] = false;
}
// read the source node from the file
fscanf(fp, "%d", &source);
source = 0;
// set the source node as true in the mask
h_graph_mask[source] = true;
h_graph_visited[source] = true;
fscanf(fp, "%d", &edge_list_size);
int id, cost;
int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
for (int i = 0; i < edge_list_size; i++) {
fscanf(fp, "%d", &id);
fscanf(fp, "%d", &cost);
h_graph_edges[i] = id;
}
if (fp)
fclose(fp);
// allocate mem for the result on host side
int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
int *h_cost_ref = (int *)malloc(sizeof(int) * no_of_nodes);
for (int i = 0; i < no_of_nodes; i++) {
h_cost[i] = -1;
h_cost_ref[i] = -1;
}
h_cost[source] = 0;
h_cost_ref[source] = 0;
//---------------------------------------------------------
//--gpu entry
run_bfs_gpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
//---------------------------------------------------------
//--cpu entry
// initalize the memory again
for (int i = 0; i < no_of_nodes; i++) {
h_graph_mask[i] = false;
h_updating_graph_mask[i] = false;
h_graph_visited[i] = false;
}
// set the source node as true in the mask
source = 0;
h_graph_mask[source] = true;
h_graph_visited[source] = true;
run_bfs_cpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
h_graph_mask, h_updating_graph_mask, h_graph_visited,
h_cost_ref);
//---------------------------------------------------------
//--result varification
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
// release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
} catch (std::string msg) {
std::cout << "--cambine: exception in main ->" << msg << std::endl;
// release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
}
return 0;
} }

View File

@@ -3,126 +3,99 @@
#include <iostream> #include <iostream>
class timer { class timer {
public: public:
timer(const char *name = 0); timer(const char *name = 0);
timer(const char *name, std::ostream &write_on_exit); timer(const char *name, std::ostream &write_on_exit);
~timer(); ~timer();
void start(), stop(); void start(), stop();
void reset(); void reset();
std::ostream &print(std::ostream &); std::ostream &print(std::ostream &);
double getTimeInSeconds(); double getTimeInSeconds();
private: private:
void print_time(std::ostream &, const char *which, double time) const; void print_time(std::ostream &, const char *which, double time) const;
union { union {
long long total_time; long long total_time;
struct { struct {
#if defined __PPC__ #if defined __PPC__
int high, low; int high, low;
#else #else
int low, high; int low, high;
#endif #endif
}; };
}; };
unsigned long long count; unsigned long long count;
const char *const name; const char *const name;
std::ostream *const write_on_exit; std::ostream *const write_on_exit;
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz(); static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
}; };
std::ostream &operator<<(std::ostream &, class timer &);
std::ostream &operator << (std::ostream &, class timer &); inline void timer::reset() {
total_time = 0;
count = 0;
inline void timer::reset()
{
total_time = 0;
count = 0;
} }
inline timer::timer(const char *name) : name(name), write_on_exit(0) {
inline timer::timer(const char *name) reset();
:
name(name),
write_on_exit(0)
{
reset();
} }
inline timer::timer(const char *name, std::ostream &write_on_exit) inline timer::timer(const char *name, std::ostream &write_on_exit)
: : name(name), write_on_exit(&write_on_exit) {
name(name), reset();
write_on_exit(&write_on_exit)
{
reset();
} }
inline timer::~timer() {
inline timer::~timer() if (write_on_exit != 0)
{ print(*write_on_exit);
if (write_on_exit != 0)
print(*write_on_exit);
} }
inline void timer::start() {
inline void timer::start()
{
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) #if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx; unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time -= ((unsigned long long) edx << 32) + eax; total_time -= ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) #elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
asm volatile (defined __i386 || defined __x86_64)
( asm volatile("rdtsc\n\t"
"rdtsc\n\t" "subl %%eax, %0\n\t"
"subl %%eax, %0\n\t" "sbbl %%edx, %1"
"sbbl %%edx, %1" : "+m"(low), "+m"(high)
: :
"+m" (low), "+m" (high) : "eax", "edx");
:
:
"eax", "edx"
);
#else #else
#error Compiler/Architecture not recognized #error Compiler/Architecture not recognized
#endif #endif
} }
inline void timer::stop() {
inline void timer::stop()
{
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) #if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx; unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time += ((unsigned long long) edx << 32) + eax; total_time += ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) #elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
asm volatile (defined __i386 || defined __x86_64)
( asm volatile("rdtsc\n\t"
"rdtsc\n\t" "addl %%eax, %0\n\t"
"addl %%eax, %0\n\t" "adcl %%edx, %1"
"adcl %%edx, %1" : "+m"(low), "+m"(high)
: :
"+m" (low), "+m" (high) : "eax", "edx");
:
:
"eax", "edx"
);
#endif #endif
++ count; ++count;
} }
#endif #endif

View File

@@ -1,44 +1,60 @@
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_INC_PATH = $(wildcard ../include)
POCL_LIB_PATH = $(wildcard ../lib)
VX_RT_PATH = $(wildcard ../../../runtime)
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)
RISCV_TOOL_PATH=$(wildcard ~/dev/riscv-gnu-toolchain/drops) CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
POCL_CC_PATH=$(wildcard ~/dev/pocl/drops_riscv_cc) VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
POCL_RT_PATH=$(wildcard ~/dev/pocl/drops_riscv_rt) VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_RT_PATH=$(wildcard ../../../runtime) VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
VX_SIMX_PATH=$(wildcard ../../../simX/obj_dir)
CC=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
NEWLIB_PATH=$(RISCV_TOOL_PATH)/riscv32-unknown-elf/lib
VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
VX_STR = $(VX_RT_PATH)/startup/vx_start.s
VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_FIO = $(VX_RT_PATH)/fileio/fileio.s
VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS = $(VX_STR) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS = -g -O0 -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main() CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
LIBS = -lOpenCL VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
PROJECT=kmeans PROJECT=kmeans
PROJECT=saxpy
all: $(PROJECT).dump $(PROJECT).hex all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: kernel.cl lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a kmeans_clustering.o: kmeans_clustering.c
$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc rmse.c read_input.c cluster.c kmeans_clustering.c -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf $(CC) $(CXXFLAGS) -c kmeans_clustering.c
cluster.o: cluster.c
$(CC) $(CXXFLAGS) -c cluster.c
read_input.o: read_input.c
$(CC) $(CXXFLAGS) -c read_input.c
rmse.o: rmse.c
$(CC) $(CXXFLAGS) -c rmse.c
$(PROJECT).elf: main.cc lib$(PROJECT).a read_input.o rmse.o cluster.o kmeans_clustering.o
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc read_input.o rmse.o cluster.o kmeans_clustering.o $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a read_input.o rmse.o cluster.o kmeans_clustering.o
$(CXX) $(CXXFLAGS) main.cc read_input.o rmse.o cluster.o kmeans_clustering.o $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf $(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
@@ -46,8 +62,17 @@ $(PROJECT).hex: $(PROJECT).elf
$(PROJECT).dump: $(PROJECT).elf $(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump $(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: run: $(PROJECT).hex
$(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu $(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu $(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu $(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex rm -rf *.elf *.dump *.hex

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu $(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -34,8 +34,8 @@ HEX: ELF
$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex $(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex
ELF: ELF:
$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC1) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf # $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC1) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC2) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC2) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC3) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf # $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC3) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC4) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf # $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC4) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC5) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf~ # $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC5) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf~

View File

@@ -6,14 +6,17 @@
int main() int main()
{ {
vx_tmc(1); vx_tmc(1);
int n = 5; int n = 5;
int scalar = 10;
int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1}; int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1}; int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1}; int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 5; } for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 5; }
#if 1 #if 0
//--------------------------------------------------------------- //---------------------------------------------------------------
/* vvaddint32 /* vvaddint32
* # vector-vector add routine of 32-bit integers * # vector-vector add routine of 32-bit integers
@@ -43,7 +46,6 @@ int main()
/* # vector-scalar add /* # vector-scalar add
# for (i=0; i<N; i++) { C[i] = A[i] + B; } // 32-bit ints */ # for (i=0; i<N; i++) { C[i] = A[i] + B; } // 32-bit ints */
for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 1;} for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 1;}
int scalar = 10;
printf("vsadd...scalar:%d\na[%d]: ", scalar, n); printf("vsadd...scalar:%d\na[%d]: ", scalar, n);
for(int i = 0; i < n; ++i) printf("%d \n", a[i]); for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
printf("\nb: %d", scalar); printf("\nb: %d", scalar);
@@ -78,10 +80,12 @@ int main()
if(a[i] != b[i]) if(a[i] != b[i])
{ {
printf("\n<memcpy> failed at <index: %d>! \n", i); printf("\n<memcpy> failed at <index: %d>! \n", i);
return; return 1;
} }
} }
printf("\nPASSED.......................... <memcpy> \n"); printf("\nPASSED.......................... <memcpy> \n");
#endif
#if 1
//--------------------------------------------------------------- //---------------------------------------------------------------
/* # void saxpy(size_t n, const float a, const float *x, float *y) /* # void saxpy(size_t n, const float a, const float *x, float *y)
# ==> convert to int!! # ==> convert to int!!
@@ -99,16 +103,22 @@ int main()
vx_vec_saxpy(n, scalar, a, b); vx_vec_saxpy(n, scalar, a, b);
printf("saxpy\na[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
printf("\nb[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
for(int i = 0; i < n; ++i) for(int i = 0; i < n; ++i)
{ {
if(b[i] != ((a[i] * scalar) + c[i])) if(b[i] != ((a[i] * scalar) + c[i]))
{ {
printf("\n<saxpy> failed at <index: %d>! \n", i); printf("\n<saxpy> failed at <index: %d>! \n", i);
return; return 1;
} }
} }
printf("\nPASSED.......................... <saxpy> \n"); printf("\nPASSED.......................... <saxpy> \n");
#endif
#if 0
//--------------------------------------------------------------- //---------------------------------------------------------------
/* # void sgemm_nn(size_t n, size_t m, size_t k, const float*a, // m * k matrix /* # void sgemm_nn(size_t n, size_t m, size_t k, const float*a, // m * k matrix
# size_t lda, const float*b, // k * n matrix # size_t lda, const float*b, // k * n matrix

View File

@@ -5,10 +5,10 @@
extern "C" { extern "C" {
#endif #endif
void vx_vec_vvaddint32(int n, int* a, int* b, int *c); //void vx_vec_vvaddint32(int n, int* a, int* b, int *c);
//void vx_vec_vsadd(int n, int* a, int scalar); //void vx_vec_vsadd(int n, int* a, int scalar);
//void vx_vec_memcpy(int* a, int* b, int n); //void vx_vec_memcpy(int* a, int* b, int n);
//void vx_vec_saxpy(int n, int scalar, int* a, int* b); void vx_vec_saxpy(int n, int scalar, int* a, int* b);
//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc); //void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc);
#ifdef __cplusplus #ifdef __cplusplus

View File

@@ -13,16 +13,31 @@
# fa0 a # fa0 a
# a1 x # a1 x
# a2 y # a2 y
#vx_vec_saxpy:
# vsetvli a4, a0, e32, m8
#saxpy:
# vlw.v v0, (a1)
# sub a0, a0, a4
# slli a4, a4, 2
# add a1, a1, a4
# vlw.v v8, (a2)
# vfmacc.vf v8, fa0, v0
# vsw.v v8, (a2)
# add a2, a2, a4
# bnez a0, saxpy
# ret
# a0 n, rs1 a, a2 x, a3 y
vx_vec_saxpy: vx_vec_saxpy:
vsetvli a4, a0, e32, m8 vsetvli a4, a0, e32, m8
saxpy: saxpy:
vlw.v v0, (a1) vlw.v v0, (a2)
sub a0, a0, a4 sub a0, a0, a4
slli a4, a4, 2 slli a4, a4, 2
add a1, a1, a4
vlw.v v8, (a2)
vfmacc.vf v8, fa0, v0
vsw.v v8, (a2)
add a2, a2, a4 add a2, a2, a4
vlw.v v1, (a3)
vmacc.vx v1, rs1, v0
vsw.v v1, (a3)
add a3, a3, a4
bnez a0, saxpy bnez a0, saxpy
ret ret