This commit is contained in:
Blaise Tine
2019-11-23 08:36:00 -05:00
parent ef8c512ebe
commit 76417b0561
9 changed files with 404 additions and 396 deletions

View File

@@ -1,33 +1,35 @@
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_INC_PATH = $(wildcard ../include)
POCL_LIB_PATH = $(wildcard ../lib)
VX_RT_PATH = $(wildcard ../../../runtime)
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)
RISCV_TOOL_PATH=$(wildcard ~/dev/riscv-gnu-toolchain/drops) CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
POCL_CC_PATH=$(wildcard ~/dev/pocl/drops_riscv_cc) VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
POCL_RT_PATH=$(wildcard ~/dev/pocl/drops_riscv_rt) VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_RT_PATH=$(wildcard ../../../runtime) VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
VX_SIMX_PATH=$(wildcard ../../../simX/obj_dir)
CC=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
NEWLIB_PATH=$(RISCV_TOOL_PATH)/riscv32-unknown-elf/lib
VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
VX_STR = $(VX_RT_PATH)/startup/vx_start.s
VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_FIO = $(VX_RT_PATH)/fileio/fileio.s
VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS = $(VX_STR) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS = -g -O0 -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main() CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
LIBS = -lOpenCL VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
PROJECT=bfs PROJECT=bfs
@@ -37,7 +39,10 @@ lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a $(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc timer.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf $(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf $(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
@@ -45,8 +50,17 @@ $(PROJECT).hex: $(PROJECT).elf
$(PROJECT).dump: $(PROJECT).elf $(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump $(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: run: $(PROJECT).hex
$(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

Binary file not shown.

View File

@@ -1,12 +1,14 @@
//--by Jianbin Fang //--by Jianbin Fang
#define __CL_ENABLE_EXCEPTIONS
#include <cstdlib> #include <cstdlib>
#include <cstring>
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <cstring> #include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#ifdef PROFILING #ifdef PROFILING
#include "timer.h" #include "timer.h"
#endif #endif
@@ -15,285 +17,279 @@
#define MAX_THREADS_PER_BLOCK 256 #define MAX_THREADS_PER_BLOCK 256
//Structure to hold a node information // Structure to hold a node information
struct Node struct Node {
{ int starting;
int starting; int no_of_edges;
int no_of_edges;
}; };
//---------------------------------------------------------- //----------------------------------------------------------
//--bfs on cpu //--bfs on cpu
//--programmer: jianbin //--programmer: jianbin
//--date: 26/01/2011 //--date: 26/01/2011
//--note: width is changed to the new_width //--note: width is changed to the new_width
//---------------------------------------------------------- //----------------------------------------------------------
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ int *h_graph_edges, char *h_graph_mask,
char *h_graph_visited, int *h_cost_ref){ char *h_updating_graph_mask, char *h_graph_visited,
char stop; int *h_cost_ref) {
int k = 0; char stop;
do{ int k = 0;
//if no thread changes this value then the loop stops do {
stop=false; // if no thread changes this value then the loop stops
for(int tid = 0; tid < no_of_nodes; tid++ ) stop = false;
{ for (int tid = 0; tid < no_of_nodes; tid++) {
if (h_graph_mask[tid] == true){ if (h_graph_mask[tid] == true) {
h_graph_mask[tid]=false; h_graph_mask[tid] = false;
for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++){ for (int i = h_graph_nodes[tid].starting;
int id = h_graph_edges[i]; //--cambine: node id is connected with node tid i < (h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting);
if(!h_graph_visited[id]){ //--cambine: if node id has not been visited, enter the body below i++) {
h_cost_ref[id]=h_cost_ref[tid]+1; int id =
h_updating_graph_mask[id]=true; h_graph_edges[i]; //--cambine: node id is connected with node tid
} if (!h_graph_visited[id]) { //--cambine: if node id has not been
} //visited, enter the body below
} h_cost_ref[id] = h_cost_ref[tid] + 1;
} h_updating_graph_mask[id] = true;
}
}
}
}
for(int tid=0; tid< no_of_nodes ; tid++ ) for (int tid = 0; tid < no_of_nodes; tid++) {
{ if (h_updating_graph_mask[tid] == true) {
if (h_updating_graph_mask[tid] == true){ h_graph_mask[tid] = true;
h_graph_mask[tid]=true; h_graph_visited[tid] = true;
h_graph_visited[tid]=true; stop = true;
stop=true; h_updating_graph_mask[tid] = false;
h_updating_graph_mask[tid]=false; }
} }
} k++;
k++; } while (stop);
}
while(stop);
} }
//---------------------------------------------------------- //----------------------------------------------------------
//--breadth first search on GPUs //--breadth first search on GPUs
//---------------------------------------------------------- //----------------------------------------------------------
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ int *h_graph_edges, char *h_graph_mask,
char *h_graph_visited, int *h_cost) char *h_updating_graph_mask, char *h_graph_visited,
throw(std::string){ int *h_cost) throw(std::string) {
//int number_elements = height*width; // int number_elements = height*width;
char h_over; char h_over;
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask,
d_graph_visited, d_cost, d_over; d_graph_visited, d_cost, d_over;
try{ try {
//--1 transfer data from host to device //--1 transfer data from host to device
_clInit(); _clInit();
d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes);
d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges);
d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask); d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask);
d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask); d_updating_graph_mask =
d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited); _clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited);
d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost);
d_over = _clMallocRW(sizeof(char), &h_over);
d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); _clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes);
d_over = _clMallocRW(sizeof(char), &h_over); _clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges);
_clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask);
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char),
h_updating_graph_mask);
_clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited);
_clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost);
_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); //--2 invoke kernel
_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); #ifdef PROFILING
_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask); timer kernel_timer;
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask); double kernel_time = 0.0;
_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited); kernel_timer.reset();
_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); kernel_timer.start();
//--2 invoke kernel
#ifdef PROFILING
timer kernel_timer;
double kernel_time = 0.0;
kernel_timer.reset();
kernel_timer.start();
#endif #endif
do{ do {
h_over = false; h_over = false;
_clMemcpyH2D(d_over, sizeof(char), &h_over); _clMemcpyH2D(d_over, sizeof(char), &h_over);
//--kernel 0 //--kernel 0
int kernel_id = 0; int kernel_id = 0;
int kernel_idx = 0; int kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
_clSetArgs(kernel_id, kernel_idx++, d_graph_edges); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, d_cost);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//int work_items = no_of_nodes; // int work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
//--kernel 1 //--kernel 1
kernel_id = 1; kernel_id = 1;
kernel_idx = 0; kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, d_over);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//work_items = no_of_nodes; // work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
_clMemcpyD2H(d_over,sizeof(char), &h_over); _clMemcpyD2H(d_over, sizeof(char), &h_over);
}while(h_over); } while (h_over);
_clFinish(); _clFinish();
#ifdef PROFILING #ifdef PROFILING
kernel_timer.stop(); kernel_timer.stop();
kernel_time = kernel_timer.getTimeInSeconds(); kernel_time = kernel_timer.getTimeInSeconds();
#endif #endif
//--3 transfer data from device to host //--3 transfer data from device to host
_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); _clMemcpyD2H(d_cost, no_of_nodes * sizeof(int), h_cost);
//--statistics //--statistics
#ifdef PROFILING #ifdef PROFILING
std::cout<<"kernel time(s):"<<kernel_time<<std::endl; std::cout << "kernel time(s):" << kernel_time << std::endl;
#endif #endif
//--4 release cl resources. //--4 release cl resources.
_clFree(d_graph_nodes); _clFree(d_graph_nodes);
_clFree(d_graph_edges); _clFree(d_graph_edges);
_clFree(d_graph_mask); _clFree(d_graph_mask);
_clFree(d_updating_graph_mask); _clFree(d_updating_graph_mask);
_clFree(d_graph_visited); _clFree(d_graph_visited);
_clFree(d_cost); _clFree(d_cost);
_clFree(d_over); _clFree(d_over);
_clRelease(); _clRelease();
} } catch (std::string msg) {
catch(std::string msg){ _clFree(d_graph_nodes);
_clFree(d_graph_nodes); _clFree(d_graph_edges);
_clFree(d_graph_edges); _clFree(d_graph_mask);
_clFree(d_graph_mask); _clFree(d_updating_graph_mask);
_clFree(d_updating_graph_mask); _clFree(d_graph_visited);
_clFree(d_graph_visited); _clFree(d_cost);
_clFree(d_cost); _clFree(d_over);
_clFree(d_over); _clRelease();
_clRelease(); std::string e_str = "in run_transpose_gpu -> ";
std::string e_str = "in run_transpose_gpu -> "; e_str += msg;
e_str += msg; throw(e_str);
throw(e_str); }
} return;
return ;
} }
void Usage(int argc, char**argv){
fprintf(stderr,"Usage: %s <input_file>\n", argv[0]);
}
//---------------------------------------------------------- //----------------------------------------------------------
//--cambine: main function //--cambine: main function
//--author: created by Jianbin Fang //--author: created by Jianbin Fang
//--date: 25/01/2011 //--date: 25/01/2011
//---------------------------------------------------------- //----------------------------------------------------------
int main(int argc, char * argv[]) int main(int argc, char *argv[]) {
{ printf("enter demo main\n");
int no_of_nodes;
int edge_list_size;
FILE *fp;
Node* h_graph_nodes;
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
try{
char *input_f;
if(argc!=2){
Usage(argc, argv);
exit(0);
}
input_f = argv[1]; int no_of_nodes;
printf("Reading File\n"); int edge_list_size;
//Read in Graph from a file FILE *fp;
fp = fopen(input_f,"r"); Node *h_graph_nodes;
if(!fp){ char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
printf("Error Reading graph file\n");
return 0;
}
int source = 0; try {
char *input_f = "../data/bfs/graph1MW_6.txt";
printf("Reading File\n");
// Read in Graph from a file
fp = fopen(input_f, "r");
if (!fp) {
printf("Error Reading graph file\n");
return 0;
}
fscanf(fp,"%d",&no_of_nodes); int source = 0;
int num_of_blocks = 1; fscanf(fp, "%d", &no_of_nodes);
int num_of_threads_per_block = no_of_nodes;
//Make execution Parameters according to the number of nodes int num_of_blocks = 1;
//Distribute threads across multiple Blocks if necessary int num_of_threads_per_block = no_of_nodes;
if(no_of_nodes>MAX_THREADS_PER_BLOCK){
num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK);
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
}
work_group_size = num_of_threads_per_block;
// allocate host memory
h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes);
int start, edgeno; // Make execution Parameters according to the number of nodes
// initalize the memory // Distribute threads across multiple Blocks if necessary
for(int i = 0; i < no_of_nodes; i++){ if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
fscanf(fp,"%d %d",&start,&edgeno); num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
h_graph_nodes[i].starting = start; num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
h_graph_nodes[i].no_of_edges = edgeno; }
h_graph_mask[i]=false; work_group_size = num_of_threads_per_block;
h_updating_graph_mask[i]=false; // allocate host memory
h_graph_visited[i]=false; h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
} h_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
//read the source node from the file h_updating_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
fscanf(fp,"%d",&source); h_graph_visited = (char *)malloc(sizeof(char) * no_of_nodes);
source=0;
//set the source node as true in the mask
h_graph_mask[source]=true;
h_graph_visited[source]=true;
fscanf(fp,"%d",&edge_list_size);
int id,cost;
int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);
for(int i=0; i < edge_list_size ; i++){
fscanf(fp,"%d",&id);
fscanf(fp,"%d",&cost);
h_graph_edges[i] = id;
}
if(fp) int start, edgeno;
fclose(fp); // initalize the memory
// allocate mem for the result on host side for (int i = 0; i < no_of_nodes; i++) {
int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes); fscanf(fp, "%d %d", &start, &edgeno);
int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes); h_graph_nodes[i].starting = start;
for(int i=0;i<no_of_nodes;i++){ h_graph_nodes[i].no_of_edges = edgeno;
h_cost[i]=-1; h_graph_mask[i] = false;
h_cost_ref[i] = -1; h_updating_graph_mask[i] = false;
} h_graph_visited[i] = false;
h_cost[source]=0; }
h_cost_ref[source]=0; // read the source node from the file
//--------------------------------------------------------- fscanf(fp, "%d", &source);
//--gpu entry source = 0;
run_bfs_gpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost); // set the source node as true in the mask
//--------------------------------------------------------- h_graph_mask[source] = true;
//--cpu entry h_graph_visited[source] = true;
// initalize the memory again fscanf(fp, "%d", &edge_list_size);
for(int i = 0; i < no_of_nodes; i++){ int id, cost;
h_graph_mask[i]=false; int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
h_updating_graph_mask[i]=false; for (int i = 0; i < edge_list_size; i++) {
h_graph_visited[i]=false; fscanf(fp, "%d", &id);
} fscanf(fp, "%d", &cost);
//set the source node as true in the mask h_graph_edges[i] = id;
source=0; }
h_graph_mask[source]=true;
h_graph_visited[source]=true;
run_bfs_cpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost_ref);
//---------------------------------------------------------
//--result varification
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
//release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
} if (fp)
catch(std::string msg){ fclose(fp);
std::cout<<"--cambine: exception in main ->"<<msg<<std::endl; // allocate mem for the result on host side
//release host memory int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
free(h_graph_nodes); int *h_cost_ref = (int *)malloc(sizeof(int) * no_of_nodes);
free(h_graph_mask); for (int i = 0; i < no_of_nodes; i++) {
free(h_updating_graph_mask); h_cost[i] = -1;
free(h_graph_visited); h_cost_ref[i] = -1;
} }
h_cost[source] = 0;
h_cost_ref[source] = 0;
//---------------------------------------------------------
//--gpu entry
run_bfs_gpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
//---------------------------------------------------------
//--cpu entry
// initalize the memory again
for (int i = 0; i < no_of_nodes; i++) {
h_graph_mask[i] = false;
h_updating_graph_mask[i] = false;
h_graph_visited[i] = false;
}
// set the source node as true in the mask
source = 0;
h_graph_mask[source] = true;
h_graph_visited[source] = true;
run_bfs_cpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
h_graph_mask, h_updating_graph_mask, h_graph_visited,
h_cost_ref);
//---------------------------------------------------------
//--result varification
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
// release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
return 0; } catch (std::string msg) {
std::cout << "--cambine: exception in main ->" << msg << std::endl;
// release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
}
return 0;
} }

View File

@@ -3,126 +3,99 @@
#include <iostream> #include <iostream>
class timer { class timer {
public: public:
timer(const char *name = 0); timer(const char *name = 0);
timer(const char *name, std::ostream &write_on_exit); timer(const char *name, std::ostream &write_on_exit);
~timer(); ~timer();
void start(), stop(); void start(), stop();
void reset(); void reset();
std::ostream &print(std::ostream &); std::ostream &print(std::ostream &);
double getTimeInSeconds(); double getTimeInSeconds();
private: private:
void print_time(std::ostream &, const char *which, double time) const; void print_time(std::ostream &, const char *which, double time) const;
union { union {
long long total_time; long long total_time;
struct { struct {
#if defined __PPC__ #if defined __PPC__
int high, low; int high, low;
#else #else
int low, high; int low, high;
#endif #endif
}; };
}; };
unsigned long long count; unsigned long long count;
const char *const name; const char *const name;
std::ostream *const write_on_exit; std::ostream *const write_on_exit;
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz(); static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
}; };
std::ostream &operator<<(std::ostream &, class timer &);
std::ostream &operator << (std::ostream &, class timer &); inline void timer::reset() {
total_time = 0;
count = 0;
inline void timer::reset()
{
total_time = 0;
count = 0;
} }
inline timer::timer(const char *name) : name(name), write_on_exit(0) {
inline timer::timer(const char *name) reset();
:
name(name),
write_on_exit(0)
{
reset();
} }
inline timer::timer(const char *name, std::ostream &write_on_exit) inline timer::timer(const char *name, std::ostream &write_on_exit)
: : name(name), write_on_exit(&write_on_exit) {
name(name), reset();
write_on_exit(&write_on_exit)
{
reset();
} }
inline timer::~timer() {
inline timer::~timer() if (write_on_exit != 0)
{ print(*write_on_exit);
if (write_on_exit != 0)
print(*write_on_exit);
} }
inline void timer::start() {
inline void timer::start()
{
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) #if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx; unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time -= ((unsigned long long) edx << 32) + eax; total_time -= ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) #elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
asm volatile (defined __i386 || defined __x86_64)
( asm volatile("rdtsc\n\t"
"rdtsc\n\t" "subl %%eax, %0\n\t"
"subl %%eax, %0\n\t" "sbbl %%edx, %1"
"sbbl %%edx, %1" : "+m"(low), "+m"(high)
: :
"+m" (low), "+m" (high) : "eax", "edx");
:
:
"eax", "edx"
);
#else #else
#error Compiler/Architecture not recognized #error Compiler/Architecture not recognized
#endif #endif
} }
inline void timer::stop() {
inline void timer::stop()
{
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) #if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx; unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time += ((unsigned long long) edx << 32) + eax; total_time += ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) #elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
asm volatile (defined __i386 || defined __x86_64)
( asm volatile("rdtsc\n\t"
"rdtsc\n\t" "addl %%eax, %0\n\t"
"addl %%eax, %0\n\t" "adcl %%edx, %1"
"adcl %%edx, %1" : "+m"(low), "+m"(high)
: :
"+m" (low), "+m" (high) : "eax", "edx");
:
:
"eax", "edx"
);
#endif #endif
++ count; ++count;
} }
#endif #endif

View File

@@ -1,44 +1,60 @@
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_INC_PATH = $(wildcard ../include)
POCL_LIB_PATH = $(wildcard ../lib)
VX_RT_PATH = $(wildcard ../../../runtime)
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)
RISCV_TOOL_PATH=$(wildcard ~/dev/riscv-gnu-toolchain/drops) CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
POCL_CC_PATH=$(wildcard ~/dev/pocl/drops_riscv_cc) VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
POCL_RT_PATH=$(wildcard ~/dev/pocl/drops_riscv_rt) VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_RT_PATH=$(wildcard ../../../runtime) VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
VX_SIMX_PATH=$(wildcard ../../../simX/obj_dir)
CC=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
NEWLIB_PATH=$(RISCV_TOOL_PATH)/riscv32-unknown-elf/lib
VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
VX_STR = $(VX_RT_PATH)/startup/vx_start.s
VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_FIO = $(VX_RT_PATH)/fileio/fileio.s
VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS = $(VX_STR) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS = -g -O0 -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main() CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
LIBS = -lOpenCL VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
PROJECT=kmeans PROJECT=kmeans
PROJECT=saxpy
all: $(PROJECT).dump $(PROJECT).hex all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: kernel.cl lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a kmeans_clustering.o: kmeans_clustering.c
$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc rmse.c read_input.c cluster.c kmeans_clustering.c -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf $(CC) $(CXXFLAGS) -c kmeans_clustering.c
cluster.o: cluster.c
$(CC) $(CXXFLAGS) -c cluster.c
read_input.o: read_input.c
$(CC) $(CXXFLAGS) -c read_input.c
rmse.o: rmse.c
$(CC) $(CXXFLAGS) -c rmse.c
$(PROJECT).elf: main.cc lib$(PROJECT).a read_input.o rmse.o cluster.o kmeans_clustering.o
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc read_input.o rmse.o cluster.o kmeans_clustering.o $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a read_input.o rmse.o cluster.o kmeans_clustering.o
$(CXX) $(CXXFLAGS) main.cc read_input.o rmse.o cluster.o kmeans_clustering.o $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf $(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
@@ -46,8 +62,17 @@ $(PROJECT).hex: $(PROJECT).elf
$(PROJECT).dump: $(PROJECT).elf $(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump $(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: run: $(PROJECT).hex
$(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu $(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu $(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu $(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu $(GDB) $(PROJECT).qemu
clean: clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu