minor updates
This commit is contained in:
@@ -170,7 +170,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
.NUM_INPUTS (RSP_ARB_SIZE),
|
.NUM_INPUTS (RSP_ARB_SIZE),
|
||||||
.DATAW (RSP_ARB_DATAW),
|
.DATAW (RSP_ARB_DATAW),
|
||||||
.ARBITER ("R"),
|
.ARBITER ("R"),
|
||||||
.OUT_REG (1)
|
.OUT_REG (3)
|
||||||
) rsp_arb (
|
) rsp_arb (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (commit_reset),
|
.reset (commit_reset),
|
||||||
@@ -186,7 +186,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
VX_gather_unit #(
|
VX_gather_unit #(
|
||||||
.BLOCK_SIZE (BLOCK_SIZE),
|
.BLOCK_SIZE (BLOCK_SIZE),
|
||||||
.NUM_LANES (NUM_LANES),
|
.NUM_LANES (NUM_LANES),
|
||||||
.OUT_REG (3)
|
.OUT_REG (1)
|
||||||
) gather_unit (
|
) gather_unit (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (commit_reset),
|
.reset (commit_reset),
|
||||||
|
|||||||
@@ -50,9 +50,9 @@ CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1
|
|||||||
CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2
|
CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2
|
||||||
CONFIGS_4c := -DNUM_CLUSTERS=1 -DNUM_CORES=4
|
CONFIGS_4c := -DNUM_CLUSTERS=1 -DNUM_CORES=4
|
||||||
CONFIGS_8c := -DNUM_CLUSTERS=1 -DNUM_CORES=8
|
CONFIGS_8c := -DNUM_CLUSTERS=1 -DNUM_CORES=8
|
||||||
CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16
|
CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE
|
||||||
CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16
|
CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE
|
||||||
CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16
|
CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE
|
||||||
CONFIGS += $(CONFIGS_$(NUM_CORES)c)
|
CONFIGS += $(CONFIGS_$(NUM_CORES)c)
|
||||||
|
|
||||||
# include paths
|
# include paths
|
||||||
|
|||||||
@@ -67,9 +67,9 @@ CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1
|
|||||||
CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2
|
CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2
|
||||||
CONFIGS_4c := -DNUM_CLUSTERS=1 -DNUM_CORES=4
|
CONFIGS_4c := -DNUM_CLUSTERS=1 -DNUM_CORES=4
|
||||||
CONFIGS_8c := -DNUM_CLUSTERS=1 -DNUM_CORES=8
|
CONFIGS_8c := -DNUM_CLUSTERS=1 -DNUM_CORES=8
|
||||||
CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16
|
CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE
|
||||||
CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16
|
CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE
|
||||||
CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16
|
CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE
|
||||||
CONFIGS += $(CONFIGS_$(NUM_CORES)c)
|
CONFIGS += $(CONFIGS_$(NUM_CORES)c)
|
||||||
|
|
||||||
# include paths
|
# include paths
|
||||||
|
|||||||
@@ -56,7 +56,6 @@ PROJECT = top_modules
|
|||||||
all: build
|
all: build
|
||||||
|
|
||||||
build: $(SRCS)
|
build: $(SRCS)
|
||||||
verilator --build $(VL_FLAGS) --cc VX_cache_cluster_top --top-module VX_cache_cluster_top $^ -CFLAGS '$(CXXFLAGS)'
|
|
||||||
verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)'
|
verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)'
|
||||||
verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)'
|
verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)'
|
||||||
|
|
||||||
|
|||||||
@@ -31,16 +31,16 @@ run-simx:
|
|||||||
$(MAKE) -C dotproduct run-simx
|
$(MAKE) -C dotproduct run-simx
|
||||||
$(MAKE) -C kmeans run-simx
|
$(MAKE) -C kmeans run-simx
|
||||||
$(MAKE) -C spmv run-simx
|
$(MAKE) -C spmv run-simx
|
||||||
$(MAKE) -C cutcp run-simx
|
|
||||||
$(MAKE) -C stencil run-simx
|
$(MAKE) -C stencil run-simx
|
||||||
$(MAKE) -C lbm run-simx
|
$(MAKE) -C lbm run-simx
|
||||||
$(MAKE) -C oclprintf run-simx
|
$(MAKE) -C oclprintf run-simx
|
||||||
$(MAKE) -C blackscholes run-simx
|
$(MAKE) -C blackscholes run-simx
|
||||||
$(MAKE) -C transpose run-simx
|
$(MAKE) -C transpose run-simx
|
||||||
$(MAKE) -C convolution run-simx
|
$(MAKE) -C convolution run-simx
|
||||||
# $(MAKE) -C matmul run-simx
|
$(MAKE) -C cutcp run-simx
|
||||||
# $(MAKE) -C vectorhypot run-simx
|
$(MAKE) -C matmul run-simx
|
||||||
# $(MAKE) -C mri-q run-simx
|
$(MAKE) -C vectorhypot run-simx
|
||||||
|
$(MAKE) -C mri-q run-simx
|
||||||
|
|
||||||
run-rtlsim:
|
run-rtlsim:
|
||||||
$(MAKE) -C vecadd run-rtlsim
|
$(MAKE) -C vecadd run-rtlsim
|
||||||
@@ -54,12 +54,12 @@ run-rtlsim:
|
|||||||
$(MAKE) -C kmeans run-rtlsim
|
$(MAKE) -C kmeans run-rtlsim
|
||||||
$(MAKE) -C spmv run-rtlsim
|
$(MAKE) -C spmv run-rtlsim
|
||||||
$(MAKE) -C transpose run-rtlsim
|
$(MAKE) -C transpose run-rtlsim
|
||||||
$(MAKE) -C cutcp run-rtlsim
|
|
||||||
$(MAKE) -C stencil run-rtlsim
|
$(MAKE) -C stencil run-rtlsim
|
||||||
$(MAKE) -C lbm run-rtlsim
|
$(MAKE) -C lbm run-rtlsim
|
||||||
$(MAKE) -C oclprintf run-rtlsim
|
$(MAKE) -C oclprintf run-rtlsim
|
||||||
$(MAKE) -C blackscholes run-rtlsim
|
$(MAKE) -C blackscholes run-rtlsim
|
||||||
$(MAKE) -C convolution run-rtlsim
|
$(MAKE) -C convolution run-rtlsim
|
||||||
|
# $(MAKE) -C cutcp run-rtlsim
|
||||||
# $(MAKE) -C matmul run-rtlsim
|
# $(MAKE) -C matmul run-rtlsim
|
||||||
# $(MAKE) -C vectorhypot run-rtlsim
|
# $(MAKE) -C vectorhypot run-rtlsim
|
||||||
# $(MAKE) -C mri-q run-rtlsim
|
# $(MAKE) -C mri-q run-rtlsim
|
||||||
@@ -76,12 +76,12 @@ run-opae:
|
|||||||
$(MAKE) -C kmeans run-opae
|
$(MAKE) -C kmeans run-opae
|
||||||
$(MAKE) -C spmv run-opae
|
$(MAKE) -C spmv run-opae
|
||||||
$(MAKE) -C transpose run-opae
|
$(MAKE) -C transpose run-opae
|
||||||
$(MAKE) -C cutcp run-opae
|
|
||||||
$(MAKE) -C stencil run-opae
|
$(MAKE) -C stencil run-opae
|
||||||
$(MAKE) -C lbm run-opae
|
$(MAKE) -C lbm run-opae
|
||||||
$(MAKE) -C oclprintf run-opae
|
$(MAKE) -C oclprintf run-opae
|
||||||
$(MAKE) -C blackscholes run-opae
|
$(MAKE) -C blackscholes run-opae
|
||||||
$(MAKE) -C convolution run-opae
|
$(MAKE) -C convolution run-opae
|
||||||
|
# $(MAKE) -C cutcp run-opae
|
||||||
# $(MAKE) -C matmul run-opae
|
# $(MAKE) -C matmul run-opae
|
||||||
# $(MAKE) -C vectorhypot run-opae
|
# $(MAKE) -C vectorhypot run-opae
|
||||||
# $(MAKE) -C mri-q run-opae
|
# $(MAKE) -C mri-q run-opae
|
||||||
|
|||||||
@@ -173,14 +173,10 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
|
|||||||
|
|
||||||
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
||||||
|
|
||||||
printf("OK+\n");
|
|
||||||
|
|
||||||
// Setup DEVICE datastructures
|
// Setup DEVICE datastructures
|
||||||
OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
|
OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
|
||||||
OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
|
OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
|
||||||
|
|
||||||
printf("OK-\n");
|
|
||||||
|
|
||||||
// Initialize DEVICE datastructures
|
// Initialize DEVICE datastructures
|
||||||
OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
|
OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
|
||||||
OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
|
OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
|
||||||
|
|||||||
@@ -91,15 +91,11 @@ int coo_to_jds(char *mtx_filename, int pad_rows, int warp_size, int pack_size,
|
|||||||
if ((f = fopen(mtx_filename, "r")) == NULL)
|
if ((f = fopen(mtx_filename, "r")) == NULL)
|
||||||
exit(1);
|
exit(1);
|
||||||
|
|
||||||
printf("OK**\n");
|
|
||||||
|
|
||||||
if (mm_read_banner(f, &matcode) != 0) {
|
if (mm_read_banner(f, &matcode) != 0) {
|
||||||
printf("Could not process Matrix Market banner.\n");
|
printf("Could not process Matrix Market banner.\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("OK**\n");
|
|
||||||
|
|
||||||
/* This is how one can screen matrix types if their application */
|
/* This is how one can screen matrix types if their application */
|
||||||
/* only supports a subset of the Matrix Market data types. */
|
/* only supports a subset of the Matrix Market data types. */
|
||||||
|
|
||||||
|
|||||||
@@ -148,7 +148,6 @@ int main(int argc, char **argv) {
|
|||||||
// &h_data, &h_indices, &h_ptr,
|
// &h_data, &h_indices, &h_ptr,
|
||||||
// &h_perm, &h_nzcnt);
|
// &h_perm, &h_nzcnt);
|
||||||
int col_count;
|
int col_count;
|
||||||
printf("OK--\n");
|
|
||||||
coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
|
coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
|
||||||
1, // row padding
|
1, // row padding
|
||||||
pad, // warp size
|
pad, // warp size
|
||||||
@@ -159,8 +158,6 @@ int main(int argc, char **argv) {
|
|||||||
&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
|
&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
|
||||||
&len, &nzcnt_len, &depth);
|
&len, &nzcnt_len, &depth);
|
||||||
|
|
||||||
printf("OK++\n");
|
|
||||||
|
|
||||||
// pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
// pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||||
h_Ax_vector = (float *)malloc(sizeof(float) * dim);
|
h_Ax_vector = (float *)malloc(sizeof(float) * dim);
|
||||||
h_x_vector = (float *)malloc(sizeof(float) * dim);
|
h_x_vector = (float *)malloc(sizeof(float) * dim);
|
||||||
|
|||||||
@@ -159,8 +159,6 @@ int main(int argc, char** argv) {
|
|||||||
cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
|
cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
|
||||||
CHECK_ERROR("clCreateKernel")
|
CHECK_ERROR("clCreateKernel")
|
||||||
|
|
||||||
printf("OK+\n");
|
|
||||||
|
|
||||||
//host data
|
//host data
|
||||||
float *h_A0;
|
float *h_A0;
|
||||||
float *h_Anext;
|
float *h_Anext;
|
||||||
@@ -177,16 +175,12 @@ int main(int argc, char** argv) {
|
|||||||
h_Anext=(float*)malloc(sizeof(float)*size);
|
h_Anext=(float*)malloc(sizeof(float)*size);
|
||||||
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||||
//FILE *fp = fopen(parameters->inpFiles[0], "rb");
|
//FILE *fp = fopen(parameters->inpFiles[0], "rb");
|
||||||
printf("OK+\n");
|
|
||||||
read_data(h_A0, nx,ny,nz,NULL);
|
read_data(h_A0, nx,ny,nz,NULL);
|
||||||
printf("OK+\n");
|
//fclose(fp);
|
||||||
//fclose(fp);
|
memcpy (h_Anext,h_A0,sizeof(float)*size);
|
||||||
memcpy (h_Anext,h_A0,sizeof(float)*size);
|
|
||||||
|
|
||||||
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
||||||
|
|
||||||
printf("OK+\n");
|
|
||||||
|
|
||||||
//memory allocation
|
//memory allocation
|
||||||
d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
|
d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
|
||||||
CHECK_ERROR("clCreateBuffer")
|
CHECK_ERROR("clCreateBuffer")
|
||||||
@@ -201,18 +195,16 @@ int main(int argc, char** argv) {
|
|||||||
|
|
||||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||||
|
|
||||||
printf("OK+\n");
|
|
||||||
|
|
||||||
//only use 1D thread block
|
//only use 1D thread block
|
||||||
int tx = 128;
|
int tx = 128;
|
||||||
size_t block[3] = {tx,1,1};
|
size_t block[3] = {tx,1,1};
|
||||||
size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
|
size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
|
||||||
//size_t grid[3] = {nx-2,ny-2,nz-2};
|
//size_t grid[3] = {nx-2,ny-2,nz-2};
|
||||||
size_t offset[3] = {1,1,1};
|
size_t offset[3] = {1,1,1};
|
||||||
printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]);
|
printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]);
|
||||||
printf("block size in x/y/z = %d %d %d\n",block[0],block[1],block[2]);
|
printf("block size in x/y/z = %d %d %d\n",block[0],block[1],block[2]);
|
||||||
|
|
||||||
printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2]));
|
printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2]));
|
||||||
|
|
||||||
clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
|
clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
|
||||||
clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
|
clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
|
||||||
@@ -226,14 +218,10 @@ int main(int argc, char** argv) {
|
|||||||
//main execution
|
//main execution
|
||||||
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
|
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
|
||||||
|
|
||||||
printf("OK+0\n");
|
|
||||||
|
|
||||||
int t;
|
int t;
|
||||||
for(t=0;t<iteration;t++)
|
for(t=0;t<iteration;t++)
|
||||||
{
|
{
|
||||||
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
|
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
|
||||||
printf("OK+0\n");
|
|
||||||
|
|
||||||
//printf("iteration %d\n",t)
|
//printf("iteration %d\n",t)
|
||||||
CHECK_ERROR("clEnqueueNDRangeKernel")
|
CHECK_ERROR("clEnqueueNDRangeKernel")
|
||||||
|
|
||||||
@@ -244,11 +232,9 @@ int main(int argc, char** argv) {
|
|||||||
clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
|
clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("OK+1\n");
|
cl_mem d_temp = d_A0;
|
||||||
|
d_A0 = d_Anext;
|
||||||
cl_mem d_temp = d_A0;
|
d_Anext = d_temp;
|
||||||
d_A0 = d_Anext;
|
|
||||||
d_Anext = d_temp;
|
|
||||||
|
|
||||||
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
||||||
clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
|
clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
|
||||||
@@ -261,8 +247,6 @@ int main(int argc, char** argv) {
|
|||||||
clStatus = clReleaseContext(clContext);
|
clStatus = clReleaseContext(clContext);
|
||||||
CHECK_ERROR("clReleaseContext")
|
CHECK_ERROR("clReleaseContext")
|
||||||
|
|
||||||
printf("OK+2\n");
|
|
||||||
|
|
||||||
if (parameters->outFile) {
|
if (parameters->outFile) {
|
||||||
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||||
//outputData(parameters->outFile,h_Anext,nx,ny,nz);
|
//outputData(parameters->outFile,h_Anext,nx,ny,nz);
|
||||||
|
|||||||
Reference in New Issue
Block a user