add dasp_spmv3
This commit is contained in:
@@ -15,10 +15,14 @@ OPTIONS = -Xcompiler -fopenmp-simd
|
||||
double:
|
||||
$(CC) $(NVCC_FLAGS) src/main_f64.cu -o spmv_double -D f64 $(OPTIONS) $(LIBS)
|
||||
|
||||
double3:
|
||||
$(CC) $(NVCC_FLAGS) src/main_spmv3_f64.cu src/dasp_spmv3.cu -o spmv_double3 -Isrc -D f64 $(OPTIONS) $(LIBS)
|
||||
|
||||
half:
|
||||
$(CC) $(NVCC_FLAGS) src/main_f16.cu -o spmv_half $(OPTIONS) $(LIBS)
|
||||
|
||||
clean:
|
||||
rm -rf spmv_double
|
||||
rm -rf spmv_double3
|
||||
rm -rf spmv_half
|
||||
rm data/*.csv
|
||||
1309
DASP/src/dasp_spmv3.cu
Normal file
1309
DASP/src/dasp_spmv3.cu
Normal file
File diff suppressed because it is too large
Load Diff
50
DASP/src/dasp_spmv3.h
Normal file
50
DASP/src/dasp_spmv3.h
Normal file
@@ -0,0 +1,50 @@
|
||||
#ifndef DASP_SPMV3_H
|
||||
#define DASP_SPMV3_H
|
||||
|
||||
#include "common.h"
|
||||
|
||||
struct DASPSparseMatrix {
|
||||
// === 1. Long Part ===
|
||||
MAT_VAL_TYPE *dlongA_val;
|
||||
int *dlongA_cid;
|
||||
MAT_VAL_TYPE *dwarp_val;
|
||||
MAT_PTR_TYPE *dlongA_rpt;
|
||||
int row_long;
|
||||
|
||||
// === 2. Regular Block Part ===
|
||||
MAT_VAL_TYPE *dregA_val;
|
||||
int *dregA_cid;
|
||||
MAT_PTR_TYPE *dblockA_ptr;
|
||||
int row_block;
|
||||
int blocknum;
|
||||
|
||||
// === 3. Irregular Part ===
|
||||
MAT_VAL_TYPE *dirregA_val;
|
||||
int *dirregA_cid;
|
||||
MAT_PTR_TYPE *dirregA_rpt;
|
||||
|
||||
// === 4. Short Part ===
|
||||
MAT_VAL_TYPE *dshort_val;
|
||||
int *dshort_cid;
|
||||
int short_row_1;
|
||||
int common_13;
|
||||
int short_row_34;
|
||||
int short_row_2;
|
||||
MAT_PTR_TYPE fill0_nnz_short13;
|
||||
MAT_PTR_TYPE fill0_nnz_short34;
|
||||
|
||||
// === 5. Scheduling Offsets ===
|
||||
int offset_reg;
|
||||
int offset_short1;
|
||||
int offset_short13;
|
||||
int offset_short34;
|
||||
int offset_short22;
|
||||
};
|
||||
|
||||
template <int rowloop>
|
||||
__global__ void dasp_spmv3(MAT_VAL_TYPE *dX_val, MAT_VAL_TYPE *dY_val, DASPSparseMatrix *A);
|
||||
|
||||
__host__ void spmv_all3(char *filename, MAT_VAL_TYPE *csrValA, MAT_PTR_TYPE *csrRowPtrA, int *csrColIdxA,
|
||||
MAT_VAL_TYPE *X_val, MAT_VAL_TYPE *Y_val, int *order_rid, int rowA, int colA, MAT_PTR_TYPE nnzA, int NUM, double threshold, int block_longest);
|
||||
|
||||
#endif
|
||||
168
DASP/src/main_spmv3_f64.cu
Normal file
168
DASP/src/main_spmv3_f64.cu
Normal file
@@ -0,0 +1,168 @@
|
||||
#include "dasp_spmv3.h"
|
||||
|
||||
int verify_new(MAT_VAL_TYPE *cusp_val, MAT_VAL_TYPE *cuda_val, int *new_order, int length)
|
||||
{
|
||||
for (int i = 0; i < length; i ++)
|
||||
{
|
||||
int cusp_idx = new_order[i];
|
||||
if (fabs(cusp_val[cusp_idx] - cuda_val[i]) > 1e-5)
|
||||
{
|
||||
printf("error in (%d), cusp(%4.2f), cuda(%4.2f),please check your code!\n", i, cusp_val[cusp_idx], cuda_val[i]);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
printf("Y(%d), compute succeed!\n", length);
|
||||
return 0;
|
||||
}
|
||||
|
||||
__host__
|
||||
void cusparse_spmv_all(MAT_VAL_TYPE *cu_ValA, MAT_PTR_TYPE *cu_RowPtrA, int *cu_ColIdxA,
|
||||
MAT_VAL_TYPE *cu_ValX, MAT_VAL_TYPE *cu_ValY, int rowA, int colA, MAT_PTR_TYPE nnzA,
|
||||
long long int data_origin1, long long int data_origin2, double *cu_time, double *cu_gflops, double *cu_bandwidth1, double *cu_bandwidth2, double *cu_pre)
|
||||
{
|
||||
struct timeval t1, t2;
|
||||
|
||||
MAT_VAL_TYPE *dA_val, *dX, *dY;
|
||||
int *dA_cid;
|
||||
MAT_PTR_TYPE *dA_rpt;
|
||||
MAT_VAL_TYPE alpha = 1.0, beta = 0.0;
|
||||
|
||||
cudaMalloc((void **)&dA_val, sizeof(MAT_VAL_TYPE) * nnzA);
|
||||
cudaMalloc((void **)&dA_cid, sizeof(int) * nnzA);
|
||||
cudaMalloc((void **)&dA_rpt, sizeof(MAT_PTR_TYPE) * (rowA + 1));
|
||||
cudaMalloc((void **)&dX, sizeof(MAT_VAL_TYPE) * colA);
|
||||
cudaMalloc((void **)&dY, sizeof(MAT_VAL_TYPE) * rowA);
|
||||
|
||||
cudaMemcpy(dA_val, cu_ValA, sizeof(MAT_VAL_TYPE) * nnzA, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(dA_cid, cu_ColIdxA, sizeof(int) * nnzA, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(dA_rpt, cu_RowPtrA, sizeof(MAT_PTR_TYPE) * (rowA + 1), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(dX, cu_ValX, sizeof(MAT_VAL_TYPE) * colA, cudaMemcpyHostToDevice);
|
||||
cudaMemset(dY, 0.0, sizeof(MAT_VAL_TYPE) * rowA);
|
||||
|
||||
cusparseHandle_t handle = NULL;
|
||||
cusparseSpMatDescr_t matA;
|
||||
cusparseDnVecDescr_t vecX, vecY;
|
||||
void* dBuffer = NULL;
|
||||
size_t bufferSize = 0;
|
||||
|
||||
gettimeofday(&t1, NULL);
|
||||
cusparseCreate(&handle);
|
||||
cusparseCreateCsr(&matA, rowA, colA, nnzA, dA_rpt, dA_cid, dA_val,
|
||||
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
|
||||
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
|
||||
cusparseCreateDnVec(&vecX, colA, dX, CUDA_R_64F);
|
||||
cusparseCreateDnVec(&vecY, rowA, dY, CUDA_R_64F);
|
||||
cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
&alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
|
||||
CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
|
||||
cudaMalloc(&dBuffer, bufferSize);
|
||||
// cudaDeviceSynchronize();
|
||||
gettimeofday(&t2, NULL);
|
||||
double cusparse_pre = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
|
||||
// printf("cusparse preprocessing time: %8.4lf ms\n", cusparse_pre);
|
||||
*cu_pre = cusparse_pre;
|
||||
|
||||
for (int i = 0; i < 100; ++i)
|
||||
{
|
||||
cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
&alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
|
||||
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
gettimeofday(&t1, NULL);
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
{
|
||||
cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
&alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
|
||||
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
gettimeofday(&t2, NULL);
|
||||
*cu_time = ((t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0) / 1000;
|
||||
*cu_gflops = (double)((long)nnzA * 2) / (*cu_time * 1e6);
|
||||
*cu_bandwidth1 = (double)data_origin1 / (*cu_time * 1e6);
|
||||
*cu_bandwidth2 = (double)data_origin2 / (*cu_time * 1e6);
|
||||
printf("cusparse:%8.4lf ms, %8.4lf Gflop/s, %9.4lf GB/s, %9.4lf GB/s\n", *cu_time, *cu_gflops, *cu_bandwidth1, *cu_bandwidth2);
|
||||
|
||||
cusparseDestroySpMat(matA);
|
||||
cusparseDestroyDnVec(vecX);
|
||||
cusparseDestroyDnVec(vecY);
|
||||
cusparseDestroy(handle);
|
||||
|
||||
cudaMemcpy(cu_ValY, dY, sizeof(MAT_VAL_TYPE) * rowA, cudaMemcpyDeviceToHost);
|
||||
|
||||
cudaFree(dA_val);
|
||||
cudaFree(dA_cid);
|
||||
cudaFree(dA_rpt);
|
||||
cudaFree(dX);
|
||||
cudaFree(dY);
|
||||
}
|
||||
|
||||
__host__
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc < 2)
|
||||
{
|
||||
printf("Run the code by './spmv_double3 matrix.mtx'. \n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// struct timeval t1, t2;
|
||||
int rowA, colA;
|
||||
MAT_PTR_TYPE nnzA;
|
||||
int isSymmetricA;
|
||||
MAT_VAL_TYPE *csrValA;
|
||||
int *csrColIdxA;
|
||||
MAT_PTR_TYPE *csrRowPtrA;
|
||||
|
||||
char *filename;
|
||||
filename = argv[1];
|
||||
// int NUM = atoi(argv[2]);
|
||||
// int block_longest = atoi(argv[3]);
|
||||
int NUM = 4;
|
||||
int block_longest = 256;
|
||||
double threshold = 0.75;
|
||||
|
||||
printf("\n===%s===\n\n", filename);
|
||||
|
||||
mmio_allinone(&rowA, &colA, &nnzA, &isSymmetricA, &csrRowPtrA, &csrColIdxA, &csrValA, filename);
|
||||
MAT_VAL_TYPE *X_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * colA);
|
||||
initVec(X_val, colA);
|
||||
initVec(csrValA, nnzA);
|
||||
|
||||
printf("INIT DONE\n");
|
||||
|
||||
MAT_VAL_TYPE *dY_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
|
||||
MAT_VAL_TYPE *Y_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
|
||||
int *new_order = (int *)malloc(sizeof(int) * rowA);
|
||||
|
||||
// int warmup = 3, pre_num = 10;
|
||||
|
||||
double cu_time = 0, cu_gflops = 0, cu_bandwidth1 = 0, cu_bandwidth2 = 0, cu_pre = 0;
|
||||
long long int data_origin1 = (nnzA + colA + rowA) * sizeof(MAT_VAL_TYPE) + nnzA * sizeof(int) + (rowA + 1) * sizeof(MAT_PTR_TYPE);
|
||||
long long int data_origin2 = (nnzA + nnzA + rowA) * sizeof(MAT_VAL_TYPE) + nnzA * sizeof(int) + (rowA + 1) * sizeof(MAT_PTR_TYPE);
|
||||
|
||||
cusparse_spmv_all(csrValA, csrRowPtrA, csrColIdxA, X_val, dY_val, rowA, colA, nnzA, data_origin1, data_origin2, &cu_time, &cu_gflops, &cu_bandwidth1, &cu_bandwidth2, &cu_pre);
|
||||
|
||||
// double dasp_pre = 0;
|
||||
spmv_all3(filename, csrValA, csrRowPtrA, csrColIdxA, X_val, Y_val, new_order, rowA, colA, nnzA, NUM, threshold, block_longest);
|
||||
|
||||
FILE* fout;
|
||||
fout = fopen("data/spmv_f64_record.csv", "a");
|
||||
fprintf(fout, "%lld,%lf,%lf,%lf,%lf\n", data_origin1, cu_time, cu_gflops, cu_bandwidth1, cu_bandwidth2);
|
||||
fclose(fout);
|
||||
|
||||
/* verify the result with cusparse */
|
||||
// int result = verify_new(dY_val, Y_val, new_order, rowA);
|
||||
|
||||
free(X_val);
|
||||
free(Y_val);
|
||||
free(dY_val);
|
||||
free(csrColIdxA);
|
||||
free(csrRowPtrA);
|
||||
free(csrValA);
|
||||
free(new_order);
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user