add dasp_spmv3

This commit is contained in:
abnerhexu
2026-01-18 21:46:36 +08:00
parent fff9f18287
commit 5fe43bcd7a
4 changed files with 1531 additions and 0 deletions

View File

@@ -15,10 +15,14 @@ OPTIONS = -Xcompiler -fopenmp-simd
double:
$(CC) $(NVCC_FLAGS) src/main_f64.cu -o spmv_double -D f64 $(OPTIONS) $(LIBS)
double3:
$(CC) $(NVCC_FLAGS) src/main_spmv3_f64.cu src/dasp_spmv3.cu -o spmv_double3 -Isrc -D f64 $(OPTIONS) $(LIBS)
half:
$(CC) $(NVCC_FLAGS) src/main_f16.cu -o spmv_half $(OPTIONS) $(LIBS)
clean:
rm -rf spmv_double
rm -rf spmv_double3
rm -rf spmv_half
rm data/*.csv

1309
DASP/src/dasp_spmv3.cu Normal file

File diff suppressed because it is too large Load Diff

50
DASP/src/dasp_spmv3.h Normal file
View File

@@ -0,0 +1,50 @@
#ifndef DASP_SPMV3_H
#define DASP_SPMV3_H
#include "common.h"
struct DASPSparseMatrix {
// === 1. Long Part ===
MAT_VAL_TYPE *dlongA_val;
int *dlongA_cid;
MAT_VAL_TYPE *dwarp_val;
MAT_PTR_TYPE *dlongA_rpt;
int row_long;
// === 2. Regular Block Part ===
MAT_VAL_TYPE *dregA_val;
int *dregA_cid;
MAT_PTR_TYPE *dblockA_ptr;
int row_block;
int blocknum;
// === 3. Irregular Part ===
MAT_VAL_TYPE *dirregA_val;
int *dirregA_cid;
MAT_PTR_TYPE *dirregA_rpt;
// === 4. Short Part ===
MAT_VAL_TYPE *dshort_val;
int *dshort_cid;
int short_row_1;
int common_13;
int short_row_34;
int short_row_2;
MAT_PTR_TYPE fill0_nnz_short13;
MAT_PTR_TYPE fill0_nnz_short34;
// === 5. Scheduling Offsets ===
int offset_reg;
int offset_short1;
int offset_short13;
int offset_short34;
int offset_short22;
};
template <int rowloop>
__global__ void dasp_spmv3(MAT_VAL_TYPE *dX_val, MAT_VAL_TYPE *dY_val, DASPSparseMatrix *A);
__host__ void spmv_all3(char *filename, MAT_VAL_TYPE *csrValA, MAT_PTR_TYPE *csrRowPtrA, int *csrColIdxA,
MAT_VAL_TYPE *X_val, MAT_VAL_TYPE *Y_val, int *order_rid, int rowA, int colA, MAT_PTR_TYPE nnzA, int NUM, double threshold, int block_longest);
#endif

168
DASP/src/main_spmv3_f64.cu Normal file
View File

@@ -0,0 +1,168 @@
#include "dasp_spmv3.h"
int verify_new(MAT_VAL_TYPE *cusp_val, MAT_VAL_TYPE *cuda_val, int *new_order, int length)
{
for (int i = 0; i < length; i ++)
{
int cusp_idx = new_order[i];
if (fabs(cusp_val[cusp_idx] - cuda_val[i]) > 1e-5)
{
printf("error in (%d), cusp(%4.2f), cuda(%4.2f),please check your code!\n", i, cusp_val[cusp_idx], cuda_val[i]);
return -1;
}
}
printf("Y(%d), compute succeed!\n", length);
return 0;
}
__host__
void cusparse_spmv_all(MAT_VAL_TYPE *cu_ValA, MAT_PTR_TYPE *cu_RowPtrA, int *cu_ColIdxA,
MAT_VAL_TYPE *cu_ValX, MAT_VAL_TYPE *cu_ValY, int rowA, int colA, MAT_PTR_TYPE nnzA,
long long int data_origin1, long long int data_origin2, double *cu_time, double *cu_gflops, double *cu_bandwidth1, double *cu_bandwidth2, double *cu_pre)
{
struct timeval t1, t2;
MAT_VAL_TYPE *dA_val, *dX, *dY;
int *dA_cid;
MAT_PTR_TYPE *dA_rpt;
MAT_VAL_TYPE alpha = 1.0, beta = 0.0;
cudaMalloc((void **)&dA_val, sizeof(MAT_VAL_TYPE) * nnzA);
cudaMalloc((void **)&dA_cid, sizeof(int) * nnzA);
cudaMalloc((void **)&dA_rpt, sizeof(MAT_PTR_TYPE) * (rowA + 1));
cudaMalloc((void **)&dX, sizeof(MAT_VAL_TYPE) * colA);
cudaMalloc((void **)&dY, sizeof(MAT_VAL_TYPE) * rowA);
cudaMemcpy(dA_val, cu_ValA, sizeof(MAT_VAL_TYPE) * nnzA, cudaMemcpyHostToDevice);
cudaMemcpy(dA_cid, cu_ColIdxA, sizeof(int) * nnzA, cudaMemcpyHostToDevice);
cudaMemcpy(dA_rpt, cu_RowPtrA, sizeof(MAT_PTR_TYPE) * (rowA + 1), cudaMemcpyHostToDevice);
cudaMemcpy(dX, cu_ValX, sizeof(MAT_VAL_TYPE) * colA, cudaMemcpyHostToDevice);
cudaMemset(dY, 0.0, sizeof(MAT_VAL_TYPE) * rowA);
cusparseHandle_t handle = NULL;
cusparseSpMatDescr_t matA;
cusparseDnVecDescr_t vecX, vecY;
void* dBuffer = NULL;
size_t bufferSize = 0;
gettimeofday(&t1, NULL);
cusparseCreate(&handle);
cusparseCreateCsr(&matA, rowA, colA, nnzA, dA_rpt, dA_cid, dA_val,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
cusparseCreateDnVec(&vecX, colA, dX, CUDA_R_64F);
cusparseCreateDnVec(&vecY, rowA, dY, CUDA_R_64F);
cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
cudaMalloc(&dBuffer, bufferSize);
// cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
double cusparse_pre = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
// printf("cusparse preprocessing time: %8.4lf ms\n", cusparse_pre);
*cu_pre = cusparse_pre;
for (int i = 0; i < 100; ++i)
{
cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
}
cudaDeviceSynchronize();
gettimeofday(&t1, NULL);
for (int i = 0; i < 1000; ++i)
{
cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
}
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
*cu_time = ((t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0) / 1000;
*cu_gflops = (double)((long)nnzA * 2) / (*cu_time * 1e6);
*cu_bandwidth1 = (double)data_origin1 / (*cu_time * 1e6);
*cu_bandwidth2 = (double)data_origin2 / (*cu_time * 1e6);
printf("cusparse:%8.4lf ms, %8.4lf Gflop/s, %9.4lf GB/s, %9.4lf GB/s\n", *cu_time, *cu_gflops, *cu_bandwidth1, *cu_bandwidth2);
cusparseDestroySpMat(matA);
cusparseDestroyDnVec(vecX);
cusparseDestroyDnVec(vecY);
cusparseDestroy(handle);
cudaMemcpy(cu_ValY, dY, sizeof(MAT_VAL_TYPE) * rowA, cudaMemcpyDeviceToHost);
cudaFree(dA_val);
cudaFree(dA_cid);
cudaFree(dA_rpt);
cudaFree(dX);
cudaFree(dY);
}
__host__
int main(int argc, char **argv)
{
if (argc < 2)
{
printf("Run the code by './spmv_double3 matrix.mtx'. \n");
return 0;
}
// struct timeval t1, t2;
int rowA, colA;
MAT_PTR_TYPE nnzA;
int isSymmetricA;
MAT_VAL_TYPE *csrValA;
int *csrColIdxA;
MAT_PTR_TYPE *csrRowPtrA;
char *filename;
filename = argv[1];
// int NUM = atoi(argv[2]);
// int block_longest = atoi(argv[3]);
int NUM = 4;
int block_longest = 256;
double threshold = 0.75;
printf("\n===%s===\n\n", filename);
mmio_allinone(&rowA, &colA, &nnzA, &isSymmetricA, &csrRowPtrA, &csrColIdxA, &csrValA, filename);
MAT_VAL_TYPE *X_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * colA);
initVec(X_val, colA);
initVec(csrValA, nnzA);
printf("INIT DONE\n");
MAT_VAL_TYPE *dY_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
MAT_VAL_TYPE *Y_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
int *new_order = (int *)malloc(sizeof(int) * rowA);
// int warmup = 3, pre_num = 10;
double cu_time = 0, cu_gflops = 0, cu_bandwidth1 = 0, cu_bandwidth2 = 0, cu_pre = 0;
long long int data_origin1 = (nnzA + colA + rowA) * sizeof(MAT_VAL_TYPE) + nnzA * sizeof(int) + (rowA + 1) * sizeof(MAT_PTR_TYPE);
long long int data_origin2 = (nnzA + nnzA + rowA) * sizeof(MAT_VAL_TYPE) + nnzA * sizeof(int) + (rowA + 1) * sizeof(MAT_PTR_TYPE);
cusparse_spmv_all(csrValA, csrRowPtrA, csrColIdxA, X_val, dY_val, rowA, colA, nnzA, data_origin1, data_origin2, &cu_time, &cu_gflops, &cu_bandwidth1, &cu_bandwidth2, &cu_pre);
// double dasp_pre = 0;
spmv_all3(filename, csrValA, csrRowPtrA, csrColIdxA, X_val, Y_val, new_order, rowA, colA, nnzA, NUM, threshold, block_longest);
FILE* fout;
fout = fopen("data/spmv_f64_record.csv", "a");
fprintf(fout, "%lld,%lf,%lf,%lf,%lf\n", data_origin1, cu_time, cu_gflops, cu_bandwidth1, cu_bandwidth2);
fclose(fout);
/* verify the result with cusparse */
// int result = verify_new(dY_val, Y_val, new_order, rowA);
free(X_val);
free(Y_val);
free(dY_val);
free(csrColIdxA);
free(csrRowPtrA);
free(csrValA);
free(new_order);
return 0;
}