add dasp_spmv3

2026-01-18 21:46:36 +08:00
parent fff9f18287
commit 5fe43bcd7a
4 changed files with 1531 additions and 0 deletions
--- a/DASP/Makefile
+++ b/DASP/Makefile
@@ -15,10 +15,14 @@ OPTIONS = -Xcompiler -fopenmp-simd
 double:
 	$(CC) $(NVCC_FLAGS) src/main_f64.cu -o spmv_double  -D f64 $(OPTIONS) $(LIBS) 

+double3:
+	$(CC) $(NVCC_FLAGS) src/main_spmv3_f64.cu src/dasp_spmv3.cu -o spmv_double3 -Isrc -D f64 $(OPTIONS) $(LIBS)
+
 half:
 	$(CC) $(NVCC_FLAGS) src/main_f16.cu -o spmv_half $(OPTIONS) $(LIBS) 

 clean:
 	rm -rf spmv_double
+	rm -rf spmv_double3
 	rm -rf spmv_half
 	rm data/*.csv
--- a/DASP/src/dasp_spmv3.cu
+++ b/DASP/src/dasp_spmv3.cu
--- a/DASP/src/dasp_spmv3.h
+++ b/DASP/src/dasp_spmv3.h
@@ -0,0 +1,50 @@
+#ifndef DASP_SPMV3_H
+#define DASP_SPMV3_H
+
+#include "common.h"
+
+struct DASPSparseMatrix {
+    // === 1. Long Part ===
+    MAT_VAL_TYPE *dlongA_val;
+    int          *dlongA_cid;
+    MAT_VAL_TYPE *dwarp_val;
+    MAT_PTR_TYPE *dlongA_rpt;
+    int           row_long;
+
+    // === 2. Regular Block Part ===
+    MAT_VAL_TYPE *dregA_val;
+    int          *dregA_cid;
+    MAT_PTR_TYPE *dblockA_ptr;
+    int           row_block;
+    int           blocknum;
+
+    // === 3. Irregular Part ===
+    MAT_VAL_TYPE *dirregA_val;
+    int          *dirregA_cid;
+    MAT_PTR_TYPE *dirregA_rpt;
+
+    // === 4. Short Part ===
+    MAT_VAL_TYPE *dshort_val;
+    int          *dshort_cid;
+    int           short_row_1;
+    int           common_13;
+    int           short_row_34;
+    int           short_row_2;
+    MAT_PTR_TYPE  fill0_nnz_short13;
+    MAT_PTR_TYPE  fill0_nnz_short34;
+
+    // === 5. Scheduling Offsets ===
+    int offset_reg;
+    int offset_short1;
+    int offset_short13;
+    int offset_short34;
+    int offset_short22;
+};
+
+template <int rowloop>
+__global__ void dasp_spmv3(MAT_VAL_TYPE *dX_val, MAT_VAL_TYPE *dY_val, DASPSparseMatrix *A);
+
+__host__ void spmv_all3(char *filename, MAT_VAL_TYPE *csrValA, MAT_PTR_TYPE *csrRowPtrA, int *csrColIdxA, 
+    MAT_VAL_TYPE *X_val, MAT_VAL_TYPE *Y_val, int *order_rid, int rowA, int colA, MAT_PTR_TYPE nnzA, int NUM, double threshold, int block_longest);
+
+#endif
--- a/DASP/src/main_spmv3_f64.cu
+++ b/DASP/src/main_spmv3_f64.cu
@@ -0,0 +1,168 @@
+#include "dasp_spmv3.h"
+
+int verify_new(MAT_VAL_TYPE *cusp_val, MAT_VAL_TYPE *cuda_val, int *new_order, int length)
+{
+    for (int i = 0; i < length; i ++)
+    {
+        int cusp_idx = new_order[i];
+        if (fabs(cusp_val[cusp_idx] - cuda_val[i]) > 1e-5)
+        {
+            printf("error in (%d), cusp(%4.2f), cuda(%4.2f),please check your code!\n", i, cusp_val[cusp_idx], cuda_val[i]);
+            return -1;
+        }
+    }
+    printf("Y(%d), compute succeed!\n", length);
+    return 0;
+}
+
+__host__
+void cusparse_spmv_all(MAT_VAL_TYPE *cu_ValA, MAT_PTR_TYPE *cu_RowPtrA, int *cu_ColIdxA, 
+                       MAT_VAL_TYPE *cu_ValX, MAT_VAL_TYPE *cu_ValY, int rowA, int colA, MAT_PTR_TYPE nnzA,
+                       long long int data_origin1, long long int data_origin2, double *cu_time, double *cu_gflops, double *cu_bandwidth1, double *cu_bandwidth2, double *cu_pre)
+{
+    struct timeval t1, t2;
+
+    MAT_VAL_TYPE *dA_val, *dX, *dY;
+    int *dA_cid;
+    MAT_PTR_TYPE *dA_rpt;
+    MAT_VAL_TYPE alpha = 1.0, beta = 0.0;
+
+    cudaMalloc((void **)&dA_val, sizeof(MAT_VAL_TYPE) * nnzA);
+    cudaMalloc((void **)&dA_cid, sizeof(int) * nnzA);
+    cudaMalloc((void **)&dA_rpt, sizeof(MAT_PTR_TYPE) * (rowA + 1));
+    cudaMalloc((void **)&dX, sizeof(MAT_VAL_TYPE) * colA);
+    cudaMalloc((void **)&dY, sizeof(MAT_VAL_TYPE) * rowA);
+
+    cudaMemcpy(dA_val, cu_ValA, sizeof(MAT_VAL_TYPE) * nnzA, cudaMemcpyHostToDevice);
+    cudaMemcpy(dA_cid, cu_ColIdxA, sizeof(int) * nnzA, cudaMemcpyHostToDevice);
+    cudaMemcpy(dA_rpt, cu_RowPtrA, sizeof(MAT_PTR_TYPE) * (rowA + 1), cudaMemcpyHostToDevice);
+    cudaMemcpy(dX, cu_ValX, sizeof(MAT_VAL_TYPE) * colA, cudaMemcpyHostToDevice);
+    cudaMemset(dY, 0.0, sizeof(MAT_VAL_TYPE) * rowA);
+
+    cusparseHandle_t     handle = NULL;
+    cusparseSpMatDescr_t matA;
+    cusparseDnVecDescr_t vecX, vecY;
+    void*                dBuffer = NULL;
+    size_t               bufferSize = 0;
+
+    gettimeofday(&t1, NULL);
+    cusparseCreate(&handle);
+    cusparseCreateCsr(&matA, rowA, colA, nnzA, dA_rpt, dA_cid, dA_val,
+                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+                        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+    cusparseCreateDnVec(&vecX, colA, dX, CUDA_R_64F);
+    cusparseCreateDnVec(&vecY, rowA, dY, CUDA_R_64F);
+    cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                            &alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
+                            CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
+    cudaMalloc(&dBuffer, bufferSize);
+    // cudaDeviceSynchronize();
+    gettimeofday(&t2, NULL);
+    double cusparse_pre = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
+    // printf("cusparse preprocessing time: %8.4lf ms\n", cusparse_pre);
+    *cu_pre = cusparse_pre;
+
+    for (int i = 0; i < 100; ++i)
+    {
+        cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                    &alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
+                    CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
+    }
+    cudaDeviceSynchronize();
+
+    gettimeofday(&t1, NULL);
+    for (int i = 0; i < 1000; ++i)
+    {
+        cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                    &alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
+                    CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
+    }
+    cudaDeviceSynchronize();
+    gettimeofday(&t2, NULL);
+    *cu_time = ((t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0) / 1000;
+    *cu_gflops = (double)((long)nnzA * 2) / (*cu_time * 1e6);
+    *cu_bandwidth1 = (double)data_origin1 / (*cu_time * 1e6); 
+    *cu_bandwidth2 = (double)data_origin2 / (*cu_time * 1e6); 
+    printf("cusparse:%8.4lf ms, %8.4lf Gflop/s, %9.4lf GB/s, %9.4lf GB/s\n", *cu_time, *cu_gflops, *cu_bandwidth1, *cu_bandwidth2);
+
+    cusparseDestroySpMat(matA);
+    cusparseDestroyDnVec(vecX);
+    cusparseDestroyDnVec(vecY);
+    cusparseDestroy(handle);
+
+    cudaMemcpy(cu_ValY, dY, sizeof(MAT_VAL_TYPE) * rowA, cudaMemcpyDeviceToHost);
+
+    cudaFree(dA_val);
+    cudaFree(dA_cid);
+    cudaFree(dA_rpt);
+    cudaFree(dX);
+    cudaFree(dY);
+}
+
+__host__
+int main(int argc, char **argv)
+{
+    if (argc < 2)
+    {
+        printf("Run the code by './spmv_double3 matrix.mtx'. \n");
+        return 0;
+    }
+
+    // struct timeval t1, t2;
+    int rowA, colA;
+    MAT_PTR_TYPE nnzA;
+    int isSymmetricA;
+    MAT_VAL_TYPE *csrValA;
+    int *csrColIdxA;
+    MAT_PTR_TYPE *csrRowPtrA;
+
+    char *filename;
+    filename = argv[1];
+    // int NUM = atoi(argv[2]);
+    // int block_longest = atoi(argv[3]);
+    int NUM = 4;
+    int block_longest = 256;
+    double threshold = 0.75;
+
+    printf("\n===%s===\n\n", filename);
+
+    mmio_allinone(&rowA, &colA, &nnzA, &isSymmetricA, &csrRowPtrA, &csrColIdxA, &csrValA, filename);
+    MAT_VAL_TYPE *X_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * colA);
+    initVec(X_val, colA);
+    initVec(csrValA, nnzA);
+
+    printf("INIT DONE\n");
+
+    MAT_VAL_TYPE *dY_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
+    MAT_VAL_TYPE *Y_val = (MAT_VAL_TYPE *)malloc(sizeof(MAT_VAL_TYPE) * rowA);
+    int *new_order = (int *)malloc(sizeof(int) * rowA);
+
+    // int warmup = 3, pre_num = 10;
+    
+    double cu_time = 0, cu_gflops = 0, cu_bandwidth1 = 0, cu_bandwidth2 = 0, cu_pre = 0;
+    long long int data_origin1 = (nnzA + colA + rowA) * sizeof(MAT_VAL_TYPE) + nnzA * sizeof(int) + (rowA + 1) * sizeof(MAT_PTR_TYPE);
+    long long int data_origin2 = (nnzA + nnzA + rowA) * sizeof(MAT_VAL_TYPE) + nnzA * sizeof(int) + (rowA + 1) * sizeof(MAT_PTR_TYPE);
+    
+    cusparse_spmv_all(csrValA, csrRowPtrA, csrColIdxA, X_val, dY_val, rowA, colA, nnzA, data_origin1, data_origin2, &cu_time, &cu_gflops, &cu_bandwidth1, &cu_bandwidth2, &cu_pre);
+    
+    // double dasp_pre = 0;
+    spmv_all3(filename, csrValA, csrRowPtrA, csrColIdxA, X_val, Y_val, new_order, rowA, colA, nnzA, NUM, threshold, block_longest);
+
+    FILE* fout;
+    fout = fopen("data/spmv_f64_record.csv", "a");
+    fprintf(fout, "%lld,%lf,%lf,%lf,%lf\n", data_origin1, cu_time, cu_gflops, cu_bandwidth1, cu_bandwidth2);
+    fclose(fout);
+    
+    /* verify the result with cusparse */
+    // int result = verify_new(dY_val, Y_val, new_order, rowA);
+
+    free(X_val);
+    free(Y_val);
+    free(dY_val);
+    free(csrColIdxA);
+    free(csrRowPtrA);
+    free(csrValA);
+    free(new_order);
+
+    return 0;
+}