feat: port GPU code to CUDA 13 and enable GPU computation

Major changes:
   - Update makefile.inc for CUDA 13.1 with sm_89 architecture (RTX 4050)
   - Replace deprecated cudaThreadSynchronize() with cudaDeviceSynchronize()
   - Add CUDA_SAFE_CALL macro for CUDA 13 compatibility
   - Fix duplicate function definitions (compare_result_gpu, SHStep)
   - Fix syntax error in bssn_step_gpu.C
   - Enable GPU calculation in AMSS_NCKU_Input.py
   - Successfully build ABEGPU executable
This commit is contained in:
CGH0S7
2026-01-13 18:15:49 +00:00
parent b27e071cde
commit 75be0968fc
6 changed files with 83 additions and 68 deletions

View File

@@ -18,7 +18,7 @@ using namespace std;
#include <fstream>
#endif
void compare_result_gpu(int ftag1,double * datac,int data_num){
static void compare_result_gpu(int ftag1,double * datac,int data_num){
double * data = (double*)malloc(sizeof(double)*data_num);
cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
compare_result(ftag1,data,data_num);
@@ -83,7 +83,7 @@ inline void sub_enforce_ga(int matrix_size){
double * trA = M_ chin1;
enforce_ga<<<GRID_DIM,BLOCK_DIM>>>(trA);
cudaMemset(trA,0,matrix_size * sizeof(double));
cudaThreadSynchronize();
cudaDeviceSynchronize();
//cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double));
//trA gxx,gyy,gzz gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
@@ -273,13 +273,13 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
#endif //ifdef Vertex
inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
@@ -378,9 +378,9 @@ inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *f
cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
__global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz )
@@ -445,9 +445,9 @@ inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,
cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
__global__ void computeRicci_part1(double * dst)
@@ -465,9 +465,9 @@ __global__ void computeRicci_part1(double * dst)
inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
{
sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
cudaThreadSynchronize();
cudaDeviceSynchronize();
computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}/*Exception*/
@@ -524,9 +524,9 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
{
sub_symmetry_bd(3,f,fh,SoA);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
__global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
@@ -617,9 +617,9 @@ __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S
inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
sub_symmetry_bd(3,f,fh,SoA);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
__global__ void compute_rhs_bssn_part1()
@@ -2656,13 +2656,13 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
#ifdef TIMING1
cudaThreadSynchronize();
cudaDeviceSynchronize();
gettimeofday(&tv2, NULL);
cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl;
#endif
//cout<<"GPU meta data ready.\n";
cudaThreadSynchronize();
cudaDeviceSynchronize();
//--------------test constant memory address & value--------------
/* double rank = mpi_rank;
@@ -2685,7 +2685,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
//sub_enforce_ga(matrix_size);
//4.1-----compute rhs---------
compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2701,7 +2701,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2711,7 +2711,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2720,20 +2720,20 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);
cudaThreadSynchronize();
cudaDeviceSynchronize();
compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2805,7 +2805,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
if(co == 0){
compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2814,7 +2814,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
#if (ABV == 1)
@@ -2895,7 +2895,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
//-------------------FOR GPU TEST----------------------
//-----------------------------------------------------
#ifdef TIMING
cudaThreadSynchronize();
cudaDeviceSynchronize();
gettimeofday(&tv2, NULL);
cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl;
#endif