From 75be0968fcad52b974f6aec931ff1060126e97a8 Mon Sep 17 00:00:00 2001 From: CGH0S7 Date: Tue, 13 Jan 2026 18:15:49 +0000 Subject: [PATCH] feat: port GPU code to CUDA 13 and enable GPU computation Major changes: - Update makefile.inc for CUDA 13.1 with sm_89 architecture (RTX 4050) - Replace deprecated cudaThreadSynchronize() with cudaDeviceSynchronize() - Add CUDA_SAFE_CALL macro for CUDA 13 compatibility - Fix duplicate function definitions (compare_result_gpu, SHStep) - Fix syntax error in bssn_step_gpu.C - Enable GPU calculation in AMSS_NCKU_Input.py - Successfully build ABEGPU executable --- AMSS_NCKU_Input.py | 8 ++-- AMSS_NCKU_source/bssn_gpu.cu | 56 +++++++++++++------------- AMSS_NCKU_source/bssn_gpu.h | 11 +++++ AMSS_NCKU_source/bssn_gpu_rhs_ss.cu | 62 ++++++++++++++--------------- AMSS_NCKU_source/bssn_step_gpu.C | 5 ++- AMSS_NCKU_source/makefile.inc | 9 +++-- 6 files changed, 83 insertions(+), 68 deletions(-) diff --git a/AMSS_NCKU_Input.py b/AMSS_NCKU_Input.py index 9799f00..d8395a0 100755 --- a/AMSS_NCKU_Input.py +++ b/AMSS_NCKU_Input.py @@ -18,10 +18,10 @@ Output_directory = "binary_output" ## binary data file directory ## The file directory name should not be too long MPI_processes = 96 ## number of mpi processes used in the simulation -GPU_Calculation = "no" ## Use GPU or not - ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface) -CPU_Part = 1.0 -GPU_Part = 0.0 +GPU_Calculation = "yes" ## Use GPU or not + ## GPU support has been updated for CUDA 13 +CPU_Part = 0.0 +GPU_Part = 1.0 ################################################# diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index e67ae18..3b75c5e 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -18,7 +18,7 @@ using namespace std; #include #endif -void compare_result_gpu(int ftag1,double * datac,int data_num){ +static void compare_result_gpu(int ftag1,double * datac,int 
data_num){ double * data = (double*)malloc(sizeof(double)*data_num); cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost); compare_result(ftag1,data,data_num); @@ -83,7 +83,7 @@ inline void sub_enforce_ga(int matrix_size){ double * trA = M_ chin1; enforce_ga<<>>(trA); cudaMemset(trA,0,matrix_size * sizeof(double)); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double)); //trA gxx,gyy,gzz gupxx,gupxy,gupxz,gupyy,gupyz,gupzz @@ -273,13 +273,13 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub #endif //ifdef Vertex inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){ sub_symmetry_bd_partF<<>>(ord,func,funcc); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_symmetry_bd_partI<<>>(ord,func,funcc,SoA[0]); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_symmetry_bd_partJ<<>>(ord,func,funcc,SoA[1]); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_symmetry_bd_partK<<>>(ord,func,funcc,SoA[2]); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } @@ -378,9 +378,9 @@ inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *f cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double)); cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double)); cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double)); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fdderivs_part1<<>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz ) @@ -445,9 +445,9 @@ inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz, cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double)); cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double)); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fderivs_part1<<>>(f,fh,fx,fy,fz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } __global__ void 
computeRicci_part1(double * dst) @@ -465,9 +465,9 @@ __global__ void computeRicci_part1(double * dst) inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta) { sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); computeRicci_part1<<>>(dst); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); }/*Exception*/ @@ -524,9 +524,9 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs) inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA) { sub_symmetry_bd(3,f,fh,SoA); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_kodis_part1<<>>(f,fh,f_rhs); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz) @@ -617,9 +617,9 @@ __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){ sub_symmetry_bd(3,f,fh,SoA); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_lopsided_part1<<>>(f,fh,f_rhs,Sfx,Sfy,Sfz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } __global__ void compute_rhs_bssn_part1() @@ -2656,13 +2656,13 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, #ifdef TIMING1 - cudaThreadSynchronize(); + cudaDeviceSynchronize(); gettimeofday(&tv2, NULL); cout<<"TIME USED"<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass); sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas); @@ -2701,7 +2701,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa); compute_rhs_bssn_part2<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ 
gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass); sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas); @@ -2711,7 +2711,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa); compute_rhs_bssn_part3<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta); computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta); @@ -2720,20 +2720,20 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta); computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); compute_rhs_bssn_part4<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); compute_rhs_bssn_part5<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); compute_rhs_bssn_part6<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5) sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss); @@ -2805,7 +2805,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, if(co == 0){ compute_rhs_bssn_part7<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss); sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas); @@ -2814,7 +2814,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa); sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss); compute_rhs_bssn_part8<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } #if (ABV == 1) @@ -2895,7 +2895,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, 
double *Y, //-------------------FOR GPU TEST---------------------- //----------------------------------------------------- #ifdef TIMING - cudaThreadSynchronize(); + cudaDeviceSynchronize(); gettimeofday(&tv2, NULL); cout<<"MPI rank is: "<>>(ord,func,funcc); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_symmetry_bd_ss_partI<<>>(ord,func,funcc,SoA[0]); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_symmetry_bd_ss_partJ<<>>(ord,func,funcc,SoA[1]); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } __global__ void sub_fderivs_shc_part1(double *fx,double *fy,double *fz){ @@ -247,13 +247,13 @@ inline void sub_fderivs_shc(int& sst,double * f,double * fh,double *fx,double *f //cudaMemset(Msh_ gy,0,h_3D_SIZE[0] * sizeof(double)); //cudaMemset(Msh_ gz,0,h_3D_SIZE[0] * sizeof(double)); sub_symmetry_bd_ss(2,f,fh,SoA1); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(0,fh,h_3D_SIZE[2]); sub_fderivs_sh<<>>(fh,Msh_ gx,Msh_ gy,Msh_ gz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fderivs_shc_part1<<>>(fx,fy,fz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(1,fx,h_3D_SIZE[0]); //compare_result_gpu(2,fy,h_3D_SIZE[0]); //compare_result_gpu(3,fz,h_3D_SIZE[0]); @@ -451,17 +451,17 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh, //fderivs_sh sub_symmetry_bd_ss(2,f,fh,SoA1); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(1,fh,h_3D_SIZE[2]); sub_fderivs_sh<<>>(fh,Msh_ gx,Msh_ gy,Msh_ gz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //fdderivs_sh sub_symmetry_bd_ss(2,f,fh,SoA1); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(21,fh,h_3D_SIZE[2]); sub_fdderivs_sh<<>>(fh,Msh_ gxx,Msh_ gxy,Msh_ gxz,Msh_ gyy,Msh_ gyz,Msh_ gzz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); /*compare_result_gpu(11,Msh_ gx,h_3D_SIZE[0]); compare_result_gpu(12,Msh_ gy,h_3D_SIZE[0]); compare_result_gpu(13,Msh_ gz,h_3D_SIZE[0]); 
@@ -472,7 +472,7 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh, compare_result_gpu(5,Msh_ gyz,h_3D_SIZE[0]); compare_result_gpu(6,Msh_ gzz,h_3D_SIZE[0]);*/ sub_fdderivs_shc_part1<<>>(fxx,fxy,fxz,fyy,fyz,fzz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); /*compare_result_gpu(1,fxx,h_3D_SIZE[0]); compare_result_gpu(2,fxy,h_3D_SIZE[0]); compare_result_gpu(3,fxz,h_3D_SIZE[0]); @@ -496,9 +496,9 @@ __global__ void computeRicci_ss_part1(double * dst) inline void computeRicci_ss(int &sst,double * src,double* dst,double * SoA, Meta* meta) { sub_fdderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); computeRicci_ss_part1<<>>(dst); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } __global__ void sub_lopsided_ss_part1(double * dst) @@ -516,9 +516,9 @@ __global__ void sub_lopsided_ss_part1(double * dst) inline void sub_lopsided_ss(int& sst,double *src,double* dst,double *SoA) { sub_fderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,SoA); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_lopsided_ss_part1<<>>(dst); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } __global__ void sub_kodis_sh_part1(double *f,double *fh,double *f_rhs) @@ -590,11 +590,11 @@ inline void sub_kodis_ss(int &sst,double *f,double *fh,double *f_rhs,double *SoA } //compare_result_gpu(10,f,h_3D_SIZE[0]); sub_symmetry_bd_ss(3,f,fh,SoA1); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(0,fh,h_3D_SIZE[3]); sub_kodis_sh_part1<<>>(f,fh,f_rhs); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(1,f_rhs,h_3D_SIZE[0]); } @@ -2287,13 +2287,13 @@ int gpu_rhs_ss(RHS_SS_PARA) #ifdef TIMING1 - cudaThreadSynchronize(); + cudaDeviceSynchronize(); gettimeofday(&tv2, NULL); cout<<"TIME USED"<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass); 
sub_fderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas); @@ -2322,7 +2322,7 @@ int gpu_rhs_ss(RHS_SS_PARA) sub_fderivs_shc(sst,Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa); compute_rhs_ss_part2<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fdderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass); sub_fdderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas); @@ -2332,7 +2332,7 @@ int gpu_rhs_ss(RHS_SS_PARA) sub_fderivs_shc( sst,Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa); compute_rhs_ss_part3<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); computeRicci_ss(sst,Mh_ dxx,Mh_ Rxx,sss, meta); computeRicci_ss(sst,Mh_ dyy,Mh_ Ryy,sss, meta); @@ -2340,25 +2340,25 @@ int gpu_rhs_ss(RHS_SS_PARA) computeRicci_ss(sst,Mh_ gxy,Mh_ Rxy,aas, meta); computeRicci_ss(sst,Mh_ gxz,Mh_ Rxz,asa, meta); computeRicci_ss(sst,Mh_ gyz,Mh_ Ryz,saa, meta); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); compute_rhs_ss_part4<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fdderivs_shc(sst,Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); - //cudaThreadSynchronize(); + //cudaDeviceSynchronize(); //compare_result_gpu(0,Mh_ chi,h_3D_SIZE[0]); //compare_result_gpu(1,Mh_ chi,h_3D_SIZE[0]); //compare_result_gpu(2,Mh_ fyz,h_3D_SIZE[0]); compute_rhs_ss_part5<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fdderivs_shc(sst,Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); compute_rhs_ss_part6<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5) sub_fderivs_shc(sst,Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss); @@ -2423,7 +2423,7 @@ int gpu_rhs_ss(RHS_SS_PARA) } if(co == 0){ compute_rhs_ss_part7<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fderivs_shc(sst,Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ 
gxxz,sss); sub_fderivs_shc(sst,Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas); @@ -2432,7 +2432,7 @@ int gpu_rhs_ss(RHS_SS_PARA) sub_fderivs_shc(sst,Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa); sub_fderivs_shc(sst,Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss); compute_rhs_ss_part8<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } #if (ABV == 1) @@ -2512,7 +2512,7 @@ int gpu_rhs_ss(RHS_SS_PARA) //test kodis //sub_kodis_sh(sst,Msh_ drhodx,Mh_ fh2,Msh_ drhody,sss); #ifdef TIMING - cudaThreadSynchronize(); + cudaDeviceSynchronize(); gettimeofday(&tv2, NULL); cout<<"MPI rank is: "<next; } } -d +#endif // #if 0 #endif // withshell diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 90ad7b8..acfc38a 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -3,7 +3,7 @@ filein = -I/usr/include -I/usr/include/openmpi-x86_64 -I/usr/lib/gcc/x86_64-lin ##filein = -I/usr/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ -I/usr/lib/cuda/include -LDLIBS = -L/usr/lib64/openmpi/lib -Wl,-rpath,/usr/lib64/openmpi/lib -lmpi -lgfortran +LDLIBS = -L/usr/lib64/openmpi/lib -Wl,-rpath,/usr/lib64/openmpi/lib -lmpi -lgfortran -L/usr/local/cuda-13.1/lib64 -Wl,-rpath,/usr/local/cuda-13.1/lib64 -lcudart -lcuda ##LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -L/usr/lib/cuda/lib64 -lcudart -lmpi -lgfortran CXXAPPFLAGS = -O3 -Wno-deprecated -Dfortran3 -Dnewc @@ -15,7 +15,8 @@ CXX = g++ CC = gcc CLINKER = mpic++ -Cu = nvcc -CUDA_LIB_PATH = -L/opt/nvidia/hpc_sdk/Linux_x86_64/24.5/cuda/lib64 -I/usr/include -I/opt/nvidia/hpc_sdk/Linux_x86_64/24.5/cuda/include +Cu = /usr/local/cuda-13.1/bin/nvcc +CUDA_LIB_PATH = -L/usr/local/cuda-13.1/lib64 -I/usr/include -I/usr/local/cuda-13.1/include #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 
-Dnewc -CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc +# -arch=sm_89 targets Ada Lovelace GPUs (compute capability 8.9, e.g. RTX 4050); change it to match your GPU +CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch=sm_89 -Dfortran3 -Dnewc