diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index 22bc8dc..931d3a0 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -64,6 +64,29 @@ int read_forced_device_from_env() return -1; } +int read_positive_env_value(const char *key) +{ + const char *value = getenv(key); + if (!value || !*value) + return -1; + + const int parsed = atoi(value); + return parsed > 0 ? parsed : -1; +} + +int clamp_launch_block_dim(int requested, int max_threads_per_block) +{ + if (requested <= 0) + return 0; + if (max_threads_per_block > 0 && requested > max_threads_per_block) + requested = max_threads_per_block; + + requested = (requested / 32) * 32; + if (requested <= 0) + requested = max_threads_per_block >= 32 ? 32 : max_threads_per_block; + return requested > 0 ? requested : BLOCK_DIM; +} + int select_cuda_device_for_process(int mpi_rank) { static int cached_device = -2; @@ -141,6 +164,19 @@ struct GpuRhsCache int mapped_buffer_count = 0; }; +struct GpuRhsLaunchConfig +{ + int device = -1; + int sm_count = 0; + int max_threads_per_block = 1024; + int grid_dim = GRID_DIM; + int block_dim = BLOCK_DIM; + int step_size = GRID_DIM * BLOCK_DIM; + int env_grid_dim = -1; + int env_block_dim = -1; + bool env_loaded = false; +}; + struct ExternalBufferRegistry { static const int max_mapped_buffers = 4096; @@ -175,6 +211,12 @@ GpuRhsCache &gpu_rhs_cache() return cache; } +GpuRhsLaunchConfig &gpu_rhs_launch_config() +{ + static GpuRhsLaunchConfig config; + return config; +} + ExternalBufferRegistry &external_buffer_registry() { static thread_local ExternalBufferRegistry registry; @@ -198,6 +240,16 @@ void reset_meta(Meta *meta) memset(meta, 0, sizeof(Meta)); } +int gpu_rhs_grid_dim() +{ + return gpu_rhs_launch_config().grid_dim; +} + +int gpu_rhs_block_dim() +{ + return gpu_rhs_launch_config().block_dim; +} + void reset_buffer_map(GpuRhsCache &cache) { cache.mapped_buffer_count = 0; @@ -652,6 +704,74 @@ void ensure_gpu_rhs_invariant_symbols() initialized = true; } +bool ensure_gpu_rhs_launch_symbols(int device, int matrix_size) +{ + GpuRhsLaunchConfig &config = gpu_rhs_launch_config(); + if (!config.env_loaded) + { + config.env_grid_dim = read_positive_env_value("AMSS_GPU_GRID_DIM"); + config.env_block_dim = read_positive_env_value("AMSS_GPU_BLOCK_DIM"); + config.env_loaded = true; + } + + if (config.device != device || config.sm_count <= 0) + { + cudaDeviceProp prop; + cudaError_t err = cudaGetDeviceProperties(&prop, device); + if (err != cudaSuccess) + { + cerr << "cudaGetDeviceProperties(" << device << ") failed: " + << cudaGetErrorString(err) << endl; + return false; + } + + config.device = device; + config.sm_count = prop.multiProcessorCount; + config.max_threads_per_block = prop.maxThreadsPerBlock; + } + + int block_dim = clamp_launch_block_dim(config.env_block_dim > 0 ? config.env_block_dim : 256, + config.max_threads_per_block); + if (block_dim <= 0) + block_dim = BLOCK_DIM; + + int grid_dim = 1; + if (config.env_grid_dim > 0) + { + grid_dim = config.env_grid_dim; + } + else + { + int needed_blocks = (matrix_size + block_dim - 1) / block_dim; + int grid_cap = config.sm_count > 0 ? config.sm_count * 4 : GRID_DIM; + if (grid_cap < 64) + grid_cap = 64; + if (grid_cap > 512) + grid_cap = 512; + grid_dim = needed_blocks < grid_cap ? needed_blocks : grid_cap; + } + + if (grid_dim <= 0) + grid_dim = 1; + + const int step_size = grid_dim * block_dim; + if (config.step_size != step_size) + { + cudaError_t err = cudaMemcpyToSymbol(STEP_SIZE, &step_size, sizeof(int)); + if (err != cudaSuccess) + { + cerr << "cudaMemcpyToSymbol(STEP_SIZE) failed: " + << cudaGetErrorString(err) << endl; + return false; + } + config.step_size = step_size; + } + + config.grid_dim = grid_dim; + config.block_dim = block_dim; + return true; +} + bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex) { register_gpu_rhs_cleanup(); @@ -1318,7 +1438,7 @@ __global__ void enforce_ga(double * trA){ } inline void sub_enforce_ga(double *trA, int matrix_size){ - enforce_ga<<>>(trA); + enforce_ga<<>>(trA); cudaMemset(trA,0,matrix_size * sizeof(double)); //cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double)); @@ -1508,10 +1628,10 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub #endif //ifdef Cell #endif //ifdef Vertex inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){ - sub_symmetry_bd_partF<<>>(ord,func,funcc); - sub_symmetry_bd_partI<<>>(ord,func,funcc,SoA[0]); - sub_symmetry_bd_partJ<<>>(ord,func,funcc,SoA[1]); - sub_symmetry_bd_partK<<>>(ord,func,funcc,SoA[2]); + sub_symmetry_bd_partF<<>>(ord,func,funcc); + sub_symmetry_bd_partI<<>>(ord,func,funcc,SoA[0]); + sub_symmetry_bd_partJ<<>>(ord,func,funcc,SoA[1]); + sub_symmetry_bd_partK<<>>(ord,func,funcc,SoA[2]); } @@ -1610,7 +1730,7 @@ inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *f cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double)); cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double)); cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double)); - sub_fdderivs_part1<<>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz); + sub_fdderivs_part1<<>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz); } __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz ) @@ -1675,7 +1795,7 @@ inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz, cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double)); cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double)); - sub_fderivs_part1<<>>(f,fh,fx,fy,fz); + sub_fderivs_part1<<>>(f,fh,fx,fy,fz); } __global__ void computeRicci_part1(double * dst) @@ -1693,7 +1813,7 @@ __global__ void computeRicci_part1(double * dst) inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta) { sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA); - computeRicci_part1<<>>(dst); + computeRicci_part1<<>>(dst); }/*Exception*/ @@ -1750,7 +1870,7 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs) inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA) { sub_symmetry_bd(3,f,fh,SoA); - sub_kodis_part1<<>>(f,fh,f_rhs); + sub_kodis_part1<<>>(f,fh,f_rhs); } __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz) @@ -1841,7 +1961,7 @@ __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){ sub_symmetry_bd(3,f,fh,SoA); - sub_lopsided_part1<<>>(f,fh,f_rhs,Sfx,Sfy,Sfz); + sub_lopsided_part1<<>>(f,fh,f_rhs,Sfx,Sfy,Sfz); } __global__ void compute_rhs_bssn_part1() @@ -3246,6 +3366,8 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, GpuRhsCache &cache = gpu_rhs_cache(); if (!prepare_gpu_rhs_cache(cache, device, ex)) return 1; + if (!ensure_gpu_rhs_launch_symbols(device, matrix_size)) + return 1; Meta * meta = &cache.meta; const int effective_co = (calledby == CALLED_BY_STEP) ? 1 : co; @@ -3922,7 +4044,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, //4.0------enforce_ga--------- sub_enforce_ga(Mh_ chin1, matrix_size); //4.1-----compute rhs--------- - compute_rhs_bssn_part1<<>>(); + compute_rhs_bssn_part1<<>>(); sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass); sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas); @@ -3937,7 +4059,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, sub_fderivs(Mh_ gxz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz, asa); sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa); - compute_rhs_bssn_part2<<>>(); + compute_rhs_bssn_part2<<>>(); sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass); sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas); @@ -3946,7 +4068,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, sub_fderivs( Mh_ Gamy, Mh_ fh,Mh_ Gamyx, Mh_ Gamyy, Mh_ Gamyz,sas); sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa); - compute_rhs_bssn_part3<<>>(); + compute_rhs_bssn_part3<<>>(); computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta); computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta); @@ -3955,19 +4077,19 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta); computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta); - compute_rhs_bssn_part4<<>>(); + compute_rhs_bssn_part4<<>>(); sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); - compute_rhs_bssn_part5<<>>(); + compute_rhs_bssn_part5<<>>(); sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); - compute_rhs_bssn_part6<<>>(); + compute_rhs_bssn_part6<<>>(); #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5) sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss); - compute_rhs_bssn_part6_gauge<<>>(); + compute_rhs_bssn_part6_gauge<<>>(); #endif sub_lopsided(Mh_ gxx,Mh_ fh2,Mh_ gxx_rhs,Mh_ betax,Mh_ betay,Mh_ betaz,sss); @@ -4034,7 +4156,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, } if(effective_co == 0){ - compute_rhs_bssn_part7<<>>(); + compute_rhs_bssn_part7<<>>(); sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss); sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas); @@ -4042,7 +4164,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, sub_fderivs(Mh_ Ayy,Mh_ fh,Mh_ gyyx,Mh_ gyyy,Mh_ gyyz,sss); sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa); sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss); - compute_rhs_bssn_part8<<>>(); + compute_rhs_bssn_part8<<>>(); } #if (ABV == 1)