Cache GPU main-path device buffers
This commit is contained in:
@@ -33,15 +33,43 @@ struct DeviceArrays
|
||||
double *d = nullptr;
|
||||
};
|
||||
|
||||
inline bool copy_to_device(double *&dst, const double *src, size_t bytes)
|
||||
// A device allocation that is kept between calls so it can be reused.
// `ptr` is the device pointer (nullptr when nothing is allocated) and
// `capacity` is the size in bytes recorded for that allocation.
struct CachedBuffer
{
    double *ptr = nullptr; // device pointer, or nullptr when empty
    size_t capacity = 0;   // bytes recorded for `ptr`; 0 when empty
};
|
||||
|
||||
// Make sure `buffer` owns a device allocation of at least `bytes` bytes.
//
// An existing allocation is reused when it is already large enough.
// Otherwise the old allocation (if any) is released and a new one of exactly
// `bytes` bytes is requested. The previous contents are not preserved.
//
// Returns true when `buffer.ptr` is usable for `bytes` bytes, false when
// cudaMalloc failed (the error is reported and the buffer is left empty).
inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
{
    if (bytes <= buffer.capacity && buffer.ptr)
        return true;

    if (buffer.ptr)
    {
        cudaError_t free_err = cudaFree(buffer.ptr);
        if (free_err != cudaSuccess)
            report_cuda_error("cudaFree", free_err);
        // Drop the old pointer even if the free failed, so it is never reused.
        buffer.ptr = nullptr;
        buffer.capacity = 0;
    }

    cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
    if (err != cudaSuccess)
    {
        report_cuda_error("cudaMalloc", err);
        // Keep the empty-buffer invariant (ptr == nullptr, capacity == 0)
        // regardless of what the failed call left in the pointer.
        buffer.ptr = nullptr;
        return false;
    }

    buffer.capacity = bytes;
    return true;
}
|
||||
|
||||
inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
|
||||
{
|
||||
if (!ensure_capacity(dst, bytes))
|
||||
return false;
|
||||
|
||||
cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(H2D)", err);
|
||||
@@ -50,12 +78,6 @@ inline bool copy_to_device(double *&dst, const double *src, size_t bytes)
|
||||
return true;
|
||||
}
|
||||
|
||||
// Release a device allocation obtained from cudaMalloc.
// A null pointer is ignored. A failing cudaFree is reported through
// report_cuda_error (the same way ensure_capacity reports it) instead of
// being dropped silently.
inline void free_device(double *ptr)
{
    if (!ptr)
        return;

    cudaError_t err = cudaFree(ptr);
    if (err != cudaSuccess)
        report_cuda_error("cudaFree", err);
}
|
||||
|
||||
__global__ void enforce_ga_kernel(int n,
|
||||
double *dxx, double *gxy, double *gxz,
|
||||
double *dyy, double *gyz, double *dzz,
|
||||
@@ -376,31 +398,37 @@ int bssn_cuda_enforce_ga(int *ex,
|
||||
double *Axx, double *Axy, double *Axz,
|
||||
double *Ayy, double *Ayz, double *Azz)
|
||||
{
|
||||
struct EnforceGaCache
|
||||
{
|
||||
CachedBuffer dxx, gxy, gxz, dyy, gyz, dzz;
|
||||
CachedBuffer Axx, Axy, Axz, Ayy, Ayz, Azz;
|
||||
};
|
||||
static thread_local EnforceGaCache cache;
|
||||
|
||||
int n = count_points(ex);
|
||||
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||
|
||||
double *d_dxx = nullptr, *d_gxy = nullptr, *d_gxz = nullptr;
|
||||
double *d_dyy = nullptr, *d_gyz = nullptr, *d_dzz = nullptr;
|
||||
double *d_Axx = nullptr, *d_Axy = nullptr, *d_Axz = nullptr;
|
||||
double *d_Ayy = nullptr, *d_Ayz = nullptr, *d_Azz = nullptr;
|
||||
|
||||
bool ok = copy_to_device(d_dxx, dxx, bytes) &&
|
||||
copy_to_device(d_gxy, gxy, bytes) &&
|
||||
copy_to_device(d_gxz, gxz, bytes) &&
|
||||
copy_to_device(d_dyy, dyy, bytes) &&
|
||||
copy_to_device(d_gyz, gyz, bytes) &&
|
||||
copy_to_device(d_dzz, dzz, bytes) &&
|
||||
copy_to_device(d_Axx, Axx, bytes) &&
|
||||
copy_to_device(d_Axy, Axy, bytes) &&
|
||||
copy_to_device(d_Axz, Axz, bytes) &&
|
||||
copy_to_device(d_Ayy, Ayy, bytes) &&
|
||||
copy_to_device(d_Ayz, Ayz, bytes) &&
|
||||
copy_to_device(d_Azz, Azz, bytes);
|
||||
bool ok = copy_to_device(cache.dxx, dxx, bytes) &&
|
||||
copy_to_device(cache.gxy, gxy, bytes) &&
|
||||
copy_to_device(cache.gxz, gxz, bytes) &&
|
||||
copy_to_device(cache.dyy, dyy, bytes) &&
|
||||
copy_to_device(cache.gyz, gyz, bytes) &&
|
||||
copy_to_device(cache.dzz, dzz, bytes) &&
|
||||
copy_to_device(cache.Axx, Axx, bytes) &&
|
||||
copy_to_device(cache.Axy, Axy, bytes) &&
|
||||
copy_to_device(cache.Axz, Axz, bytes) &&
|
||||
copy_to_device(cache.Ayy, Ayy, bytes) &&
|
||||
copy_to_device(cache.Ayz, Ayz, bytes) &&
|
||||
copy_to_device(cache.Azz, Azz, bytes);
|
||||
|
||||
if (ok)
|
||||
{
|
||||
double *d_dxx = cache.dxx.ptr, *d_gxy = cache.gxy.ptr, *d_gxz = cache.gxz.ptr;
|
||||
double *d_dyy = cache.dyy.ptr, *d_gyz = cache.gyz.ptr, *d_dzz = cache.dzz.ptr;
|
||||
double *d_Axx = cache.Axx.ptr, *d_Axy = cache.Axy.ptr, *d_Axz = cache.Axz.ptr;
|
||||
double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
|
||||
void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
|
||||
&d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
|
||||
ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
|
||||
@@ -408,27 +436,22 @@ int bssn_cuda_enforce_ga(int *ex,
|
||||
|
||||
if (ok)
|
||||
{
|
||||
cudaError_t err = cudaMemcpy(dxx, d_dxx, bytes, cudaMemcpyDeviceToHost);
|
||||
cudaError_t err = cudaMemcpy(dxx, cache.dxx.ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err);
|
||||
ok = err == cudaSuccess;
|
||||
if (ok) { err = cudaMemcpy(gxy, d_gxy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(gxz, d_gxz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(dyy, d_dyy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(gyz, d_gyz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(dzz, d_dzz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axx, d_Axx, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axy, d_Axy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axz, d_Axz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Ayy, d_Ayy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Ayz, d_Ayz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Azz, d_Azz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(gxy, cache.gxy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(gxz, cache.gxz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(dyy, cache.dyy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(gyz, cache.gyz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(dzz, cache.dzz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axx, cache.Axx.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axy, cache.Axy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axz, cache.Axz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Ayy, cache.Ayy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Ayz, cache.Ayz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Azz, cache.Azz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; }
|
||||
}
|
||||
|
||||
free_device(d_dxx); free_device(d_gxy); free_device(d_gxz);
|
||||
free_device(d_dyy); free_device(d_gyz); free_device(d_dzz);
|
||||
free_device(d_Axx); free_device(d_Axy); free_device(d_Axz);
|
||||
free_device(d_Ayy); free_device(d_Ayz); free_device(d_Azz);
|
||||
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
@@ -446,6 +469,19 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
int lev,
|
||||
int rk_stage)
|
||||
{
|
||||
struct Rk4BoundaryCache
|
||||
{
|
||||
CachedBuffer X, Y, Z;
|
||||
CachedBuffer state0, boundary, stage, rhs;
|
||||
const double *host_X = nullptr;
|
||||
const double *host_Y = nullptr;
|
||||
const double *host_Z = nullptr;
|
||||
int nx = 0;
|
||||
int ny = 0;
|
||||
int nz = 0;
|
||||
};
|
||||
static thread_local Rk4BoundaryCache cache;
|
||||
|
||||
int nx = ex[0];
|
||||
int ny = ex[1];
|
||||
int nz = ex[2];
|
||||
@@ -457,23 +493,32 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||
|
||||
double *d_X = nullptr, *d_Y = nullptr, *d_Z = nullptr;
|
||||
double *d_state0 = nullptr, *d_boundary = nullptr, *d_stage = nullptr, *d_rhs = nullptr;
|
||||
bool ok = true;
|
||||
if (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
|
||||
cache.nx != nx || cache.ny != ny || cache.nz != nz)
|
||||
{
|
||||
ok = copy_to_device(cache.X, X, bytes_x) &&
|
||||
copy_to_device(cache.Y, Y, bytes_y) &&
|
||||
copy_to_device(cache.Z, Z, bytes_z);
|
||||
if (ok)
|
||||
{
|
||||
cache.host_X = X;
|
||||
cache.host_Y = Y;
|
||||
cache.host_Z = Z;
|
||||
cache.nx = nx;
|
||||
cache.ny = ny;
|
||||
cache.nz = nz;
|
||||
}
|
||||
}
|
||||
|
||||
bool ok = copy_to_device(d_X, X, bytes_x) &&
|
||||
copy_to_device(d_Y, Y, bytes_y) &&
|
||||
copy_to_device(d_Z, Z, bytes_z) &&
|
||||
copy_to_device(d_state0, state0, bytes) &&
|
||||
copy_to_device(d_boundary, boundary_src, bytes) &&
|
||||
copy_to_device(d_stage, stage_data, bytes) &&
|
||||
copy_to_device(d_rhs, rhs_accum, bytes);
|
||||
ok = ok &&
|
||||
copy_to_device(cache.state0, state0, bytes) &&
|
||||
copy_to_device(cache.boundary, boundary_src, bytes) &&
|
||||
copy_to_device(cache.stage, stage_data, bytes) &&
|
||||
copy_to_device(cache.rhs, rhs_accum, bytes);
|
||||
|
||||
if (!ok)
|
||||
{
|
||||
free_device(d_X); free_device(d_Y); free_device(d_Z);
|
||||
free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
|
||||
return 1;
|
||||
}
|
||||
|
||||
double dX = X[1] - X[0];
|
||||
double dY = Y[1] - Y[0];
|
||||
@@ -498,6 +543,9 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
|
||||
if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;
|
||||
|
||||
double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr;
|
||||
double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr;
|
||||
double *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
|
||||
double *bam_target = (rk_stage == 0) ? d_rhs : d_stage;
|
||||
const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
|
||||
void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
|
||||
@@ -513,12 +561,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
|
||||
if (ok)
|
||||
{
|
||||
double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
|
||||
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
|
||||
ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
|
||||
}
|
||||
|
||||
if (ok && lev > 0)
|
||||
{
|
||||
double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr;
|
||||
void *args[] = {&nx, &ny, &nz,
|
||||
&has_xmin, &has_ymin, &has_zmin,
|
||||
&has_xmax, &has_ymax, &has_zmax,
|
||||
@@ -528,45 +578,43 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
|
||||
if (ok)
|
||||
{
|
||||
cudaError_t err = cudaMemcpy(stage_data, d_stage, bytes, cudaMemcpyDeviceToHost);
|
||||
cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
||||
ok = err == cudaSuccess;
|
||||
if (ok)
|
||||
{
|
||||
err = cudaMemcpy(rhs_accum, d_rhs, bytes, cudaMemcpyDeviceToHost);
|
||||
err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);
|
||||
ok = err == cudaSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
free_device(d_X); free_device(d_Y); free_device(d_Z);
|
||||
free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
// Run lowerbound_kernel over the host array `chi` and copy the result back.
//
// ex    : grid extents, passed to count_points() to obtain the point count.
// chi   : host array that is uploaded, processed on the device and
//         overwritten with the device result.
// tinny : scalar forwarded unchanged to lowerbound_kernel.
//
// Returns 0 on success and 1 if the upload, the launch or the download failed.
//
// The device buffer is a function-local thread_local CachedBuffer: it is
// reused by later calls on the same thread and is not freed here.
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
{
    static thread_local CachedBuffer d_chi;

    int n = count_points(ex);
    const size_t bytes = static_cast<size_t>(n) * sizeof(double);
    dim3 block(256);
    dim3 grid(div_up(n, static_cast<int>(block.x)));

    // Upload; copy_to_device grows the cached buffer when needed.
    bool ok = copy_to_device(d_chi, chi, bytes);

    if (ok)
    {
        // The kernel argument array needs the address of a plain pointer.
        double *ptr = d_chi.ptr;
        void *args[] = {&n, &ptr, &tinny};
        ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
    }

    if (ok)
    {
        cudaError_t err = cudaMemcpy(chi, d_chi.ptr, bytes, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
        ok = err == cudaSuccess;
    }

    return ok ? 0 : 1;
}
|
||||
|
||||
@@ -1,23 +1,26 @@
|
||||
// includes, system
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <iostream>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <unistd.h>
|
||||
#include <iostream>
|
||||
#include <sys/time.h>
|
||||
#include <cuda.h>
|
||||
//#include "cutil.h"
|
||||
#include <nvrtc.h>
|
||||
#include <cuda_runtime.h>
|
||||
using namespace std;
|
||||
using namespace std;
|
||||
|
||||
//includes, bssn
|
||||
#include "gpu_mem.h"
|
||||
#include "bssn_gpu.h"
|
||||
#ifdef RESULT_CHECK
|
||||
#include <fstream>
|
||||
#endif
|
||||
|
||||
#include "bssn_gpu.h"
|
||||
#ifdef RESULT_CHECK
|
||||
#include <fstream>
|
||||
#endif
|
||||
|
||||
void destroy_meta(Meta *meta);
|
||||
|
||||
void compare_result_gpu(int ftag1,double * datac,int data_num){
|
||||
#ifdef RESULT_CHECK
|
||||
double * data = (double*)malloc(sizeof(double)*data_num);
|
||||
@@ -30,8 +33,426 @@ void compare_result_gpu(int ftag1,double * datac,int data_num){
|
||||
(void)data_num;
|
||||
#endif
|
||||
}
|
||||
|
||||
__global__ void test_const_address(double * testd){
|
||||
|
||||
namespace {
|
||||
|
||||
// Look up the node-local rank of this process in the environment.
//
// The variables are tried in the order listed below. The first one that
// holds a non-negative decimal integer fitting in an int wins. Variables
// that are unset, empty, non-numeric, negative or out of range are skipped.
// Trailing characters after the digits are ignored, as atoi did.
//
// Returns the rank, or -1 when no variable supplies a usable value.
int read_local_rank_from_env()
{
    static const char *const keys[] = {
        "AMSS_NCKU_CUDA_LOCAL_RANK",
        "I_MPI_LOCAL_RANK",
        "OMPI_COMM_WORLD_LOCAL_RANK",
        "MPI_LOCALRANKID",
        "PMI_LOCAL_RANK",
        "SLURM_LOCALID"
    };

    for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); ++i)
    {
        const char *value = getenv(keys[i]);
        if (!value || !*value)
            continue;

        // strtol reports "no digits" through `end` and saturates on overflow,
        // whereas atoi returns 0 for garbage and is undefined on overflow.
        char *end = nullptr;
        const long parsed = strtol(value, &end, 10);
        if (end == value)
            continue; // no digits at all
        if (parsed < 0 || parsed > 2147483647L)
            continue; // not representable as a non-negative int

        return static_cast<int>(parsed);
    }
    return -1;
}
|
||||
|
||||
// Read an explicit device override from AMSS_NCKU_CUDA_DEVICE.
//
// Returns the non-negative integer stored in the variable, or -1 when the
// variable is unset, empty, non-numeric, negative or too large for an int.
// Previously a non-numeric value was parsed by atoi as 0 and silently
// forced device 0.
int read_forced_device_from_env()
{
    const char *value = getenv("AMSS_NCKU_CUDA_DEVICE");
    if (!value || !*value)
        return -1;

    char *end = nullptr;
    const long parsed = strtol(value, &end, 10);
    if (end == value)
        return -1; // no digits: treat as "no override"
    if (parsed < 0 || parsed > 2147483647L)
        return -1; // not representable as a non-negative int

    return static_cast<int>(parsed);
}
|
||||
|
||||
// Choose the CUDA device for this process, make it current, and remember it.
//
// The decision is made once: the first call stores the result in a
// function-local static and later calls return that value without touching
// the CUDA runtime again.
//
// Selection order:
//   1. the override returned by read_forced_device_from_env();
//   2. otherwise the rank from read_local_rank_from_env(), falling back to
//      `mpi_rank`, taken modulo the number of devices.
// The result is then forced into [0, device count).
//
// Returns the device ordinal, or -1 when no device is available or
// cudaSetDevice fails.
int select_cuda_device_for_process(int mpi_rank)
{
    // -2: not decided yet, -1: no usable device, >= 0: selected ordinal.
    static int cached_device = -2;
    if (!(cached_device < -1))
        return cached_device;

    int available = 0;
    const cudaError_t count_status = cudaGetDeviceCount(&available);
    if (count_status != cudaSuccess || available <= 0)
        return cached_device = -1;

    int chosen = read_forced_device_from_env();
    if (chosen < 0)
    {
        const int env_rank = read_local_rank_from_env();
        const int rank = (env_rank < 0) ? mpi_rank : env_rank;
        chosen = rank % available;
    }

    // A negative remainder maps to device 0; an oversized override wraps.
    if (chosen < 0)
        chosen = 0;
    else if (chosen >= available)
        chosen %= available;

    const cudaError_t set_status = cudaSetDevice(chosen);
    if (set_status != cudaSuccess)
    {
        cerr << "cudaSetDevice(" << chosen << ") failed: "
             << cudaGetErrorString(set_status) << endl;
        return cached_device = -1;
    }

    return cached_device = chosen;
}
|
||||
|
||||
// One allocation request: where the pointer is stored and how many doubles
// the allocation must hold.
struct BufferSpec
{
    double **slot; // address of the pointer to fill in
    size_t count;  // number of double elements to allocate
};
|
||||
|
||||
// One copy request: destination, source and element count (in doubles).
struct CopySpec
{
    double *dst;       // destination of the copy
    const double *src; // source of the copy
    size_t count;      // number of double elements to copy
};
|
||||
|
||||
// One clear request: the buffer to zero and its length in doubles.
struct ZeroSpec
{
    double *ptr;  // buffer to clear
    size_t count; // number of double elements to clear
};
|
||||
|
||||
// Persistent state for the cached gpu_rhs device buffers: the Meta pointer
// table plus the grid shape and device it was built for.
struct GpuRhsCache
{
    Meta meta{};                   // device pointer table, value-initialized
    int ex[3]{0, 0, 0};            // grid extents the buffers were sized for
    int matrix_size{0};            // ex[0] * ex[1] * ex[2]
    int device{-1};                // device the buffers belong to, -1 if none
    bool allocated{false};         // true once every buffer was allocated
    const double *last_x{nullptr}; // coordinate array pointers reset together
    const double *last_y{nullptr}; // with the cache; their use is outside this
    const double *last_z{nullptr}; // view (presumably the last uploaded X/Y/Z)
};
|
||||
|
||||
// Accessor for the single GpuRhsCache instance. The object is a
// function-local static, so it is constructed on first use.
GpuRhsCache &gpu_rhs_cache()
{
    static GpuRhsCache instance;
    return instance;
}
|
||||
|
||||
// Overwrite every byte of *meta with zero. This does not release anything
// the structure pointed to.
void reset_meta(Meta *meta)
{
    memset(meta, 0, sizeof(*meta));
}
|
||||
|
||||
// Allocate `count` doubles of device memory into *ptr unless *ptr is
// already set.
//
// A non-null *ptr is trusted as-is; its size is not checked here.
// Returns true when *ptr is usable. Returns false when the byte count
// overflows size_t or cudaMalloc fails; in both cases *ptr is left null so
// later cleanup never sees an unspecified pointer.
bool ensure_device_buffer(double **ptr, size_t count)
{
    if (*ptr)
        return true;

    // Reject element counts whose byte size does not fit in size_t.
    if (count > static_cast<size_t>(-1) / sizeof(double))
    {
        cerr << "cudaMalloc failed: " << cudaGetErrorString(cudaErrorMemoryAllocation) << endl;
        return false;
    }

    cudaError_t err = cudaMalloc((void **)ptr, count * sizeof(double));
    if (err != cudaSuccess)
    {
        cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl;
        *ptr = nullptr;
        return false;
    }
    return true;
}
|
||||
|
||||
// Allocate every buffer described by specs[0 .. count), in order.
// Stops at the first failure and returns false; buffers allocated before
// the failure are left in place for the caller to release.
bool allocate_buffers(const BufferSpec *specs, size_t count)
{
    size_t index = 0;
    while (index < count)
    {
        const BufferSpec &request = specs[index];
        const bool allocated = ensure_device_buffer(request.slot, request.count);
        if (!allocated)
            return false;
        ++index;
    }
    return true;
}
|
||||
|
||||
// Perform the host-to-device copies described by specs[0 .. count), in
// order. The first failing cudaMemcpy is reported on cerr and ends the loop
// with a false return; true means every copy succeeded.
bool copy_buffers_to_device(const CopySpec *specs, size_t count)
{
    const CopySpec *const last = specs + count;
    for (const CopySpec *it = specs; it != last; ++it)
    {
        const size_t nbytes = it->count * sizeof(double);
        const cudaError_t status =
            cudaMemcpy(it->dst, it->src, nbytes, cudaMemcpyHostToDevice);
        if (status == cudaSuccess)
            continue;

        cerr << "cudaMemcpy(H2D) failed: " << cudaGetErrorString(status) << endl;
        return false;
    }
    return true;
}
|
||||
|
||||
// Clear each buffer in specs[0 .. count) to all-zero bytes with cudaMemset.
// Returns false (after reporting on cerr) at the first failing call,
// true when every buffer was cleared.
bool zero_buffers(const ZeroSpec *specs, size_t count)
{
    size_t index = 0;
    while (index < count)
    {
        const ZeroSpec &entry = specs[index];
        const cudaError_t status =
            cudaMemset(entry.ptr, 0, entry.count * sizeof(double));
        if (status != cudaSuccess)
        {
            cerr << "cudaMemset failed: " << cudaGetErrorString(status) << endl;
            return false;
        }
        ++index;
    }
    return true;
}
|
||||
|
||||
// Tear down the cached gpu_rhs buffers and return the cache to its empty
// state. Does nothing when the cache holds no allocation. Registered with
// atexit by register_gpu_rhs_cleanup().
void cleanup_gpu_rhs_cache()
{
    GpuRhsCache &state = gpu_rhs_cache();
    if (state.allocated)
    {
        // Make the owning device current before handing the table to
        // destroy_meta.
        if (state.device >= 0)
            cudaSetDevice(state.device);
        destroy_meta(&state.meta);
        reset_meta(&state.meta);

        for (int axis = 0; axis < 3; ++axis)
            state.ex[axis] = 0;
        state.matrix_size = 0;
        state.device = -1;
        state.allocated = false;
        state.last_x = state.last_y = state.last_z = nullptr;
    }
}
|
||||
|
||||
// Register cleanup_gpu_rhs_cache with atexit the first time this is called;
// later calls do nothing. Always returns true (the atexit result is not
// inspected).
bool register_gpu_rhs_cleanup()
{
    static bool registered = false;
    if (registered)
        return true;

    atexit(cleanup_gpu_rhs_cache);
    registered = true;
    return true;
}
|
||||
|
||||
// Make sure `cache` holds device buffers for grid extents `ex` on `device`.
//
// cache  : the persistent buffer cache to validate or rebuild.
// device : CUDA device ordinal the buffers must live on.
// ex     : grid extents {ex[0], ex[1], ex[2]}.
//
// When the cache is already allocated for the same device and extents this
// returns true immediately. Otherwise any previous buffers are destroyed
// and the full set is allocated again:
//   - X / Y / Z hold ex[0] / ex[1] / ex[2] doubles,
//   - every field buffer holds ex[0]*ex[1]*ex[2] doubles,
//   - fh holds (ex+2)^3 and fh2 holds (ex+3)^3 doubles (per-axis products).
//
// Returns true on success. On failure everything allocated so far is
// destroyed, the cache is marked unallocated and false is returned.
bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
{
    register_gpu_rhs_cleanup();

    const bool shape_changed =
        !cache.allocated ||
        cache.device != device ||
        cache.ex[0] != ex[0] ||
        cache.ex[1] != ex[1] ||
        cache.ex[2] != ex[2];

    if (!shape_changed)
        return true;

    if (cache.allocated)
    {
        if (cache.device >= 0)
            cudaSetDevice(cache.device);
        destroy_meta(&cache.meta);
        reset_meta(&cache.meta);

        // The old buffers were freed with their own device current. Switch
        // to the requested device before allocating, otherwise a rebuild for
        // a different device would allocate on the previous one.
        if (cache.device >= 0 && device >= 0)
        {
            cudaError_t err = cudaSetDevice(device);
            if (err != cudaSuccess)
            {
                cerr << "cudaSetDevice(" << device << ") failed: "
                     << cudaGetErrorString(err) << endl;
                cache.allocated = false;
                cache.last_x = nullptr;
                cache.last_y = nullptr;
                cache.last_z = nullptr;
                return false;
            }
        }
    }

    cache.device = device;
    cache.ex[0] = ex[0];
    cache.ex[1] = ex[1];
    cache.ex[2] = ex[2];
    cache.matrix_size = ex[0] * ex[1] * ex[2];
    cache.last_x = nullptr;
    cache.last_y = nullptr;
    cache.last_z = nullptr;

    Meta *meta = &cache.meta;
    const int matrix_size = cache.matrix_size;
    // n: element count shared by every per-grid-point field buffer.
    const size_t n = static_cast<size_t>(matrix_size);
    const size_t fh_size = static_cast<size_t>(ex[0] + 2) * static_cast<size_t>(ex[1] + 2) * static_cast<size_t>(ex[2] + 2);
    const size_t fh2_size = static_cast<size_t>(ex[0] + 3) * static_cast<size_t>(ex[1] + 3) * static_cast<size_t>(ex[2] + 3);

    const BufferSpec buffers[] = {
        // coordinate arrays
        {&meta->X, static_cast<size_t>(ex[0])},
        {&meta->Y, static_cast<size_t>(ex[1])},
        {&meta->Z, static_cast<size_t>(ex[2])},
        // state fields
        {&meta->chi, n}, {&meta->dxx, n}, {&meta->dyy, n}, {&meta->dzz, n},
        {&meta->trK, n}, {&meta->gxy, n}, {&meta->gxz, n}, {&meta->gyz, n},
        {&meta->Axx, n}, {&meta->Axy, n}, {&meta->Axz, n}, {&meta->Ayz, n},
        {&meta->Ayy, n}, {&meta->Azz, n},
        {&meta->Gamx, n}, {&meta->Gamy, n}, {&meta->Gamz, n},
        {&meta->Lap, n}, {&meta->betax, n}, {&meta->betay, n}, {&meta->betaz, n},
        {&meta->dtSfx, n}, {&meta->dtSfy, n}, {&meta->dtSfz, n},
        // *_rhs fields
        {&meta->chi_rhs, n}, {&meta->trK_rhs, n},
        {&meta->gxx_rhs, n}, {&meta->gxy_rhs, n}, {&meta->gxz_rhs, n},
        {&meta->gyy_rhs, n}, {&meta->gyz_rhs, n}, {&meta->gzz_rhs, n},
        {&meta->Axx_rhs, n}, {&meta->Axy_rhs, n}, {&meta->Axz_rhs, n},
        {&meta->Ayy_rhs, n}, {&meta->Ayz_rhs, n}, {&meta->Azz_rhs, n},
        {&meta->Gamx_rhs, n}, {&meta->Gamy_rhs, n}, {&meta->Gamz_rhs, n},
        {&meta->Lap_rhs, n},
        {&meta->betax_rhs, n}, {&meta->betay_rhs, n}, {&meta->betaz_rhs, n},
        {&meta->dtSfx_rhs, n}, {&meta->dtSfy_rhs, n}, {&meta->dtSfz_rhs, n},
        // rho / S* fields
        {&meta->rho, n}, {&meta->Sx, n}, {&meta->Sy, n}, {&meta->Sz, n},
        {&meta->Sxx, n}, {&meta->Sxy, n}, {&meta->Sxz, n},
        {&meta->Syy, n}, {&meta->Syz, n}, {&meta->Szz, n},
        // three-index Gam* fields
        {&meta->Gamxxx, n}, {&meta->Gamxxy, n}, {&meta->Gamxxz, n},
        {&meta->Gamxyy, n}, {&meta->Gamxyz, n}, {&meta->Gamxzz, n},
        {&meta->Gamyxx, n}, {&meta->Gamyxy, n}, {&meta->Gamyxz, n},
        {&meta->Gamyyy, n}, {&meta->Gamyyz, n}, {&meta->Gamyzz, n},
        {&meta->Gamzxx, n}, {&meta->Gamzxy, n}, {&meta->Gamzxz, n},
        {&meta->Gamzyy, n}, {&meta->Gamzyz, n}, {&meta->Gamzzz, n},
        // R* fields
        {&meta->Rxx, n}, {&meta->Rxy, n}, {&meta->Rxz, n},
        {&meta->Ryy, n}, {&meta->Ryz, n}, {&meta->Rzz, n},
        // *_Res fields
        {&meta->ham_Res, n},
        {&meta->movx_Res, n}, {&meta->movy_Res, n}, {&meta->movz_Res, n},
        {&meta->Gmx_Res, n}, {&meta->Gmy_Res, n}, {&meta->Gmz_Res, n},
        // remaining work buffers, in the original allocation order
        {&meta->gxx, n}, {&meta->gyy, n}, {&meta->gzz, n},
        {&meta->chix, n}, {&meta->chiy, n}, {&meta->chiz, n},
        {&meta->gxxx, n}, {&meta->gxyx, n}, {&meta->gxzx, n},
        {&meta->gyyx, n}, {&meta->gyzx, n}, {&meta->gzzx, n},
        {&meta->gxxy, n}, {&meta->gxyy, n}, {&meta->gxzy, n},
        {&meta->gyyy, n}, {&meta->gyzy, n}, {&meta->gzzy, n},
        {&meta->gxxz, n}, {&meta->gxyz, n}, {&meta->gxzz, n},
        {&meta->gyyz, n}, {&meta->gyzz, n}, {&meta->gzzz, n},
        {&meta->Lapx, n}, {&meta->Lapy, n}, {&meta->Lapz, n},
        {&meta->betaxx, n}, {&meta->betaxy, n}, {&meta->betaxz, n},
        {&meta->betayy, n}, {&meta->betayz, n}, {&meta->betazz, n},
        {&meta->betayx, n}, {&meta->betazy, n}, {&meta->betazx, n},
        {&meta->Kx, n}, {&meta->Ky, n}, {&meta->Kz, n},
        {&meta->Gamxx, n}, {&meta->Gamxy, n}, {&meta->Gamxz, n},
        {&meta->Gamyy, n}, {&meta->Gamyz, n}, {&meta->Gamzz, n},
        {&meta->Gamyx, n}, {&meta->Gamzy, n}, {&meta->Gamzx, n},
        {&meta->div_beta, n}, {&meta->S, n}, {&meta->f, n},
        {&meta->fxx, n}, {&meta->fxy, n}, {&meta->fxz, n},
        {&meta->fyy, n}, {&meta->fyz, n}, {&meta->fzz, n},
        {&meta->gupxx, n}, {&meta->gupxy, n}, {&meta->gupxz, n},
        {&meta->gupyy, n}, {&meta->gupyz, n}, {&meta->gupzz, n},
        {&meta->Gamxa, n}, {&meta->Gamya, n}, {&meta->Gamza, n},
        {&meta->alpn1, n}, {&meta->chin1, n},
        // padded scratch volumes
        {&meta->fh, fh_size},
        {&meta->fh2, fh2_size},
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
        {&meta->reta, n},
#endif
    };

    if (!allocate_buffers(buffers, sizeof(buffers) / sizeof(buffers[0])))
    {
        // Release whatever was allocated before the failure.
        destroy_meta(meta);
        reset_meta(meta);
        cache.allocated = false;
        cache.last_x = nullptr;
        cache.last_y = nullptr;
        cache.last_z = nullptr;
        return false;
    }

    cache.allocated = true;
    return true;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
__global__ void test_const_address(double * testd){
|
||||
int _t = blockIdx.x*blockDim.x+threadIdx.x;
|
||||
if(_t == 0)
|
||||
testd[0] = F1o3;
|
||||
@@ -2010,39 +2431,30 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
double *ham_Res, double *movx_Res, double *movy_Res,double * movz_Res,
|
||||
double * Gmx_Res, double *Gmy_Res,double * Gmz_Res ,
|
||||
int & Symmetry,int &Lev, double &eps, int &co)
|
||||
{
|
||||
//#1------------init gpu meta data---------------------
|
||||
//cout<<"init GPU meta data\n";
|
||||
|
||||
#ifdef DEVICE_ID
|
||||
// which device to use
|
||||
cudaSetDevice(DEVICE_ID);
|
||||
#endif
|
||||
|
||||
#ifdef DEVICE_ID_BY_PID
|
||||
pid_t pid = getpid();
|
||||
cudaSetDevice(pid % 2);
|
||||
cout<<"My pid= "<<pid<<endl;
|
||||
#endif
|
||||
|
||||
#ifdef DEVICE_ID_BY_MPI_RANK
|
||||
cudaSetDevice(mpi_rank % 2);
|
||||
#endif
|
||||
|
||||
#ifdef TIMING
|
||||
{
|
||||
//#1------------init gpu meta data---------------------
|
||||
//cout<<"init GPU meta data\n";
|
||||
|
||||
const int device = select_cuda_device_for_process(mpi_rank);
|
||||
if (device < 0)
|
||||
return 1;
|
||||
|
||||
#ifdef TIMING
|
||||
struct timeval tvStart, tvEnd;
|
||||
struct timeval tv1, tv2;
|
||||
gettimeofday(&tvStart, NULL );
|
||||
gettimeofday(&tv1, NULL );
|
||||
#endif
|
||||
|
||||
//int dim = 3;
|
||||
int matrix_size = ex[0] * ex[1] * ex[2];
|
||||
Meta met;
|
||||
Meta * meta = &met;
|
||||
//int dim = 3;
|
||||
int matrix_size = ex[0] * ex[1] * ex[2];
|
||||
GpuRhsCache &cache = gpu_rhs_cache();
|
||||
if (!prepare_gpu_rhs_cache(cache, device, ex))
|
||||
return 1;
|
||||
Meta * meta = &cache.meta;
|
||||
|
||||
/*
|
||||
//#1--------------------init_gpu_meta(meta,matrix_size)---------------------------
|
||||
/*
|
||||
//#1--------------------init_gpu_meta(meta,matrix_size)---------------------------
|
||||
|
||||
//1.1 inout
|
||||
CUDA_SAFE_CALL(cudaMalloc((void**)&(Mh_ X), ex[0] * sizeof(double)));
|
||||
@@ -2212,7 +2624,8 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
CUDA_SAFE_CALL(cudaMalloc((void**)&(Mh_ fh2), (ex[0]+3)*(ex[1]+3)*(ex[2]+3) * sizeof(double)));
|
||||
*/
|
||||
|
||||
//#1--------------------init_gpu_meta(meta,matrix_size)---------------------------
|
||||
#if 0
|
||||
//#1--------------------init_gpu_meta(meta,matrix_size)---------------------------
|
||||
|
||||
//1.1 inout
|
||||
cudaMalloc((void**)&(Mh_ X), ex[0] * sizeof(double));
|
||||
@@ -2396,114 +2809,133 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
|
||||
#endif
|
||||
|
||||
//2 ----------------Copy Data to Device------------------
|
||||
cudaMemcpy(Mh_ X, X, ex[0] * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Y, Y, ex[1] * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Z, Z, ex[2] * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ chi, chi, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ dxx, dxx, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ dyy, dyy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ dzz, dzz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ trK, trK, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ gxy, gxy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ gxz, gxz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ gyz, gyz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Axx, Axx, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Axy, Axy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Axz, Axz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Ayz, Ayz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Ayy, Ayy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Azz, Azz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Gamx, Gamx, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Gamy, Gamy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Gamz, Gamz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ betax, betax, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ betay, betay, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ betaz, betaz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ Lap, Lap, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ dtSfx, dtSfx, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ dtSfy, dtSfy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(Mh_ dtSfz, dtSfz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
|
||||
cudaMemset(Mh_ rho,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Sxx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Sxy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Sxz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Syz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Syy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Szz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Sx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Sy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Sz,0,matrix_size * sizeof(double));
|
||||
|
||||
//init local var
|
||||
cudaMemset(Mh_ gxx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gyy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gzz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ chix,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ chiy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ chiz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gxxx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gxyx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gxzx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gyyx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gyzx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gzzx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gxxy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gxyy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gxzy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gyyy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gyzy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gzzy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gxxz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gxyz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gxzz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gyyz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gyzz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gzzz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Lapx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Lapy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Lapz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ betaxx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ betaxy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ betaxz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ betayy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ betayz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ betazz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ betayx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ betazy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ betazx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Kx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Ky,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Kz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamxx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamxy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamxz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamyy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamyz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamzz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamyx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamzy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamzx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ div_beta,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ S,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ f,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ fxx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ fxy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ fxz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ fyy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ fyz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ fzz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gupxy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gupxz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gupyy,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gupyz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ gupzz,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamxa,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamya,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ Gamza,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ alpn1,0,matrix_size * sizeof(double));
|
||||
cudaMemset(Mh_ chin1,0,matrix_size * sizeof(double));
|
||||
#endif
|
||||
//2 ----------------Copy Data to Device------------------
|
||||
if (cache.last_x != X || cache.last_y != Y || cache.last_z != Z)
|
||||
{
|
||||
const CopySpec coord_copies[] = {
|
||||
{Mh_ X, X, static_cast<size_t>(ex[0])},
|
||||
{Mh_ Y, Y, static_cast<size_t>(ex[1])},
|
||||
{Mh_ Z, Z, static_cast<size_t>(ex[2])},
|
||||
};
|
||||
if (!copy_buffers_to_device(coord_copies, sizeof(coord_copies) / sizeof(coord_copies[0])))
|
||||
return 1;
|
||||
cache.last_x = X;
|
||||
cache.last_y = Y;
|
||||
cache.last_z = Z;
|
||||
}
|
||||
|
||||
const CopySpec state_copies[] = {
|
||||
{Mh_ chi, chi, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dxx, dxx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dyy, dyy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dzz, dzz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ trK, trK, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxy, gxy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxz, gxz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gyz, gyz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Axx, Axx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Axy, Axy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Axz, Axz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Ayz, Ayz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Ayy, Ayy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Azz, Azz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamx, Gamx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamy, Gamy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamz, Gamz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betax, betax, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betay, betay, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betaz, betaz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Lap, Lap, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dtSfx, dtSfx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dtSfy, dtSfy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dtSfz, dtSfz, static_cast<size_t>(matrix_size)},
|
||||
};
|
||||
if (!copy_buffers_to_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
|
||||
return 1;
|
||||
|
||||
const ZeroSpec zero_specs[] = {
|
||||
{Mh_ rho, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Sxx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Sxy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Sxz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Syz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Syy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Szz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Sx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Sy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Sz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gyy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gzz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ chix, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ chiy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ chiz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxxx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxyx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxzx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gyyx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gyzx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gzzx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxxy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxyy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxzy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gyyy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gyzy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gzzy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxxz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxyz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxzz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gyyz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gyzz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gzzz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Lapx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Lapy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Lapz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betaxx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betaxy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betaxz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betayy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betayz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betazz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betayx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betazy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betazx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Kx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Ky, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Kz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamxx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamxy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamxz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamyy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamyz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamzz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamyx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamzy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamzx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ div_beta, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ S, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ f, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ fxx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ fxy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ fxz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ fyy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ fyz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ fzz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gupxx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gupxy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gupxz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gupyy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gupyz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gupzz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamxa, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamya, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamza, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ alpn1, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ chin1, static_cast<size_t>(matrix_size)},
|
||||
};
|
||||
if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0])))
|
||||
return 1;
|
||||
|
||||
|
||||
double sss[3] = {1,1,1};
|
||||
@@ -2907,8 +3339,5 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
#endif
|
||||
|
||||
|
||||
destroy_meta(meta);
|
||||
|
||||
|
||||
return 0;//TODO return
|
||||
}
|
||||
return 0;//TODO return
|
||||
}
|
||||
|
||||
@@ -4,10 +4,8 @@
|
||||
#include "bssn_macro.h"
|
||||
#include "macrodef.fh"
|
||||
|
||||
#define DEVICE_ID 0
|
||||
// #define DEVICE_ID_BY_MPI_RANK
|
||||
#define GRID_DIM 256
|
||||
#define BLOCK_DIM 128
|
||||
#define GRID_DIM 256
|
||||
#define BLOCK_DIM 128
|
||||
|
||||
// _FH2_(i, j, k): flattened 3-D access into an array named `fh`, which must be
// in scope at the expansion site. The linear offset is
//   i + j * _1D_SIZE[2] + k * _2D_SIZE[2].
// NOTE(review): _1D_SIZE / _2D_SIZE are defined elsewhere; presumably they hold
// the row and plane strides for each buffer layout — confirm where they are set.
#define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]]
|
||||
// _FH3_(i, j, k): same access pattern as _FH2_, but the strides are read from
// entry 3 of _1D_SIZE / _2D_SIZE instead of entry 2.
#define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]
|
||||
|
||||
Reference in New Issue
Block a user