Complete BSSN-EScalar CUDA resident transfers

This commit is contained in:
2026-05-05 23:57:42 +08:00
parent 85fe29cc2e
commit ae64a22178
5 changed files with 995 additions and 72 deletions

View File

@@ -502,7 +502,7 @@ static const int STAGE_SLOT_COUNT =
// Compile-time sizes for the BSSN state sets and the device-resident banks.
static constexpr int BSSN_STATE_COUNT = 24;
static constexpr int BSSN_MATTER_COUNT = 10;
static constexpr int BSSN_LK_FIELD_COUNT = 24;
// Number of resident banks scanned by the bank lookup/reuse loops.
// The stale pre-change definition (= 4) is dropped: defining the same
// constant twice is a redefinition error.
static constexpr int BSSN_RESIDENT_BANK_COUNT = 6;
static constexpr int BSSN_ESCALAR_STATE_COUNT = 26;
// Per-bank state capacity is sized for the larger (EScalar) state set.
static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_ESCALAR_STATE_COUNT;
@@ -5285,11 +5285,24 @@ static bool resident_key_matches(const StepContext &ctx, int bank, double **host
// Look up the resident bank whose key matches the first `state_count`
// entries of `host_key` (as decided by resident_key_matches_count).
//
// Several banks may carry the same key, so the choice is ranked:
//   1. among matching banks with resident_valid set, the largest resident_age;
//   2. otherwise, among matching banks without resident_valid, the largest
//      resident_age.
// Returns the bank index, or -1 when `host_key` is null or no bank matches.
//
// The earlier "return the first match" / "return -1" statements are removed:
// left in place they made the ranking below unreachable.
static int find_resident_bank_count(const StepContext &ctx, double **host_key, int state_count)
{
  if (!host_key) return -1;
  int best = -1;
  unsigned long long best_age = 0;
  int best_invalid = -1;
  unsigned long long best_invalid_age = 0;
  for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
    if (!resident_key_matches_count(ctx, b, host_key, state_count))
      continue;
    if (ctx.resident_valid[b]) {
      // `best < 0` lets the first candidate win even when its age is 0.
      if (best < 0 || ctx.resident_age[b] > best_age) {
        best = b;
        best_age = ctx.resident_age[b];
      }
    } else if (best_invalid < 0 || ctx.resident_age[b] > best_invalid_age) {
      best_invalid = b;
      best_invalid_age = ctx.resident_age[b];
    }
  }
  // A valid match always outranks an invalid one; best_invalid is -1 when
  // nothing matched at all.
  return (best >= 0) ? best : best_invalid;
}
static int find_resident_bank_subset(const StepContext &ctx,
@@ -5299,6 +5312,10 @@ static int find_resident_bank_subset(const StepContext &ctx,
{
if (!host_key || !state_indices || subset_count <= 0)
return -1;
int best = -1;
unsigned long long best_age = 0;
int best_invalid = -1;
unsigned long long best_invalid_age = 0;
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
bool match = true;
for (int i = 0; i < subset_count; ++i) {
@@ -5310,10 +5327,19 @@ static int find_resident_bank_subset(const StepContext &ctx,
break;
}
}
if (match)
return b;
if (!match)
continue;
if (ctx.resident_valid[b]) {
if (best < 0 || ctx.resident_age[b] > best_age) {
best = b;
best_age = ctx.resident_age[b];
}
} else if (best_invalid < 0 || ctx.resident_age[b] > best_invalid_age) {
best_invalid = b;
best_invalid_age = ctx.resident_age[b];
}
}
return -1;
return (best >= 0) ? best : best_invalid;
}
static int find_resident_bank(const StepContext &ctx, double **host_key)
@@ -5373,6 +5399,16 @@ static void mark_resident_host_subset_clean(StepContext &ctx,
}
}
// Set the host-clean flag of a single state slot in one resident bank.
// The flag is stored as 1 when `clean` is true and 0 otherwise.
// Out-of-range `bank` or `state_index` values are silently ignored.
// NOTE(review): "clean" presumably means the host copy matches the device
// copy for that state -- confirm against the callers.
static void mark_resident_host_state_clean(StepContext &ctx,
                                           int bank,
                                           int state_index,
                                           bool clean)
{
  const bool bank_in_range =
      (bank >= 0) && (bank < BSSN_RESIDENT_BANK_COUNT);
  const bool state_in_range =
      (state_index >= 0) && (state_index < BSSN_RESIDENT_STATE_CAPACITY);
  if (bank_in_range && state_in_range) {
    ctx.resident_host_clean[bank][state_index] = clean ? 1 : 0;
  }
}
static void mark_resident_current_bank(StepContext &ctx, int bank)
{
if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return;
@@ -5632,6 +5668,12 @@ static int reserve_escalar_resident_output_bank(StepContext &ctx,
// Forward declaration: predicate over a bank index and up to three bank
// indices to avoid (presumably true when `bank` equals one of them --
// confirm at the definition).
static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c);
// Forward declaration so that reserve_escalar_resident_output_bank_avoiding()
// can call this helper before its definition further down the file.
static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx,
int avoid_a,
int avoid_b,
int avoid_c,
size_t all);
static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx,
double **host_key,
size_t all,
@@ -5658,7 +5700,7 @@ static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx,
}
}
if (bank < 0)
bank = choose_escalar_resident_bank_for_reuse(ctx, avoid_a, all);
bank = choose_escalar_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT);
ctx.resident_valid[bank] = false;
ctx.resident_age[bank] = ++ctx.resident_clock;
@@ -5734,12 +5776,140 @@ static int reserve_resident_output_bank_avoiding(StepContext &ctx,
return bank;
}
// Pick a resident bank that may be recycled for an EScalar state set while
// leaving the banks named by avoid_a / avoid_b / avoid_c alone.
//
// Selection order:
//   1. the lowest-index non-avoided bank whose resident_valid flag is clear
//      (returned as-is, nothing else is touched);
//   2. otherwise the non-avoided bank with the smallest resident_age (lowest
//      index wins ties); it is written back via writeback_resident_bank_count
//      and then its key, clean flags, age and valid flag are reset;
//   3. if every bank is avoided, defer to
//      choose_escalar_resident_bank_for_reuse(ctx, avoid_a, all).
static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx,
                                                           int avoid_a,
                                                           int avoid_b,
                                                           int avoid_c,
                                                           size_t all)
{
  int victim = -1;
  unsigned long long victim_age = 0;
  for (int bank = 0; bank < BSSN_RESIDENT_BANK_COUNT; ++bank) {
    if (bank_is_avoided(bank, avoid_a, avoid_b, avoid_c))
      continue;
    // A non-valid bank is free for the taking; the first one found wins.
    if (!ctx.resident_valid[bank])
      return bank;
    // Otherwise remember the least recently used candidate.
    if (victim < 0 || ctx.resident_age[bank] < victim_age) {
      victim = bank;
      victim_age = ctx.resident_age[bank];
    }
  }

  if (victim < 0)
    return choose_escalar_resident_bank_for_reuse(ctx, avoid_a, all);

  // Write the bank back before its bookkeeping is wiped.
  writeback_resident_bank_count(ctx, victim, all, BSSN_ESCALAR_STATE_COUNT);
  ctx.resident_valid[victim] = false;
  ctx.resident_host[victim].fill(nullptr);
  ctx.resident_host_clean[victim].fill(0);
  ctx.resident_age[victim] = 0;

  // The evicted bank can no longer serve as the current state.
  if (ctx.current_bank == victim) {
    ctx.current_bank = -1;
    ctx.d_state_curr_mem = nullptr;
    ctx.d_state_curr.fill(nullptr);
  }
  update_state_ready(ctx);
  return victim;
}
// Return the resident bank keyed by `host_key` (BSSN state set), creating the
// association if needed without recycling banks avoid_a / avoid_b / avoid_c.
//
// - Unusable key: returns ctx.current_bank when it is >= 0, else bank 0.
// - Key already resident: bumps the bank's age; if the bank is not valid and
//   `upload_if_missing` is set, the host state is uploaded into it.
// - Key not resident: a bank is chosen via
//   choose_resident_bank_for_reuse_avoiding, keyed, and either uploaded
//   (valid + host-clean) or left invalid and not host-clean; then
//   update_state_ready is called.
static int ensure_resident_bank_avoiding(StepContext &ctx,
                                         double **host_key,
                                         size_t all,
                                         bool upload_if_missing,
                                         int avoid_a,
                                         int avoid_b,
                                         int avoid_c)
{
  if (!resident_key_usable(host_key))
    return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

  // Shared upload sequence for both the hit and the miss path.
  const auto upload_into = [&](int target) {
    bind_state_input_slots(ctx.d_resident[target]);
    upload_state_inputs(host_key, all);
    ctx.resident_valid[target] = true;
    set_resident_host_clean(ctx, target, true);
  };

  const int hit = find_resident_bank(ctx, host_key);
  if (hit >= 0) {
    ctx.resident_age[hit] = ++ctx.resident_clock;
    if (upload_if_missing && !ctx.resident_valid[hit])
      upload_into(hit);
    return hit;
  }

  const int fresh =
      choose_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
  assign_resident_key(ctx, fresh, host_key);
  if (upload_if_missing) {
    upload_into(fresh);
  } else {
    ctx.resident_valid[fresh] = false;
    set_resident_host_clean(ctx, fresh, false);
  }
  update_state_ready(ctx);
  return fresh;
}
// EScalar counterpart of the "ensure bank, avoiding" lookup: return the
// resident bank keyed by the first BSSN_ESCALAR_STATE_COUNT entries of
// `host_key`, never recycling banks avoid_a / avoid_b / avoid_c.
//
// - Unusable key: returns ctx.current_bank when it is >= 0, else bank 0.
// - Hit: the bank's age is bumped; an upload happens only when the bank is
//   not valid and `upload_if_missing` is set.
// - Miss: a bank is chosen and keyed; it is uploaded when `upload_if_missing`
//   is set, otherwise marked invalid and not host-clean; update_state_ready
//   runs on this path only.
// Every upload is followed by cudaDeviceSynchronize() before the bank is
// flagged valid.
static int ensure_escalar_resident_bank_avoiding(StepContext &ctx,
                                                 double **host_key,
                                                 size_t all,
                                                 bool upload_if_missing,
                                                 int avoid_a,
                                                 int avoid_b,
                                                 int avoid_c)
{
  if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT))
    return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

  int bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
  const bool hit = (bank >= 0);
  if (hit) {
    ctx.resident_age[bank] = ++ctx.resident_clock;
  } else {
    bank = choose_escalar_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b,
                                                           avoid_c, all);
    assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT);
  }

  // A freshly keyed bank is always uploaded on request; an existing one only
  // when its device copy is not valid.
  const bool do_upload = upload_if_missing && (!hit || !ctx.resident_valid[bank]);
  if (do_upload) {
    bind_escalar_state_input_slots(ctx.d_resident[bank]);
    upload_escalar_state_inputs(host_key, all);
    CUDA_CHECK(cudaDeviceSynchronize());
    ctx.resident_valid[bank] = true;
    set_resident_host_clean(ctx, bank, true);
  } else if (!hit) {
    ctx.resident_valid[bank] = false;
    set_resident_host_clean(ctx, bank, false);
  }

  if (!hit)
    update_state_ready(ctx);
  return bank;
}
static int active_or_keyed_bank(StepContext &ctx,
double **host_key,
size_t all,
bool upload_if_missing)
bool upload_if_missing,
int state_count = BSSN_STATE_COUNT)
{
if (resident_key_usable(host_key)) {
if (state_count == BSSN_ESCALAR_STATE_COUNT &&
resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) {
int bank = ensure_escalar_resident_bank(ctx, host_key, all, upload_if_missing);
mark_resident_current_bank(ctx, bank);
return bank;
}
if (state_count == BSSN_STATE_COUNT && resident_key_usable(host_key)) {
int bank = ensure_resident_bank(ctx, host_key, all, upload_if_missing);
mark_resident_current_bank(ctx, bank);
return bank;
@@ -6200,6 +6370,8 @@ int bssn_escalar_cuda_rk4_substep(void *block_tag,
g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
if (use_resident_state && input_bank >= 0)
set_resident_host_clean(ctx, input_bank, false);
}
if (RK4 == 0) {
@@ -6957,9 +7129,10 @@ static void copy_state_region_cuda(void *block_tag,
ctx.resident_valid[bank] = true;
ctx.resident_age[bank] = ++ctx.resident_clock;
mark_resident_current_bank(ctx, bank);
mark_resident_host_state_clean(ctx, bank, state_index, false);
update_state_ready(ctx);
} else {
ctx.resident_host_clean[bank][state_index] = 1;
mark_resident_host_state_clean(ctx, bank, state_index, true);
}
}
@@ -6970,9 +7143,11 @@ static void copy_state_region_packed_cuda(void *block_tag,
int i0, int j0, int k0,
int sx, int sy, int sz,
cudaMemcpyKind kind,
double **state_host_key = nullptr)
double **state_host_key = nullptr,
int state_count = BSSN_STATE_COUNT)
{
if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return;
if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
if (sx <= 0 || sy <= 0 || sz <= 0) return;
const size_t src_pitch = (size_t)ex[0] * sizeof(double);
@@ -6980,7 +7155,8 @@ static void copy_state_region_packed_cuda(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all,
kind == cudaMemcpyHostToDevice);
kind == cudaMemcpyHostToDevice,
state_count);
double *base_mem = ctx.d_resident_mem[bank];
cudaMemcpy3DParms p = {};
@@ -7003,9 +7179,10 @@ static void copy_state_region_packed_cuda(void *block_tag,
ctx.resident_valid[bank] = true;
ctx.resident_age[bank] = ++ctx.resident_clock;
mark_resident_current_bank(ctx, bank);
mark_resident_host_state_clean(ctx, bank, state_index, false);
update_state_ready(ctx);
} else {
ctx.resident_host_clean[bank][state_index] = 1;
mark_resident_host_state_clean(ctx, bank, state_index, true);
}
}
@@ -7024,7 +7201,8 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all,
kind == cudaMemcpyHostToDevice);
kind == cudaMemcpyHostToDevice,
state_count);
double *base_mem = ctx.d_resident_mem[bank];
const int region_all = sx * sy * sz;
const size_t total_doubles = (size_t)state_count * (size_t)region_all;
@@ -7057,6 +7235,7 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
ctx.resident_valid[bank] = true;
ctx.resident_age[bank] = ++ctx.resident_clock;
mark_resident_current_bank(ctx, bank);
mark_resident_host_subset_clean(ctx, bank, state_count, nullptr, false);
update_state_ready(ctx);
}
}
@@ -7067,10 +7246,15 @@ static void download_resident_state_count(void *block_tag, int *ex, double **sta
const size_t bytes = all * sizeof(double);
StepContext &ctx = ensure_step_ctx(block_tag, all);
int bank = find_resident_bank_count(ctx, state_host_out, state_count);
bool bank_matches_output_key = (bank >= 0);
if (bank < 0) {
bank = (ctx.current_bank >= 0) ? ctx.current_bank : active_or_keyed_bank(ctx, nullptr, all, false);
}
mark_resident_current_bank(ctx, bank);
if (!bank_matches_output_key &&
resident_key_usable_count(state_host_out, state_count)) {
assign_resident_key_count(ctx, bank, state_host_out, state_count);
}
const bool profile = cuda_profile_enabled();
const double t0 = profile ? cuda_profile_now_ms() : 0.0;
static int direct_download = -1;
@@ -7117,6 +7301,72 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
download_resident_state_count(block_tag, ex, state_host_out, BSSN_STATE_COUNT);
}
// Upload a full host state set into its resident bank and make that bank
// current.
//
// `ex` supplies the three grid extents; their product is the per-state
// element count. `state_count` selects the BSSN or the EScalar path; any
// other value returns without uploading (the step context has already been
// looked up or created by then). The bank is obtained with
// upload_if_missing == false and the upload is issued explicitly, so the
// copy always happens. After cudaDeviceSynchronize() the bank is flagged
// valid and host-clean and its age is refreshed.
static void upload_resident_state_count(void *block_tag, int *ex, double **state_host_in, int state_count)
{
  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  StepContext &ctx = ensure_step_ctx(block_tag, all);

  const bool escalar = (state_count == BSSN_ESCALAR_STATE_COUNT);
  if (!escalar && state_count != BSSN_STATE_COUNT)
    return;

  int bank = -1;
  if (escalar) {
    bank = ensure_escalar_resident_bank(ctx, state_host_in, all, false);
    bind_escalar_state_input_slots(ctx.d_resident[bank]);
    upload_escalar_state_inputs(state_host_in, all);
  } else {
    bank = ensure_resident_bank(ctx, state_host_in, all, false);
    bind_state_input_slots(ctx.d_resident[bank]);
    upload_state_inputs(state_host_in, all);
  }

  CUDA_CHECK(cudaDeviceSynchronize());

  ctx.resident_age[bank] = ++ctx.resident_clock;
  ctx.resident_valid[bank] = true;
  set_resident_host_clean(ctx, bank, true);
  mark_resident_current_bank(ctx, bank);
  update_state_ready(ctx);
}
// Drop the bookkeeping of every resident bank except the one keyed by
// `state_host_key`.
//
// Does nothing when `state_count` is outside (0, BSSN_RESIDENT_STATE_CAPACITY],
// when `block_tag` has no step context, or when no *valid* bank matches the
// key. Otherwise all banks are reset (valid flag, key, clean flags, age), the
// current/next state pointers and the resident clock are cleared,
// matter_ready is set to false, and the kept bank is re-registered with its
// key and previous host-clean flags, then made current.
// `ex` is unused.
static void keep_only_resident_state_count(void *block_tag,
                                           int *ex,
                                           double **state_host_key,
                                           int state_count)
{
  (void)ex;
  if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
    return;

  auto entry = g_step_ctx.find(block_tag);
  if (entry == g_step_ctx.end())
    return;
  StepContext &ctx = entry->second;

  const int survivor = find_resident_bank_count(ctx, state_host_key, state_count);
  if (survivor < 0)
    return;
  if (!ctx.resident_valid[survivor])
    return;

  // Snapshot the survivor's clean flags; the wipe below zeroes them.
  auto saved_clean = ctx.resident_host_clean[survivor];

  for (int bank = 0; bank < BSSN_RESIDENT_BANK_COUNT; ++bank) {
    ctx.resident_age[bank] = 0;
    ctx.resident_host_clean[bank].fill(0);
    ctx.resident_host[bank].fill(nullptr);
    ctx.resident_valid[bank] = false;
  }

  ctx.current_bank = -1;
  ctx.resident_clock = 0;
  ctx.matter_ready = false;
  ctx.d_state_curr_mem = nullptr;
  ctx.d_state_curr.fill(nullptr);
  ctx.d_state_next_mem = nullptr;
  ctx.d_state_next.fill(nullptr);

  // Re-register the survivor under the caller's key.
  for (int s = 0; s < state_count; ++s) {
    ctx.resident_host[survivor][s] = state_host_key[s];
    ctx.resident_host_clean[survivor][s] = saved_clean[s] ? 1 : 0;
  }
  ctx.resident_valid[survivor] = true;
  ctx.resident_age[survivor] = ++ctx.resident_clock;
  mark_resident_current_bank(ctx, survivor);
  update_state_ready(ctx);
}
static bool download_resident_state_count_if_present(void *block_tag,
int *ex,
double **state_host_out,
@@ -7183,8 +7433,27 @@ static void copy_state_subset(void *block_tag,
const size_t bytes = all * sizeof(double);
StepContext &ctx = ensure_step_ctx(block_tag, all);
double **full_key = (subset_count == BSSN_RESIDENT_STATE_CAPACITY) ? state_host : nullptr;
const int bank = active_or_keyed_bank(ctx, full_key, all,
kind == cudaMemcpyHostToDevice);
int bank = -1;
if (state_host) {
if (full_key) {
bank = (subset_count == BSSN_ESCALAR_STATE_COUNT)
? find_resident_bank_count(ctx, full_key, BSSN_ESCALAR_STATE_COUNT)
: find_resident_bank(ctx, full_key);
} else {
bank = find_resident_bank_subset(ctx, state_host, state_indices, subset_count);
}
if (kind == cudaMemcpyDeviceToHost &&
(bank < 0 || !ctx.resident_valid[bank])) {
bank = -1;
}
}
if (bank < 0) {
bank = active_or_keyed_bank(ctx, full_key, all,
kind == cudaMemcpyHostToDevice,
subset_count);
} else {
mark_resident_current_bank(ctx, bank);
}
double *base_mem = ctx.d_resident_mem[bank];
int active_state_indices[BSSN_RESIDENT_STATE_CAPACITY];
double *active_state_host[BSSN_RESIDENT_STATE_CAPACITY];
@@ -7834,6 +8103,52 @@ int bssn_escalar_cuda_download_resident_state(void *block_tag,
return 0;
}
// C entry point: upload a full host state set (`state_count` host arrays of
// ex[0]*ex[1]*ex[2] doubles each) into the resident bank keyed by
// `state_host_in`.
//
// Returns 0 on success and 1 when the arguments are rejected:
//   - `state_count` is neither BSSN_STATE_COUNT nor BSSN_ESCALAR_STATE_COUNT;
//   - `ex` or `state_host_in` is null. The worker dereferences `ex`
//     unconditionally and hands `state_host_in` to the upload routine, so
//     null pointers are refused here instead of being passed through.
extern "C"
int bssn_cuda_upload_resident_state_count(void *block_tag,
                                          int *ex,
                                          double **state_host_in,
                                          int state_count)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT)
    return 1;
  if (!ex || !state_host_in)
    return 1;
  upload_resident_state_count(block_tag, ex, state_host_in, state_count);
  return 0;
}
// C entry point: EScalar convenience wrapper that forwards to
// bssn_cuda_upload_resident_state_count with BSSN_ESCALAR_STATE_COUNT and
// returns its status unchanged.
extern "C"
int bssn_escalar_cuda_upload_resident_state(void *block_tag,
                                            int *ex,
                                            double **state_host_in)
{
  const int status = bssn_cuda_upload_resident_state_count(
      block_tag, ex, state_host_in, BSSN_ESCALAR_STATE_COUNT);
  return status;
}
// C entry point: keep only the resident bank keyed by `state_host_key` for
// this block and discard the bookkeeping of the others.
// Returns 1 when `state_count` is neither BSSN_STATE_COUNT nor
// BSSN_ESCALAR_STATE_COUNT, and 0 otherwise (including when the worker finds
// nothing to keep).
extern "C"
int bssn_cuda_keep_only_resident_state_count(void *block_tag,
                                             int *ex,
                                             double **state_host_key,
                                             int state_count)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const bool supported_count = (state_count == BSSN_STATE_COUNT) ||
                               (state_count == BSSN_ESCALAR_STATE_COUNT);
  if (!supported_count)
    return 1;

  keep_only_resident_state_count(block_tag, ex, state_host_key, state_count);
  return 0;
}
// C entry point: EScalar convenience wrapper that forwards to
// bssn_cuda_keep_only_resident_state_count with BSSN_ESCALAR_STATE_COUNT and
// returns its status unchanged.
extern "C"
int bssn_escalar_cuda_keep_only_resident_state(void *block_tag,
                                               int *ex,
                                               double **state_host_key)
{
  const int status = bssn_cuda_keep_only_resident_state_count(
      block_tag, ex, state_host_key, BSSN_ESCALAR_STATE_COUNT);
  return status;
}
extern "C"
int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
int *ex,
@@ -8032,6 +8347,28 @@ int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
return 0;
}
// C entry point: copy one packed host region (sx * sy * sz doubles in
// `host_buffer`) into state `state_index` of the resident bank keyed by
// `state_host_key`, at offset (i0, j0, k0) of a grid with extents `ex`.
//
// Returns 0 once the copy has been dispatched and 1 when the arguments are
// rejected:
//   - `state_host_key` is null;
//   - `state_count` is neither BSSN_STATE_COUNT nor BSSN_ESCALAR_STATE_COUNT;
//   - `host_buffer` or `ex` is null. The copy routine reads `ex[0..2]`
//     unconditionally and uses `host_buffer` as the transfer source, so null
//     pointers are refused here.
// Out-of-range `state_index` or non-positive region sizes are still ignored
// silently by the copy routine (status 0), as before.
extern "C"
int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  int state_index,
                                                                  double *host_buffer,
                                                                  int *ex,
                                                                  int i0, int j0, int k0,
                                                                  int sx, int sy, int sz)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  if (!state_host_key ||
      (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT))
    return 1;
  if (!host_buffer || !ex)
    return 1;
  copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
                                i0, j0, k0, sx, sy, sz,
                                cudaMemcpyHostToDevice,
                                state_host_key, state_count);
  return 0;
}
extern "C"
int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
int state_count,
@@ -8115,7 +8452,8 @@ static void copy_state_device_batch(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all,
pack_not_unpack == 0 || state_host_key != nullptr);
pack_not_unpack == 0 || state_host_key != nullptr,
state_count);
double *base_mem = ctx.d_resident_mem[bank];
const int region_all = sx * sy * sz;
dim3 launch_grid((unsigned int)grid((size_t)region_all),
@@ -8164,7 +8502,8 @@ static void copy_state_device_segments(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all,
pack_not_unpack == 0 || state_host_key != nullptr);
pack_not_unpack == 0 || state_host_key != nullptr,
state_count);
double *base_mem = ctx.d_resident_mem[bank];
int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8);
CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
@@ -8187,6 +8526,7 @@ static void copy_state_device_segments(void *block_tag,
ctx.resident_valid[bank] = true;
ctx.resident_age[bank] = ++ctx.resident_clock;
mark_resident_current_bank(ctx, bank);
set_resident_host_clean(ctx, bank, false);
update_state_ready(ctx);
}
}
@@ -8214,7 +8554,8 @@ static void restrict_state_device_segments(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all,
state_host_key != nullptr);
state_host_key != nullptr,
state_count);
int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8);
CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
(size_t)segment_count * 8 * sizeof(int),
@@ -8253,7 +8594,8 @@ static void prolong_state_device_segments(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all,
state_host_key != nullptr);
state_host_key != nullptr,
state_count);
int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 11);
CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
(size_t)segment_count * 11 * sizeof(int),
@@ -8498,7 +8840,7 @@ int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_t
if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, true);
const int bank = active_or_keyed_bank(ctx, state_host_key, all, true, state_count);
const int region_all = sx * sy * sz;
upload_comm_state_soa(state_soa, state_count);
dim3 launch_grid((unsigned int)grid((size_t)region_all),
@@ -8555,7 +8897,7 @@ int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_ta
if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, true);
const int bank = active_or_keyed_bank(ctx, state_host_key, all, true, state_count);
const int region_all = sx * sy * sz;
upload_comm_state_soa(state_soa, state_count);
dim3 launch_grid((unsigned int)grid((size_t)region_all),
@@ -8650,7 +8992,8 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag,
src1_bank = ensure_escalar_resident_bank(ctx, src1_host_key, all, true);
src2_bank = ensure_escalar_resident_bank(ctx, src2_host_key, all, true, src1_bank);
src3_bank = (source_count == 3)
? ensure_escalar_resident_bank(ctx, src3_host_key, all, true, src1_bank)
? ensure_escalar_resident_bank_avoiding(ctx, src3_host_key, all, true,
src1_bank, src2_bank, -1)
: -1;
dst_bank = reserve_escalar_resident_output_bank_avoiding(ctx, dst_host_key, all,
src1_bank, src2_bank, src3_bank);
@@ -8658,7 +9001,8 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag,
src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true);
src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank);
src3_bank = (source_count == 3)
? ensure_resident_bank(ctx, src3_host_key, all, true, src1_bank)
? ensure_resident_bank_avoiding(ctx, src3_host_key, all, true,
src1_bank, src2_bank, -1)
: -1;
dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all,
src1_bank, src2_bank, src3_bank);