Add full FD order support (2nd/4th/6th/8th) to C derivative kernels via ghost_width dispatch

Wrap each C kernel in #if (ghost_width == N) blocks matching Fortran stencil
coefficients from diff_new.f90, kodiss.f90, and lopsidediff.f90. Add fast-path
indexing for ord=1,4,5 in share_func.h.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-14 02:27:21 +08:00
parent 5d8dfaf679
commit c6ecc11d7e
6 changed files with 2612 additions and 877 deletions

View File

@@ -1,16 +1,16 @@
#include "macrodef.h"
#include "tool.h"
/*
* C 版 kodis
* C 版 kodis — Kreiss-Oliger numerical dissipation (Cartesian patches).
*
* Fortran signature:
* subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
* The KO operator is (D₊D₋)^r applied to f_rhs with alternating sign (-1)^(r-1).
*
* 约定:
* X: ex1, Y: ex2, Z: ex3
* f, f_rhs: ex1*ex2*ex3 按 idx_ex 布局
* SoA[3]
* eps: double
* FD order → r → cof=2^(2r) mapping:
* ghost_width=2 (2nd) → r=2, cof=16, sign=-
* ghost_width=3 (4th) → r=3, cof=64, sign=+
* ghost_width=4 (6th) → r=4, cof=256, sign=-
* ghost_width=5 (8th) → r=5, cof=1024,sign=+
*/
void kodis(const int ex[3],
const double *X, const double *Y, const double *Z,
@@ -18,100 +18,304 @@ void kodis(const int ex[3],
const double SoA[3],
int Symmetry, double eps)
{
const double ONE = 1.0, SIX = 6.0, FIT = 15.0, TWT = 20.0;
const double cof = 64.0; // 2^6
const int NO_SYMM = 0, OCTANT = 2;
const double ZEO = 0.0;
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
// Fortran: dX = X(2)-X(1) -> C: X[1]-X[0]
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
(void)ONE; // ONE 在原 Fortran 里只是参数,这里不一定用得上
// Fortran: imax=ex(1) 等是 1-based 上界
const int imaxF = ex1;
const int jmaxF = ex2;
const int kmaxF = ex3;
const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
// Fortran: imin=jmin=kmin=1某些对称情况变 -2
int iminF = 1, jminF = 1, kminF = 1;
#if (ghost_width == 2)
/* ---- r=2, cof=16, sign=-, 5pt stencil ----------------------------- */
{
const int ord = 2;
const int r = 2;
const double cof = 16.0;
const double F4 = 4.0, F6 = 6.0;
const int NO_SYMM = 0, EQ_SYMM = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
// 分配 fh大小 (ex1+3)*(ex2+3)*(ex3+3),对应 ord=3
const size_t nx = (size_t)ex1 + 3;
const size_t ny = (size_t)ex2 + 3;
const size_t nz = (size_t)ex3 + 3;
const size_t fh_size = nx * ny * nz;
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
// Fortran: call symmetry_bd(3,ex,f,fh,SoA)
symmetry_bd(3, ex, f, fh, SoA);
symmetry_bd(ord, ex, f, fh, SoA);
/*
* Fortran loops:
* do k=1,ex3
* do j=1,ex2
* do i=1,ex1
*
* C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
* 并定义 Fortran index: iF=i0+1, ...
*/
// 收紧循环范围:只遍历满足 iF±3/jF±3/kF±3 条件的内部点
// iF-3 >= iminF => iF >= iminF+3 => i0 >= iminF+2 (因为 iF=i0+1)
// iF+3 <= imaxF => iF <= imaxF-3 => i0 <= imaxF-4
const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
const int i0_hi = imaxF - 4; // inclusive
const int j0_hi = jmaxF - 4;
const int k0_hi = kmaxF - 4;
/* i±2 must be valid: i-2 >= iminF && i+2 <= imaxF
C 0-based: i0 >= iminF+1, i0 <= ex1-3 */
const int i0_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
const int j0_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
const int k0_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
const int i0_hi = imaxF - 3;
const int j0_hi = jmaxF - 3;
const int k0_hi = kmaxF - 3;
if (i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi) {
if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
const double Dx = (
(fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)]) -
F4 * (fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]) +
F6 * fh[idx_fh_F_ord2(iF, jF, kF, ex)]
) / dX;
const double Dy = (
(fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)]) -
F4 * (fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]) +
F6 * fh[idx_fh_F_ord2(iF, jF, kF, ex)]
) / dY;
const double Dz = (
(fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]) -
F4 * (fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]) +
F6 * fh[idx_fh_F_ord2(iF, jF, kF, ex)]
) / dZ;
f_rhs[p] -= (eps / cof) * (Dx + Dy + Dz); /* sign=- */
}
}
}
}
free(fh);
return;
}
#elif (ghost_width == 3)
/* ---- r=3, cof=64, sign=+, 7pt stencil (current default) ---------- */
{
const int ord = 3;
const int r = 3;
const double cof = 64.0;
const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
const int NO_SYMM = 0, OCTANT = 2;
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
const int iF = i0 + 1;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
const size_t p = idx_ex(i0, j0, k0, ex);
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
// 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核)
const double Dx_term =
( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
TWT * fh[idx_fh_F(iF , jF, kF, ex)] ) / dX;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
const double Dy_term =
( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
TWT * fh[idx_fh_F(iF, jF , kF, ex)] ) / dY;
symmetry_bd(ord, ex, f, fh, SoA);
const double Dz_term =
( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
TWT * fh[idx_fh_F(iF, jF, kF , ex)] ) / dZ;
const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
const int i0_hi = imaxF - 4;
const int j0_hi = jmaxF - 4;
const int k0_hi = kmaxF - 4;
// Fortran:
// f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
const int kF = k0 + 2;
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
const int jF = j0 + 2;
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
const int iF = i0 + 2;
const size_t p = idx_ex(i0, j0, k0, ex);
const double Dx = (
(fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
TWT * fh[idx_fh_F(iF, jF, kF, ex)]
) / dX;
const double Dy = (
(fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
TWT * fh[idx_fh_F(iF, jF, kF, ex)]
) / dY;
const double Dz = (
(fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
TWT * fh[idx_fh_F(iF, jF, kF, ex)]
) / dZ;
f_rhs[p] += (eps / cof) * (Dx + Dy + Dz); /* sign=+ */
}
}
}
}
free(fh);
return;
}
#elif (ghost_width == 4)
/* ---- r=4, cof=256, sign=-, 9pt stencil ---------------------------- */
{
const int ord = 4;
const int r = 4;
const double cof = 256.0;
const double F8 = 8.0, F28 = 28.0, F56 = 56.0, F70 = 70.0;
const int NO_SYMM = 0, EQ_SYMM = 1;
free(fh);
}
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3;
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
/* i±4 valid: i-4>=iminF → i0>=iminF+3, i+4<=imaxF → i0<=ex1-5 */
const int i0_lo = (iminF + 3 > 0) ? iminF + 3 : 0;
const int j0_lo = (jminF + 3 > 0) ? jminF + 3 : 0;
const int k0_lo = (kminF + 3 > 0) ? kminF + 3 : 0;
const int i0_hi = imaxF - 5;
const int j0_hi = jmaxF - 5;
const int k0_hi = kmaxF - 5;
if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
const int kF = k0 + 3;
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
const int jF = j0 + 3;
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
const int iF = i0 + 3;
const size_t p = idx_ex(i0, j0, k0, ex);
/* Stencil: [1,-8,28,-56,70,-56,28,-8,1] */
const double Dx = (
(fh[idx_fh_F_ord4(iF - 4, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 4, jF, kF, ex)]) -
F8 * (fh[idx_fh_F_ord4(iF - 3, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 3, jF, kF, ex)]) +
F28* (fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)]) -
F56* (fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)]) +
F70* fh[idx_fh_F_ord4(iF, jF, kF, ex)]
) / dX;
const double Dy = (
(fh[idx_fh_F_ord4(iF, jF - 4, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 4, kF, ex)]) -
F8 * (fh[idx_fh_F_ord4(iF, jF - 3, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 3, kF, ex)]) +
F28* (fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)]) -
F56* (fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)]) +
F70* fh[idx_fh_F_ord4(iF, jF, kF, ex)]
) / dY;
const double Dz = (
(fh[idx_fh_F_ord4(iF, jF, kF - 4, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 4, ex)]) -
F8 * (fh[idx_fh_F_ord4(iF, jF, kF - 3, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 3, ex)]) +
F28* (fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)]) -
F56* (fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)]) +
F70* fh[idx_fh_F_ord4(iF, jF, kF, ex)]
) / dZ;
f_rhs[p] -= (eps / cof) * (Dx + Dy + Dz); /* sign=- */
}
}
}
}
free(fh);
return;
}
#elif (ghost_width == 5)
/* ---- r=5, cof=1024, sign=+, 11pt stencil ------------------------- */
{
const int ord = 5;
const int r = 5;
const double cof = 1024.0;
const double F10 = 10.0, F45 = 45.0, F120 = 120.0;
const double F210 = 210.0, F252 = 252.0;
const int NO_SYMM = 0, EQ_SYMM = 1;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -4;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -4;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -4;
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
/* i±5 valid: i0>=iminF+4, i0<=ex1-6 */
const int i0_lo = (iminF + 4 > 0) ? iminF + 4 : 0;
const int j0_lo = (jminF + 4 > 0) ? jminF + 4 : 0;
const int k0_lo = (kminF + 4 > 0) ? kminF + 4 : 0;
const int i0_hi = imaxF - 6;
const int j0_hi = jmaxF - 6;
const int k0_hi = kmaxF - 6;
if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
const int kF = k0 + 4;
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
const int jF = j0 + 4;
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
const int iF = i0 + 4;
const size_t p = idx_ex(i0, j0, k0, ex);
/* Stencil: [1,-10,45,-120,210,-252,210,-120,45,-10,1] */
const double Dx = (
(fh[idx_fh_F_ord5(iF - 5, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 5, jF, kF, ex)]) -
F10 * (fh[idx_fh_F_ord5(iF - 4, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 4, jF, kF, ex)]) +
F45 * (fh[idx_fh_F_ord5(iF - 3, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 3, jF, kF, ex)]) -
F120* (fh[idx_fh_F_ord5(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 2, jF, kF, ex)]) +
F210* (fh[idx_fh_F_ord5(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 1, jF, kF, ex)]) -
F252* fh[idx_fh_F_ord5(iF, jF, kF, ex)]
) / dX;
const double Dy = (
(fh[idx_fh_F_ord5(iF, jF - 5, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 5, kF, ex)]) -
F10 * (fh[idx_fh_F_ord5(iF, jF - 4, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 4, kF, ex)]) +
F45 * (fh[idx_fh_F_ord5(iF, jF - 3, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 3, kF, ex)]) -
F120* (fh[idx_fh_F_ord5(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 2, kF, ex)]) +
F210* (fh[idx_fh_F_ord5(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 1, kF, ex)]) -
F252* fh[idx_fh_F_ord5(iF, jF, kF, ex)]
) / dY;
const double Dz = (
(fh[idx_fh_F_ord5(iF, jF, kF - 5, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 5, ex)]) -
F10 * (fh[idx_fh_F_ord5(iF, jF, kF - 4, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 4, ex)]) +
F45 * (fh[idx_fh_F_ord5(iF, jF, kF - 3, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 3, ex)]) -
F120* (fh[idx_fh_F_ord5(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 2, ex)]) +
F210* (fh[idx_fh_F_ord5(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 1, ex)]) -
F252* fh[idx_fh_F_ord5(iF, jF, kF, ex)]
) / dZ;
f_rhs[p] += (eps / cof) * (Dx + Dy + Dz); /* sign=+ */
}
}
}
}
free(fh);
return;
}
#else
#error "kodiss_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)"
#endif
}