Files
AMSS-NCKU/AMSS_NCKU_source/TwoPunctures.C
ianchb 914c4f4791 Optimize memory allocation in JFD_times_dv
This should reduce the pressure on the memory allocator, indirectly improving caching behavior.

Co-authored-by: copilot-swe-agent[bot] <198982749+copilot@users.noreply.github.com>
2026-02-07 15:55:45 +08:00

3202 lines
107 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#ifdef newc
#include <assert.h>
#include <ctype.h>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <strstream>
#include <cmath>
#include <cstdio>
#include <complex>
using namespace std;
#else
#include <assert.h>
#include <ctype.h>
#include <iostream.h>
#include <iomanip.h>
#include <fstream.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <complex.h>
#endif
#include "TwoPunctures.h"
#include <mkl_cblas.h>
TwoPunctures::TwoPunctures(double mp, double mm, double b,
double P_plusx, double P_plusy, double P_plusz,
double S_plusx, double S_plusy, double S_plusz,
double P_minusx, double P_minusy, double P_minusz,
double S_minusx, double S_minusy, double S_minusz,
int nA, int nB, int nphi,
double Mp, double Mm, double admtol, double Newtontol,
int Newtonmaxit) : par_m_plus(mp), par_m_minus(mm), par_b(b), npoints_A(nA),
npoints_B(nB), npoints_phi(nphi), target_M_plus(Mp), target_M_minus(Mm),
adm_tol(admtol), Newton_tol(Newtontol), Newton_maxit(Newtonmaxit)
{
par_P_plus[0] = P_plusx;
par_P_plus[1] = P_plusy;
par_P_plus[2] = P_plusz;
par_P_minus[0] = P_minusx;
par_P_minus[1] = P_minusy;
par_P_minus[2] = P_minusz;
par_S_plus[0] = S_plusx;
par_S_plus[1] = S_plusy;
par_S_plus[2] = S_plusz;
par_S_minus[0] = S_minusx;
par_S_minus[1] = S_minusy;
par_S_minus[2] = S_minusz;
int const nvar = 1, n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi;
ntotal = n1 * n2 * n3 * nvar;
F = dvector(0, ntotal - 1);
allocate_derivs(&u, ntotal);
allocate_derivs(&v, ntotal);
D1_A = NULL; D2_A = NULL; D1_B = NULL; D2_B = NULL;
DF1_phi = NULL; DF2_phi = NULL;
precompute_derivative_matrices();
allocate_workspace();
}
TwoPunctures::~TwoPunctures()
{
free_dvector(F, 0, ntotal - 1);
free_derivs(&u, ntotal);
free_derivs(&v, ntotal);
free_workspace();
if (D1_A) delete[] D1_A;
if (D2_A) delete[] D2_A;
if (D1_B) delete[] D1_B;
if (D2_B) delete[] D2_B;
if (DF1_phi) delete[] DF1_phi;
if (DF2_phi) delete[] DF2_phi;
}
void TwoPunctures::Solve()
{
double mp = par_m_plus;
double mm = par_m_minus;
enum GRID_SETUP_METHOD
{
GSM_Taylor_expansion,
GSM_evaluation
};
enum GRID_SETUP_METHOD gsm;
int antisymmetric_lapse, averaged_lapse, pmn_lapse, brownsville_lapse;
int const nvar = 1, n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi;
int imin[3], imax[3];
int const ntotal = n1 * n2 * n3 * nvar;
// double admMass;
/* initialise to 0 */
for (int j = 0; j < ntotal; j++)
{
v.d0[j] = 0.0;
v.d1[j] = 0.0;
v.d2[j] = 0.0;
v.d3[j] = 0.0;
v.d11[j] = 0.0;
v.d12[j] = 0.0;
v.d13[j] = 0.0;
v.d22[j] = 0.0;
v.d23[j] = 0.0;
v.d33[j] = 0.0;
}
double tmp, Mp_adm, Mm_adm, Mp_adm_err, Mm_adm_err, up, um;
double M_p = target_M_plus;
double M_m = target_M_minus;
/* If bare masses are not given, iteratively solve for them given the
target ADM masses target_M_plus and target_M_minus and with initial
guesses given by par_m_plus and par_m_minus. */
if (par_m_plus < 0 || par_m_minus < 0)
{
par_m_plus = target_M_plus;
par_m_minus = target_M_minus;
cout << "Attempting to find bare masses." << endl;
cout << "Target ADM masses: M_p=" << M_p << " and M_m=" << M_m << endl;
cout << "ADM mass tolerance: " << adm_tol << endl;
/* Loop until both ADM masses are within adm_tol of their target */
do
{
cout << "Bare masses: mp=" << mp << ", mm=" << mm << endl;
Newton(nvar, n1, n2, n3, v, Newton_tol, 1);
F_of_v(nvar, n1, n2, n3, v, F, u);
up = PunctIntPolAtArbitPosition(0, nvar, n1, n2, n3, v, par_b, 0., 0.);
um = PunctIntPolAtArbitPosition(0, nvar, n1, n2, n3, v, -par_b, 0., 0.);
/* Calculate the ADM masses from the current bare mass guess PRD 70, 064011 (2004) Eq.(83)*/
Mp_adm = (1 + up) * mp + mp * mm / (4. * par_b);
Mm_adm = (1 + um) * mm + mp * mm / (4. * par_b);
/* Check how far the current ADM masses are from the target */
Mp_adm_err = fabs(M_p - Mp_adm);
Mm_adm_err = fabs(M_m - Mm_adm);
cout << "ADM mass error: M_p_err=" << Mp_adm_err << ", M_m_err=" << Mm_adm_err << endl;
/* Invert the ADM mass equation and update the bare mass guess so that
it gives the correct target ADM masses */
tmp = -4 * par_b * (1 + um + up + um * up) +
sqrt(16 * par_b * M_m * (1 + um) * (1 + up) +
pow(-M_m + M_p + 4 * par_b * (1 + um) * (1 + up), 2));
par_m_plus = mp = (tmp + M_p - M_m) / (2. * (1 + up));
par_m_minus = mm = (tmp - M_p + M_m) / (2. * (1 + um));
} while ((Mp_adm_err > adm_tol) ||
(Mm_adm_err > adm_tol));
cout << "Found bare masses resulted Mp = " << Mp_adm << " and Mm = " << Mm_adm << endl;
}
Newton(nvar, n1, n2, n3, v, Newton_tol, Newton_maxit);
F_of_v(nvar, n1, n2, n3, v, F, u);
up = PunctIntPolAtArbitPosition(0, nvar, n1, n2, n3, v, par_b, 0., 0.);
um = PunctIntPolAtArbitPosition(0, nvar, n1, n2, n3, v, -par_b, 0., 0.);
/* Calculate the ADM masses from the current bare mass guess PRD 70, 064011 (2004) Eq.(83)*/
Mp_adm = (1 + up) * mp + mp * mm / (4. * par_b);
Mm_adm = (1 + um) * mm + mp * mm / (4. * par_b);
cout << "The two puncture masses are mp = " << mp << " and mm = " << mm << endl;
cout << " resulted Mp = " << Mp_adm << " and Mm = " << Mm_adm << endl;
/* print out ADM mass, eq.: \Delta M_ADM=2*r*u=4*b*V for A=1,B=0,phi=0 PRD 70, 064011 (2004) Eq.(81)*/
admMass = (mp + mm - 4 * par_b * PunctEvalAtArbitPosition(v.d0, 0, 1, 0, 0, nvar, n1, n2, n3));
cout << "The total ADM mass is " << admMass << endl;
target_M_plus = Mp_adm;
target_M_minus = Mm_adm;
}
void TwoPunctures::Save(char *fname)
{
ofstream outfile;
outfile.open(fname, ios::trunc);
time_t tnow;
time(&tnow);
struct tm *loc_time;
loc_time = localtime(&tnow);
outfile << "#File created on " << asctime(loc_time);
outfile << "#Newton_tol = " << Newton_tol << endl;
outfile << "#Mp = " << target_M_plus << endl;
outfile << "#Mm = " << target_M_minus << endl;
double D = 2 * par_b, x1, x2;
x1 = D * target_M_minus / (target_M_plus + target_M_minus);
x2 = -D * target_M_plus / (target_M_plus + target_M_minus);
// in order to relate Brugmann's convention, rotate xy
outfile << "bhmass1 = " << par_m_plus << endl;
outfile << "bhx1 = " << 0 << endl;
outfile << "bhy1 = " << x1 << endl;
outfile << "bhz1 = " << 0 << endl;
outfile << "bhpx1 = " << -par_P_plus[1] << endl;
outfile << "bhpy1 = " << par_P_plus[0] << endl;
outfile << "bhpz1 = " << par_P_plus[2] << endl;
outfile << "bhsx1 = " << -par_S_plus[1] << endl;
outfile << "bhsy1 = " << par_S_plus[0] << endl;
outfile << "bhsz1 = " << par_S_plus[2] << endl;
outfile << "bhmass2 = " << par_m_minus << endl;
outfile << "bhx2 = " << 0 << endl;
outfile << "bhy2 = " << x2 << endl;
outfile << "bhz2 = " << 0 << endl;
outfile << "bhpx2 = " << -par_P_minus[1] << endl;
outfile << "bhpy2 = " << par_P_minus[0] << endl;
outfile << "bhpz2 = " << par_P_minus[2] << endl;
outfile << "bhsx2 = " << -par_S_minus[1] << endl;
outfile << "bhsy2 = " << par_S_minus[0] << endl;
outfile << "bhsz2 = " << par_S_minus[2] << endl;
int const n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi;
outfile << "data " << n1 << " " << n2 << " " << n3 << endl;
int ntotal = n1 * n2 * n3;
outfile.setf(ios::scientific, ios::floatfield);
outfile.precision(16);
for (int i = 0; i < ntotal; i++)
outfile << v.d0[i] << endl;
outfile.close();
// add output to facilitate python reading of puncture data, by Xiaoqu 2024/12/04
ofstream outfile2;
outfile2.open("puncture_parameters_new.txt", ios::trunc);
// note that in this program the xy plane has been rotated
outfile2 << setw(18) << setprecision(10) << par_m_plus
<< setw(18) << setprecision(10) << target_M_plus
<< setw(18) << setprecision(10) << admMass << " # bare mass 1 mass 1 ADM mass" << endl;
outfile2 << setw(18) << setprecision(10) << 0.0
<< setw(18) << setprecision(10) << x1
<< setw(18) << setprecision(10) << 0.0 << " # position 1" << endl;
outfile2 << setw(18) << setprecision(10) << -par_P_plus[1]
<< setw(18) << setprecision(10) << par_P_plus[0]
<< setw(18) << setprecision(10) << par_P_plus[2] << " # momentum 1" << endl;
outfile2 << setw(18) << setprecision(10) << -par_S_plus[1]
<< setw(18) << setprecision(10) << par_S_plus[0]
<< setw(18) << setprecision(10) << par_S_plus[2] << " # angular mumentum 1" << endl;
outfile2 << setw(18) << setprecision(10) << par_m_minus
<< setw(18) << setprecision(10) << target_M_minus
<< setw(18) << setprecision(10) << admMass << " # bare mass 2 mass 2 ADM mass" << endl;
outfile2 << setw(18) << setprecision(10) << 0.0
<< setw(18) << setprecision(10) << x2
<< setw(18) << setprecision(10) << 0.0 << " # position 2" << endl;
outfile2 << setw(18) << setprecision(10) << -par_P_minus[1]
<< setw(18) << setprecision(10) << par_P_minus[0]
<< setw(18) << setprecision(10) << par_P_minus[2] << " # momentum 2" << endl;
outfile2 << setw(18) << setprecision(10) << -par_S_minus[1]
<< setw(18) << setprecision(10) << par_S_minus[0]
<< setw(18) << setprecision(10) << par_S_minus[2] << " # angular mumentum 2" << endl;
outfile2.close();
}
void TwoPunctures::set_initial_guess(derivs v)
{
int nvar = 1, n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi;
double *s_x, *s_y, *s_z; // Cartesian x,y,z
double al, A, Am1, be, B, phi, R, r, X;
int ivar, i, j, k, i3D, indx;
derivs U;
FILE *debug_file;
s_x = (double *)calloc(n1 * n2 * n3, sizeof(double));
s_y = (double *)calloc(n1 * n2 * n3, sizeof(double));
s_z = (double *)calloc(n1 * n2 * n3, sizeof(double));
allocate_derivs(&U, nvar);
for (ivar = 0; ivar < nvar; ivar++)
for (i = 0; i < n1; i++)
for (j = 0; j < n2; j++)
for (k = 0; k < n3; k++)
{
i3D = Index(ivar, i, j, k, 1, n1, n2, n3);
al = Pih * (2 * i + 1) / n1;
A = -cos(al);
be = Pih * (2 * j + 1) / n2;
B = -cos(be);
phi = 2. * Pi * k / n3;
/* Calculation of (X,R)*/
AB_To_XR(nvar, A, B, &X, &R, U);
/* Calculation of (x,r)*/
C_To_c(nvar, X, R, &(s_x[i3D]), &r, U);
/* Calculation of (y,z)*/
rx3_To_xyz(nvar, s_x[i3D], r, phi, &(s_y[i3D]), &(s_z[i3D]), U);
}
// Set_Initial_Guess_for_u(n1*n2*n3, v.d0, s_x, s_y, s_z); //extern fortran code to set initial guess
for (ivar = 0; ivar < nvar; ivar++)
for (i = 0; i < n1; i++)
for (j = 0; j < n2; j++)
for (k = 0; k < n3; k++)
{
indx = Index(ivar, i, j, k, 1, n1, n2, n3);
v.d0[indx] = 0; // set initial guess 0
v.d0[indx] /= (-cos(Pih * (2 * i + 1) / n1) - 1.0); // PRD 70, 064011 (2004) Eq.(5), from u to U
}
Derivatives_AB3_MatMul(nvar, n1, n2, n3, v);
if (0)
{
debug_file = fopen("initial.dat", "w");
assert(debug_file);
for (ivar = 0; ivar < nvar; ivar++)
for (i = 0; i < n1; i++)
for (j = 0; j < n2; j++)
{
al = Pih * (2 * i + 1) / n1;
A = -cos(al);
Am1 = A - 1.0;
be = Pih * (2 * j + 1) / n2;
B = -cos(be);
phi = 0.0;
indx = Index(ivar, i, j, 0, 1, n1, n2, n3);
U.d0[0] = Am1 * v.d0[indx]; /* U*/
U.d1[0] = v.d0[indx] + Am1 * v.d1[indx]; /* U_A*/
U.d2[0] = Am1 * v.d2[indx]; /* U_B*/
U.d3[0] = Am1 * v.d3[indx]; /* U_3*/
U.d11[0] = 2 * v.d1[indx] + Am1 * v.d11[indx]; /* U_AA*/
U.d12[0] = v.d2[indx] + Am1 * v.d12[indx]; /* U_AB*/
U.d13[0] = v.d3[indx] + Am1 * v.d13[indx]; /* U_AB*/
U.d22[0] = Am1 * v.d22[indx]; /* U_BB*/
U.d23[0] = Am1 * v.d23[indx]; /* U_B3*/
U.d33[0] = Am1 * v.d33[indx]; /* U_33*/
/* Calculation of (X,R)*/
AB_To_XR(nvar, A, B, &X, &R, U);
/* Calculation of (x,r)*/
C_To_c(nvar, X, R, &(s_x[indx]), &r, U);
/* Calculation of (y,z)*/
rx3_To_xyz(nvar, s_x[i3D], r, phi, &(s_y[indx]), &(s_z[indx]), U);
fprintf(debug_file,
"%.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g "
"%.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g\n",
(double)s_x[indx], (double)s_y[indx],
(double)A, (double)B,
(double)U.d0[0],
(double)(-cos(Pih * (2 * i + 1) / n1) - 1.0),
(double)U.d1[0],
(double)U.d2[0],
(double)U.d3[0],
(double)U.d11[0],
(double)U.d22[0],
(double)U.d33[0],
(double)v.d0[indx],
(double)v.d1[indx],
(double)v.d2[indx],
(double)v.d3[indx],
(double)v.d11[indx],
(double)v.d22[indx],
(double)v.d33[indx]);
}
fprintf(debug_file, "\n\n");
for (i = n2 - 10; i < n2; i++)
{
double d;
indx = Index(0, 0, i, 0, 1, n1, n2, n3);
d = PunctIntPolAtArbitPosition(0, nvar, n1, n2, n3, v,
s_x[indx], 0.0, 0.0);
fprintf(debug_file, "%.16g %.16g\n",
(double)s_x[indx], (double)d);
}
fprintf(debug_file, "\n\n");
for (i = n2 - 10; i < n2 - 1; i++)
{
double d;
int ip = Index(0, 0, i + 1, 0, 1, n1, n2, n3);
indx = Index(0, 0, i, 0, 1, n1, n2, n3);
for (j = -10; j < 10; j++)
{
d = PunctIntPolAtArbitPosition(0, nvar, n1, n2, n3, v,
s_x[indx] + (s_x[ip] - s_x[indx]) * j / 10,
0.0, 0.0);
fprintf(debug_file, "%.16g %.16g\n",
(double)(s_x[indx] + (s_x[ip] - s_x[indx]) * j / 10), (double)d);
}
}
fprintf(debug_file, "\n\n");
for (i = 0; i < n1; i++)
for (j = 0; j < n2; j++)
{
X = 2 * (2.0 * i / n1 - 1.0);
R = 2 * (1.0 * j / n2);
if (X * X + R * R > 1.0)
{
C_To_c(nvar, X, R, &(s_x[indx]), &r, U);
rx3_To_xyz(nvar, s_x[i3D], r, phi, &(s_y[indx]), &(s_z[indx]), U);
*U.d0 = s_x[indx] * s_x[indx];
*U.d1 = 2 * s_x[indx];
*U.d2 = 0.0;
*U.d3 = 0.0;
*U.d11 = 2.0;
*U.d22 = 0.0;
*U.d33 = *U.d12 = *U.d23 = *U.d13 = 0.0;
C_To_c(nvar, X, R, &(s_x[indx]), &r, U);
fprintf(debug_file,
"%.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g\n",
(double)s_x[indx], (double)r, (double)X, (double)R, (double)U.d0[0],
(double)U.d1[0],
(double)U.d2[0],
(double)U.d3[0],
(double)U.d11[0],
(double)U.d22[0],
(double)U.d33[0]);
}
}
fclose(debug_file);
}
free(s_z);
free(s_y);
free(s_x);
free_derivs(&U, nvar);
}
// some tools
/*---------------------------------------------------------------------------*/
int TwoPunctures::index(int i, int j, int k, int l, int a, int b, int c, int d)
{
int rr = 0;
rr = l + k * d + j * d * c + i * d * c * b;
return rr;
}
/*---------------------------------------------------------------------------*/
int *TwoPunctures::ivector(long nl, long nh)
/* allocate an int vector with subscript range v[nl..nh] */
{
int *retval;
retval = (int *)malloc(sizeof(int) * (nh - nl + 1));
if (retval == NULL)
cout << "allocation failure in ivector()" << endl;
return retval - nl;
}
/*---------------------------------------------------------------------------*/
double *TwoPunctures::dvector(long nl, long nh)
/* allocate a double vector with subscript range v[nl..nh] */
{
double *retval;
retval = (double *)malloc(sizeof(double) * (nh - nl + 1));
if (retval == NULL)
cout << "allocation failure in dvector()" << endl;
return retval - nl;
}
/*---------------------------------------------------------------------------*/
int **TwoPunctures::imatrix(long nrl, long nrh, long ncl, long nch)
/* allocate a int matrix with subscript range m[nrl..nrh][ncl..nch] */
{
int **retval;
retval = (int **)malloc(sizeof(int *) * (nrh - nrl + 1));
if (retval == NULL)
cout << "allocation failure (1) in imatrix()" << endl;
/* get all memory for the matrix in on chunk */
retval[0] = (int *)malloc(sizeof(int) * (nrh - nrl + 1) * (nch - ncl + 1));
if (retval[0] == NULL)
cout << "allocation failure (2) in imatrix()" << endl;
/* apply column and row offsets */
retval[0] -= ncl;
retval -= nrl;
/* slice chunk into rows */
long width = (nch - ncl + 1);
for (long i = nrl + 1; i <= nrh; i++)
retval[i] = retval[i - 1] + width;
assert(retval[nrh] - retval[nrl] == (nrh - nrl) * width);
return retval;
}
/*---------------------------------------------------------------------------*/
double **TwoPunctures::dmatrix(long nrl, long nrh, long ncl, long nch)
/* allocate a double matrix with subscript range m[nrl..nrh][ncl..nch] */
{
double **retval;
retval = (double **)malloc(sizeof(double *) * (nrh - nrl + 1));
if (retval == NULL)
cout << "allocation failure (1) in dmatrix()" << endl;
/* get all memory for the matrix in on chunk */
retval[0] = (double *)malloc(sizeof(double) * (nrh - nrl + 1) * (nch - ncl + 1));
if (retval[0] == NULL)
cout << "allocation failure (2) in dmatrix()" << endl;
/* apply column and row offsets */
retval[0] -= ncl;
retval -= nrl;
/* slice chunk into rows */
long width = (nch - ncl + 1);
for (long i = nrl + 1; i <= nrh; i++)
retval[i] = retval[i - 1] + width;
assert(retval[nrh] - retval[nrl] == (nrh - nrl) * width);
return retval;
}
/*---------------------------------------------------------------------------*/
double ***TwoPunctures::d3tensor(long nrl, long nrh, long ncl, long nch, long ndl, long ndh)
/* allocate a double 3tensor with range t[nrl..nrh][ncl..nch][ndl..ndh] */
{
double ***retval;
/* get memory for index structures */
retval = (double ***)malloc(sizeof(double **) * (nrh - nrl + 1));
if (retval == NULL)
cout << "allocation failure (1) in dmatrix()" << endl;
retval[0] = (double **)malloc(sizeof(double *) * (nrh - nrl + 1) * (nch - ncl + 1));
if (retval[0] == NULL)
cout << "allocation failure (2) in dmatrix()" << endl;
/* get all memory for the tensor in on chunk */
retval[0][0] = (double *)malloc(sizeof(double) * (nrh - nrl + 1) * (nch - ncl + 1) * (nrh - nrl + 1));
if (retval[0][0] == NULL)
cout << "allocation failure (3) in dmatrix()" << endl;
/* apply all offsets */
retval[0][0] -= ndl;
retval[0] -= ncl;
retval -= nrl;
/* slice chunk into rows and columns */
long width = (nch - ncl + 1);
long depth = (ndh - ndl + 1);
for (long j = ncl + 1; j <= nch; j++)
{ /* first row of columns */
retval[nrl][j] = retval[nrl][j - 1] + depth;
}
assert(retval[nrl][nch] - retval[nrl][ncl] == (nch - ncl) * depth);
for (long i = nrl + 1; i <= nrh; i++)
{
retval[i] = retval[i - 1] + width;
retval[i][ncl] = retval[i - 1][ncl] + width * depth; /* first cell in column */
for (long j = ncl + 1; j <= nch; j++)
{
retval[i][j] = retval[i][j - 1] + depth;
}
assert(retval[i][nch] - retval[i][ncl] == (nch - ncl) * depth);
}
assert(retval[nrh] - retval[nrl] == (nrh - nrl) * width);
assert(&retval[nrh][nch][ndh] - &retval[nrl][ncl][ndl] == (nrh - nrl + 1) * (nch - ncl + 1) * (ndh - ndl + 1) - 1);
return retval;
}
/*--------------------------------------------------------------------------*/
void TwoPunctures::free_ivector(int *v, long nl, long nh)
/* free an int vector allocated with ivector() */
{
free(v + nl);
}
/*--------------------------------------------------------------------------*/
void TwoPunctures::free_dvector(double *v, long nl, long nh)
/* free an double vector allocated with dvector() */
{
free(v + nl);
}
/*--------------------------------------------------------------------------*/
void TwoPunctures::free_imatrix(int **m, long nrl, long nrh, long ncl, long nch)
/* free an int matrix allocated by imatrix() */
{
free(m[nrl] + ncl);
free(m + nrl);
}
/*--------------------------------------------------------------------------*/
void TwoPunctures::free_dmatrix(double **m, long nrl, long nrh, long ncl, long nch)
/* free a double matrix allocated by dmatrix() */
{
free(m[nrl] + ncl);
free(m + nrl);
}
/*--------------------------------------------------------------------------*/
void TwoPunctures::free_d3tensor(double ***t, long nrl, long nrh, long ncl, long nch,
long ndl, long ndh)
/* free a double f3tensor allocated by f3tensor() */
{
free(t[nrl][ncl] + ndl);
free(t[nrl] + ncl);
free(t + nrl);
}
/*--------------------------------------------------------------------------*/
int TwoPunctures::minimum2(int i, int j)
{
int result = i;
if (j < result)
result = j;
return result;
}
/*-------------------------------------------------------------------------*/
int TwoPunctures::minimum3(int i, int j, int k)
{
int result = i;
if (j < result)
result = j;
if (k < result)
result = k;
return result;
}
/*--------------------------------------------------------------------------*/
int TwoPunctures::maximum2(int i, int j)
{
int result = i;
if (j > result)
result = j;
return result;
}
/*--------------------------------------------------------------------------*/
int TwoPunctures::maximum3(int i, int j, int k)
{
int result = i;
if (j > result)
result = j;
if (k > result)
result = k;
return result;
}
/*--------------------------------------------------------------------------*/
int TwoPunctures::pow_int(int mantisse, int exponent)
{
int i, result = 1;
for (i = 1; i <= exponent; i++)
result *= mantisse;
return result;
}
/*--------------------------------------------------------------------------*/
void TwoPunctures::chebft_Zeros(double u[], int n, int inv)
/* eq. 5.8.7 and 5.8.8 at x = (5.8.4) of 2nd edition C++ NR */
{
int k, j, isignum;
double fac, sum, Pion, *c;
c = dvector(0, n);
Pion = Pi / n;
if (inv == 0)
{
fac = 2.0 / n;
isignum = 1;
for (j = 0; j < n; j++)
{
sum = 0.0;
for (k = 0; k < n; k++)
sum += u[k] * cos(Pion * j * (k + 0.5));
c[j] = fac * sum * isignum;
isignum = -isignum;
}
}
else
{
for (j = 0; j < n; j++)
{
sum = -0.5 * u[0];
isignum = 1;
for (k = 0; k < n; k++)
{
sum += u[k] * cos(Pion * (j + 0.5) * k) * isignum;
isignum = -isignum;
}
c[j] = sum;
}
}
for (j = 0; j < n; j++)
u[j] = c[j];
free_dvector(c, 0, n);
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::chebft_Extremes(double u[], int n, int inv)
/* eq. 5.8.7 and 5.8.8 at x = (5.8.5) of 2nd edition C++ NR */
{
int k, j, isignum, N = n - 1;
double fac, sum, PioN, *c;
c = dvector(0, N);
PioN = Pi / N;
if (inv == 0)
{
fac = 2.0 / N;
isignum = 1;
for (j = 0; j < n; j++)
{
sum = 0.5 * (u[0] + u[N] * isignum);
for (k = 1; k < N; k++)
sum += u[k] * cos(PioN * j * k);
c[j] = fac * sum * isignum;
isignum = -isignum;
}
c[N] = 0.5 * c[N];
}
else
{
for (j = 0; j < n; j++)
{
sum = -0.5 * u[0];
isignum = 1;
for (k = 0; k < n; k++)
{
sum += u[k] * cos(PioN * j * k) * isignum;
isignum = -isignum;
}
c[j] = sum;
}
}
for (j = 0; j < n; j++)
u[j] = c[j];
free_dvector(c, 0, N);
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::chder(double *c, double *cder, int n)
{
int j;
cder[n] = 0.0;
cder[n - 1] = 0.0;
for (j = n - 2; j >= 0; j--)
cder[j] = cder[j + 2] + 2 * (j + 1) * c[j + 1];
}
/* --------------------------------------------------------------------------*/
double TwoPunctures::chebev(double a, double b, double c[], int m, double x)
/* eq. 5.8.11 of C++ NR (2nd ed) */
{
int j;
double djp2, djp1, dj; /* d_{j+2}, d_{j+1} and d_j */
double y;
/* rescale input to lie within [-1,1] */
y = 2 * (x - 0.5 * (b + a)) / (b - a);
dj = djp1 = 0;
for (j = m - 1; j >= 1; j--)
{
/* advance the coefficients */
djp2 = djp1;
djp1 = dj;
dj = 2 * y * djp1 - djp2 + c[j];
}
return y * dj - djp1 + 0.5 * c[0];
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::fourft(double *u, int N, int inv)
/* a (slow) Fourier transform, seems to be just eq. 12.1.6 and 12.1.9 of C++ NR (2nd ed) */
{
int l, k, iy, M;
double x, x1, fac, Pi_fac, *a, *b;
M = N / 2;
a = dvector(0, M);
b = dvector(1, M); /* Actually: b=vector(1,M-1) but this is problematic if M=1*/
fac = 1. / M;
Pi_fac = Pi * fac;
if (inv == 0)
{
for (l = 0; l <= M; l++)
{
a[l] = 0;
if (l > 0 && l < M)
b[l] = 0;
x1 = Pi_fac * l;
for (k = 0; k < N; k++)
{
x = x1 * k;
a[l] += fac * u[k] * cos(x);
if (l > 0 && l < M)
b[l] += fac * u[k] * sin(x);
}
}
u[0] = a[0];
u[M] = a[M];
for (l = 1; l < M; l++)
{
u[l] = a[l];
u[l + M] = b[l];
}
}
else
{
a[0] = u[0];
a[M] = u[M];
for (l = 1; l < M; l++)
{
a[l] = u[l];
b[l] = u[M + l];
}
iy = 1;
for (k = 0; k < N; k++)
{
u[k] = 0.5 * (a[0] + a[M] * iy);
x1 = Pi_fac * k;
for (l = 1; l < M; l++)
{
x = x1 * l;
u[k] += a[l] * cos(x) + b[l] * sin(x);
}
iy = -iy;
}
}
free_dvector(a, 0, M);
free_dvector(b, 1, M);
}
/* -----------------------------------------*/
void TwoPunctures::fourder(double u[], double du[], int N)
{
int l, M, lpM;
M = N / 2;
du[0] = 0.;
du[M] = 0.;
for (l = 1; l < M; l++)
{
lpM = l + M;
du[l] = u[lpM] * l;
du[lpM] = -u[l] * l;
}
}
/* -----------------------------------------*/
void TwoPunctures::fourder2(double u[], double d2u[], int N)
{
int l, l2, M, lpM;
d2u[0] = 0.;
M = N / 2;
for (l = 1; l <= M; l++)
{
l2 = l * l;
lpM = l + M;
d2u[l] = -u[l] * l2;
if (l < M)
d2u[lpM] = -u[lpM] * l2;
}
}
/* ----------------------------------------- */
double TwoPunctures::fourev(double *u, int N, double x)
{
int l, M = N / 2;
double xl, result;
result = 0.5 * (u[0] + u[M] * cos(x * M));
for (l = 1; l < M; l++)
{
xl = x * l;
result += u[l] * cos(xl) + u[M + l] * sin(xl);
}
return result;
}
/* ------------------------------------------------------------------------*/
double TwoPunctures::norm1(double *v, int n)
{
int i;
double result = -1;
for (i = 0; i < n; i++)
if (fabs(v[i]) > result)
result = fabs(v[i]);
return result;
}
/* -------------------------------------------------------------------------*/
double TwoPunctures::norm2(double *v, int n)
{
// Optimized with oneMKL BLAS DNRM2
// Computes: sqrt(sum(v[i]^2))
return cblas_dnrm2(n, v, 1);
}
/* -------------------------------------------------------------------------*/
double TwoPunctures::scalarproduct(double *v, double *w, int n)
{
// Optimized with oneMKL BLAS DDOT
// Computes: sum(v[i] * w[i])
return cblas_ddot(n, v, 1, w, 1);
}
/* -------------------------------------------------------------------------*/
/* Calculates the value of v at an arbitrary position (x,y,z)*/
double TwoPunctures::PunctIntPolAtArbitPosition(int ivar, int nvar, int n1,
int n2, int n3, derivs v, double x, double y,
double z)
{
double xs, ys, zs, rs2, phi, X, R, A, B, aux1, aux2, result, Ui;
xs = x / par_b;
ys = y / par_b;
zs = z / par_b;
rs2 = ys * ys + zs * zs;
phi = atan2(z, y);
if (phi < 0)
phi += 2 * Pi;
aux1 = 0.5 * (xs * xs + rs2 - 1);
aux2 = sqrt(aux1 * aux1 + rs2);
X = asinh(sqrt(aux1 + aux2));
R = asin(min(1.0, sqrt(-aux1 + aux2)));
if (x < 0)
R = Pi - R;
A = 2 * tanh(0.5 * X) - 1;
B = tan(0.5 * R - Piq);
result = PunctEvalAtArbitPosition(v.d0, ivar, A, B, phi, nvar, n1, n2, n3);
Ui = (A - 1) * result;
return Ui;
}
/* Calculates the value of v at an arbitrary position (A,B,phi)*/
double TwoPunctures::PunctEvalAtArbitPosition(double *v, int ivar, double A, double B, double phi,
int nvar, int n1, int n2, int n3)
{
int i, j, k, N;
double *p, *values1, **values2, result;
N = maximum3(n1, n2, n3);
p = dvector(0, N);
values1 = dvector(0, N);
values2 = dmatrix(0, N, 0, N);
for (k = 0; k < n3; k++)
{
for (j = 0; j < n2; j++)
{
for (i = 0; i < n1; i++)
p[i] = v[ivar + nvar * (i + n1 * (j + n2 * k))];
chebft_Zeros(p, n1, 0);
values2[j][k] = chebev(-1, 1, p, n1, A);
}
}
for (k = 0; k < n3; k++)
{
for (j = 0; j < n2; j++)
p[j] = values2[j][k];
chebft_Zeros(p, n2, 0);
values1[k] = chebev(-1, 1, p, n2, B);
}
fourft(values1, n3, 0);
result = fourev(values1, n3, phi);
free_dvector(p, 0, N);
free_dvector(values1, 0, N);
free_dmatrix(values2, 0, N, 0, N);
return result;
}
/*-----------------------------------------------------------*/
void TwoPunctures::AB_To_XR(int nvar, double A, double B, double *X, double *R,
derivs U)
/* On Entrance: U.d0[]=U[]; U.d1[] =U[]_A; U.d2[] =U[]_B; U.d3[] =U[]_3; */
/* U.d11[]=U[]_AA; U.d12[]=U[]_AB; U.d13[]=U[]_A3; */
/* U.d22[]=U[]_BB; U.d23[]=U[]_B3; U.d33[]=U[]_33; */
/* At Exit: U.d0[]=U[]; U.d1[] =U[]_X; U.d2[] =U[]_R; U.d3[] =U[]_3; */
/* U.d11[]=U[]_XX; U.d12[]=U[]_XR; U.d13[]=U[]_X3; */
/* U.d22[]=U[]_RR; U.d23[]=U[]_R3; U.d33[]=U[]_33; */
{
double At = 0.5 * (A + 1), A_X, A_XX, B_R, B_RR;
int ivar;
*X = 2 * atanh(At);
*R = Pih + 2 * atan(B);
A_X = 1 - At * At;
A_XX = -At * A_X;
B_R = 0.5 * (1 + B * B);
B_RR = B * B_R;
for (ivar = 0; ivar < nvar; ivar++)
{
U.d11[ivar] = A_X * A_X * U.d11[ivar] + A_XX * U.d1[ivar];
U.d12[ivar] = A_X * B_R * U.d12[ivar];
U.d13[ivar] = A_X * U.d13[ivar];
U.d22[ivar] = B_R * B_R * U.d22[ivar] + B_RR * U.d2[ivar];
U.d23[ivar] = B_R * U.d23[ivar];
U.d1[ivar] = A_X * U.d1[ivar];
U.d2[ivar] = B_R * U.d2[ivar];
}
}
/*-----------------------------------------------------------*/
void TwoPunctures::C_To_c(int nvar, double X, double R, double *x, double *r,
derivs U)
/* On Entrance: U.d0[]=U[]; U.d1[] =U[]_X; U.d2[] =U[]_R; U.d3[] =U[]_3; */
/* U.d11[]=U[]_XX; U.d12[]=U[]_XR; U.d13[]=U[]_X3; */
/* U.d22[]=U[]_RR; U.d23[]=U[]_R3; U.d33[]=U[]_33; */
/* At Exit: U.d0[]=U[]; U.d1[] =U[]_x; U.d2[] =U[]_r; U.d3[] =U[]_3; */
/* U.d11[]=U[]_xx; U.d12[]=U[]_xr; U.d13[]=U[]_x3; */
/* U.d22[]=U[]_rr; U.d23[]=U[]_r3; U.d33[]=U[]_33; */
{
double C_c2, U_cb, U_CB;
complex<double> C, C_c, C_cc, c, c_C, c_CC, U_c, U_cc, U_C, U_CC;
int ivar;
C = complex<double>(X, R);
c = cosh(C) * par_b; /* c=b*cosh(C)*/
c_C = sinh(C) * par_b;
c_CC = c;
C_c = complex<double>(1, 0) / c_C;
C_cc = -C_c * C_c * C_c * c_CC;
C_c2 = abs(C_c);
C_c2 = C_c2 * C_c2;
for (ivar = 0; ivar < nvar; ivar++)
{
/* U_C = 0.5*(U_X3-i*U_R3)*/
/* U_c = U_C*C_c = 0.5*(U_x3-i*U_r3)*/
U_C = complex<double>(0.5 * U.d13[ivar], -0.5 * U.d23[ivar]);
U_c = U_C * C_c;
U.d13[ivar] = 2. * real(U_c);
U.d23[ivar] = -2. * imag(U_c);
/* U_C = 0.5*(U_X-i*U_R)*/
/* U_c = U_C*C_c = 0.5*(U_x-i*U_r)*/
U_C = complex<double>(0.5 * U.d1[ivar], -0.5 * U.d2[ivar]);
U_c = U_C * C_c;
U.d1[ivar] = 2. * real(U_c);
U.d2[ivar] = -2. * imag(U_c);
/* U_CC = 0.25*(U_XX-U_RR-2*i*U_XR)*/
/* U_CB = d^2(U)/(dC*d\bar{C}) = 0.25*(U_XX+U_RR)*/
U_CC = complex<double>(0.25 * (U.d11[ivar] - U.d22[ivar]), -0.5 * U.d12[ivar]);
U_CB = 0.25 * (U.d11[ivar] + U.d22[ivar]);
/* U_cc = C_cc*U_C+(C_c)^2*U_CC*/
U_cb = U_CB * C_c2;
U_cc = C_cc * U_C + C_c * C_c * U_CC;
/* U_xx = 2*(U_cb+Re[U_cc])*/
/* U_rr = 2*(U_cb-Re[U_cc])*/
/* U_rx = -2*Im[U_cc]*/
U.d11[ivar] = 2 * (U_cb + real(U_cc));
U.d22[ivar] = 2 * (U_cb - real(U_cc));
U.d12[ivar] = -2 * imag(U_cc);
}
*x = real(c);
*r = imag(c);
}
/*-----------------------------------------------------------*/
void TwoPunctures::rx3_To_xyz(int nvar, double x, double r, double phi,
double *y, double *z, derivs U)
/* On Entrance: U.d0[]=U[]; U.d1[] =U[]_x; U.d2[] =U[]_r; U.d3[] =U[]_3; */
/* U.d11[]=U[]_xx; U.d12[]=U[]_xr; U.d13[]=U[]_x3; */
/* U.d22[]=U[]_rr; U.d23[]=U[]_r3; U.d33[]=U[]_33; */
/* At Exit: U.d0[]=U[]; U.d1[] =U[]_x; U.d2[] =U[]_y; U.dz[] =U[]_z; */
/* U.d11[]=U[]_xx; U.d12[]=U[]_xy; U.d1z[]=U[]_xz; */
/* U.d22[]=U[]_yy; U.d2z[]=U[]_yz; U.dzz[]=U[]_zz; */
{
int jvar;
double
sin_phi = sin(phi),
cos_phi = cos(phi),
sin2_phi = sin_phi * sin_phi,
cos2_phi = cos_phi * cos_phi,
sin_2phi = 2 * sin_phi * cos_phi,
cos_2phi = cos2_phi - sin2_phi, r_inv = 1 / r, r_inv2 = r_inv * r_inv;
*y = r * cos_phi;
*z = r * sin_phi;
for (jvar = 0; jvar < nvar; jvar++)
{
double U_x = U.d1[jvar], U_r = U.d2[jvar], U_3 = U.d3[jvar],
U_xx = U.d11[jvar], U_xr = U.d12[jvar], U_x3 = U.d13[jvar],
U_rr = U.d22[jvar], U_r3 = U.d23[jvar], U_33 = U.d33[jvar];
U.d1[jvar] = U_x; /* U_x*/
U.d2[jvar] = U_r * cos_phi - U_3 * r_inv * sin_phi; /* U_y*/
U.d3[jvar] = U_r * sin_phi + U_3 * r_inv * cos_phi; /* U_z*/
U.d11[jvar] = U_xx; /* U_xx*/
U.d12[jvar] = U_xr * cos_phi - U_x3 * r_inv * sin_phi; /* U_xy*/
U.d13[jvar] = U_xr * sin_phi + U_x3 * r_inv * cos_phi; /* U_xz*/
U.d22[jvar] = U_rr * cos2_phi + r_inv2 * sin2_phi * (U_33 + r * U_r) /* U_yy*/
+ sin_2phi * r_inv2 * (U_3 - r * U_r3);
U.d23[jvar] = 0.5 * sin_2phi * (U_rr - r_inv * U_r - r_inv2 * U_33) /* U_yz*/
- cos_2phi * r_inv2 * (U_3 - r * U_r3);
U.d33[jvar] = U_rr * sin2_phi + r_inv2 * cos2_phi * (U_33 + r * U_r) /* U_zz*/
- sin_2phi * r_inv2 * (U_3 - r * U_r3);
}
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::Derivatives_AB3(int nvar, int n1, int n2, int n3, derivs v)
{
int i, j, k, ivar, N, *indx;
double *p, *dp, *d2p, *q, *dq, *r, *dr;
N = maximum3(n1, n2, n3);
p = dvector(0, N);
dp = dvector(0, N);
d2p = dvector(0, N);
q = dvector(0, N);
dq = dvector(0, N);
r = dvector(0, N);
dr = dvector(0, N);
indx = ivector(0, N);
for (ivar = 0; ivar < nvar; ivar++)
{
for (k = 0; k < n3; k++)
{ /* Calculation of Derivatives w.r.t. A-Dir. */
for (j = 0; j < n2; j++)
{ /* (Chebyshev_Zeros)*/
for (i = 0; i < n1; i++)
{
indx[i] = Index(ivar, i, j, k, nvar, n1, n2, n3);
p[i] = v.d0[indx[i]];
}
chebft_Zeros(p, n1, 0);
chder(p, dp, n1);
chder(dp, d2p, n1);
chebft_Zeros(dp, n1, 1);
chebft_Zeros(d2p, n1, 1);
for (i = 0; i < n1; i++)
{
v.d1[indx[i]] = dp[i];
v.d11[indx[i]] = d2p[i];
}
}
}
for (k = 0; k < n3; k++)
{ /* Calculation of Derivatives w.r.t. B-Dir. */
for (i = 0; i < n1; i++)
{ /* (Chebyshev_Zeros)*/
for (j = 0; j < n2; j++)
{
indx[j] = Index(ivar, i, j, k, nvar, n1, n2, n3);
p[j] = v.d0[indx[j]];
q[j] = v.d1[indx[j]];
}
chebft_Zeros(p, n2, 0);
chebft_Zeros(q, n2, 0);
chder(p, dp, n2);
chder(dp, d2p, n2);
chder(q, dq, n2);
chebft_Zeros(dp, n2, 1);
chebft_Zeros(d2p, n2, 1);
chebft_Zeros(dq, n2, 1);
for (j = 0; j < n2; j++)
{
v.d2[indx[j]] = dp[j];
v.d22[indx[j]] = d2p[j];
v.d12[indx[j]] = dq[j];
}
}
}
for (i = 0; i < n1; i++)
{ /* Calculation of Derivatives w.r.t. phi-Dir. (Fourier)*/
for (j = 0; j < n2; j++)
{
for (k = 0; k < n3; k++)
{
indx[k] = Index(ivar, i, j, k, nvar, n1, n2, n3);
p[k] = v.d0[indx[k]];
q[k] = v.d1[indx[k]];
r[k] = v.d2[indx[k]];
}
fourft(p, n3, 0);
fourder(p, dp, n3);
fourder2(p, d2p, n3);
fourft(dp, n3, 1);
fourft(d2p, n3, 1);
fourft(q, n3, 0);
fourder(q, dq, n3);
fourft(dq, n3, 1);
fourft(r, n3, 0);
fourder(r, dr, n3);
fourft(dr, n3, 1);
for (k = 0; k < n3; k++)
{
v.d3[indx[k]] = dp[k];
v.d33[indx[k]] = d2p[k];
v.d13[indx[k]] = dq[k];
v.d23[indx[k]] = dr[k];
}
}
}
}
free_dvector(p, 0, N);
free_dvector(dp, 0, N);
free_dvector(d2p, 0, N);
free_dvector(q, 0, N);
free_dvector(dq, 0, N);
free_dvector(r, 0, N);
free_dvector(dr, 0, N);
free_ivector(indx, 0, N);
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::Newton(int const nvar, int const n1, int const n2, int const n3,
derivs v, double const tol, int const itmax)
{
int ntotal = n1 * n2 * n3 * nvar, ii, it;
double *F, dmax, normres;
derivs u, dv;
F = dvector(0, ntotal - 1);
allocate_derivs(&dv, ntotal);
allocate_derivs(&u, ntotal);
it = 0;
dmax = 1;
while (dmax > tol && it < itmax)
{
if (it == 0)
{
F_of_v(nvar, n1, n2, n3, v, F, u);
dmax = norm_inf(F, ntotal);
}
for (int j = 0; j < ntotal; j++)
dv.d0[j] = 0;
{
printf("Newton: it=%d \t |F|=%e\n", it, (double)dmax);
printf("bare mass: mp=%g \t mm=%g\n", (double)par_m_plus, (double)par_m_minus);
}
fflush(stdout);
ii = bicgstab(nvar, n1, n2, n3, v, dv, 100, dmax * 1.e-3, &normres);
for (int j = 0; j < ntotal; j++)
v.d0[j] -= dv.d0[j];
F_of_v(nvar, n1, n2, n3, v, F, u);
dmax = norm_inf(F, ntotal);
it += 1;
}
if (itmax == 0)
{
F_of_v(nvar, n1, n2, n3, v, F, u);
dmax = norm_inf(F, ntotal);
}
printf("Newton: it=%d \t |F|=%e \n", it, (double)dmax);
fflush(stdout);
free_dvector(F, 0, ntotal - 1);
free_derivs(&dv, ntotal);
free_derivs(&u, ntotal);
}
#define FAC sin(al) * sin(be) * sin(al) * sin(be) * sin(al) * sin(be)
/* --------------------------------------------------------------------------*/
void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
derivs u)
{
/* Calculates the left hand sides of the non-linear equations F_m(v_n)=0*/
/* and the function u (u.d0[]) as well as its derivatives*/
/* (u.d1[], u.d2[], u.d3[], u.d11[], u.d12[], u.d13[], u.d22[], u.d23[], u.d33[])*/
/* at interior points and at the boundaries "+/-"*/
int i, j, k, ivar, indx;
double al, be, A, B, X, R, x, r, phi, y, z, Am1, *values;
derivs U;
double *sources;
sources = (double *)calloc(n1 * n2 * n3, sizeof(double));
if (0)
{
double *s_x, *s_y, *s_z;
int i3D;
s_x = (double *)calloc(n1 * n2 * n3, sizeof(double));
s_y = (double *)calloc(n1 * n2 * n3, sizeof(double));
s_z = (double *)calloc(n1 * n2 * n3, sizeof(double));
for (i = 0; i < n1; i++)
for (j = 0; j < n2; j++)
for (k = 0; k < n3; k++)
{
i3D = Index(0, i, j, k, 1, n1, n2, n3);
al = Pih * (2 * i + 1) / n1;
A = -cos(al);
be = Pih * (2 * j + 1) / n2;
B = -cos(be);
phi = 2. * Pi * k / n3;
Am1 = A - 1;
for (ivar = 0; ivar < nvar; ivar++)
{
indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
U.d0[ivar] = Am1 * v.d0[indx]; /* U*/
U.d1[ivar] = v.d0[indx] + Am1 * v.d1[indx]; /* U_A*/
U.d2[ivar] = Am1 * v.d2[indx]; /* U_B*/
U.d3[ivar] = Am1 * v.d3[indx]; /* U_3*/
U.d11[ivar] = 2 * v.d1[indx] + Am1 * v.d11[indx]; /* U_AA*/
U.d12[ivar] = v.d2[indx] + Am1 * v.d12[indx]; /* U_AB*/
U.d13[ivar] = v.d3[indx] + Am1 * v.d13[indx]; /* U_AB*/
U.d22[ivar] = Am1 * v.d22[indx]; /* U_BB*/
U.d23[ivar] = Am1 * v.d23[indx]; /* U_B3*/
U.d33[ivar] = Am1 * v.d33[indx]; /* U_33*/
}
/* Calculation of (X,R) and*/
/* (U_X, U_R, U_3, U_XX, U_XR, U_X3, U_RR, U_R3, U_33)*/
AB_To_XR(nvar, A, B, &X, &R, U);
/* Calculation of (x,r) and*/
/* (U, U_x, U_r, U_3, U_xx, U_xr, U_x3, U_rr, U_r3, U_33)*/
C_To_c(nvar, X, R, &(s_x[i3D]), &r, U);
/* Calculation of (y,z) and*/
/* (U, U_x, U_y, U_z, U_xx, U_xy, U_xz, U_yy, U_yz, U_zz)*/
rx3_To_xyz(nvar, s_x[i3D], r, phi, &(s_y[i3D]), &(s_z[i3D]), U);
}
// Set_Rho_ADM(cctkGH, n1*n2*n3, sources, s_x, s_y, s_z); //external fortran code
free(s_z);
free(s_y);
free(s_x);
}
else
for (i = 0; i < n1; i++)
for (j = 0; j < n2; j++)
for (k = 0; k < n3; k++)
sources[Index(0, i, j, k, 1, n1, n2, n3)] = 0.0;
Derivatives_AB3_MatMul(nvar, n1, n2, n3, v);
double psi, psi2, psi4, psi7, r_plus, r_minus;
FILE *debugfile = NULL;
if (0)
{
debugfile = fopen("res.dat", "w");
assert(debugfile);
}
#pragma omp parallel for collapse(3) schedule(static) \
private(i, j, k, ivar, indx, al, be, A, B, X, R, x, r, phi, y, z, Am1, \
psi, psi2, psi4, psi7, r_plus, r_minus)
for (i = 0; i < n1; i++)
{
for (j = 0; j < n2; j++)
{
for (k = 0; k < n3; k++)
{
double l_values[1]; // nvar=1, stack-allocated
derivs l_U;
double l_U_d0[1], l_U_d1[1], l_U_d2[1], l_U_d3[1];
double l_U_d11[1], l_U_d12[1], l_U_d13[1], l_U_d22[1], l_U_d23[1], l_U_d33[1];
l_U.d0 = l_U_d0; l_U.d1 = l_U_d1; l_U.d2 = l_U_d2; l_U.d3 = l_U_d3;
l_U.d11 = l_U_d11; l_U.d12 = l_U_d12; l_U.d13 = l_U_d13;
l_U.d22 = l_U_d22; l_U.d23 = l_U_d23; l_U.d33 = l_U_d33;
al = Pih * (2 * i + 1) / n1;
A = -cos(al);
be = Pih * (2 * j + 1) / n2;
B = -cos(be);
phi = 2. * Pi * k / n3;
Am1 = A - 1;
for (ivar = 0; ivar < nvar; ivar++)
{
indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
l_U.d0[ivar] = Am1 * v.d0[indx];
l_U.d1[ivar] = v.d0[indx] + Am1 * v.d1[indx];
l_U.d2[ivar] = Am1 * v.d2[indx];
l_U.d3[ivar] = Am1 * v.d3[indx];
l_U.d11[ivar] = 2 * v.d1[indx] + Am1 * v.d11[indx];
l_U.d12[ivar] = v.d2[indx] + Am1 * v.d12[indx];
l_U.d13[ivar] = v.d3[indx] + Am1 * v.d13[indx];
l_U.d22[ivar] = Am1 * v.d22[indx];
l_U.d23[ivar] = Am1 * v.d23[indx];
l_U.d33[ivar] = Am1 * v.d33[indx];
}
AB_To_XR(nvar, A, B, &X, &R, l_U);
C_To_c(nvar, X, R, &x, &r, l_U);
rx3_To_xyz(nvar, x, r, phi, &y, &z, l_U);
NonLinEquations(sources[Index(0, i, j, k, 1, n1, n2, n3)],
A, B, X, R, x, r, phi, y, z, l_U, l_values);
for (ivar = 0; ivar < nvar; ivar++)
{
indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
F[indx] = l_values[ivar] * sin(al) * sin(be) * sin(al) * sin(be) * sin(al) * sin(be);
u.d0[indx] = l_U.d0[ivar];
u.d1[indx] = l_U.d1[ivar];
u.d2[indx] = l_U.d2[ivar];
u.d3[indx] = l_U.d3[ivar];
u.d11[indx] = l_U.d11[ivar];
u.d12[indx] = l_U.d12[ivar];
u.d13[indx] = l_U.d13[ivar];
u.d22[indx] = l_U.d22[ivar];
u.d23[indx] = l_U.d23[ivar];
u.d33[indx] = l_U.d33[ivar];
}
}
}
}
if (debugfile)
{
fclose(debugfile);
}
free(sources);
}
/* --------------------------------------------------------------------------*/
double TwoPunctures::norm_inf(double const *F, int const ntotal)
{
double dmax = -1;
{
double dmax1 = -1;
for (int j = 0; j < ntotal; j++)
if (fabs(F[j]) > dmax1)
dmax1 = fabs(F[j]);
if (dmax1 > dmax)
dmax = dmax1;
}
return dmax;
}
/* --------------------------------------------------------------------------*/
int TwoPunctures::bicgstab(int const nvar, int const n1, int const n2, int const n3,
derivs v, derivs dv, int const itmax, double const tol,
double *normres)
{
int const output = 1;
int ntotal = n1 * n2 * n3 * nvar, ii;
double alpha = 0, beta = 0;
double rho = 0, rho1 = 1, rhotol = 1e-50;
double omega = 0, omegatol = 1e-50;
double *p, *rt, *s, *t, *r, *vv;
double **JFD;
int **cols, *ncols, maxcol = StencilSize * nvar;
double *F;
derivs u, ph, sh;
F = dvector(0, ntotal - 1);
allocate_derivs(&u, ntotal);
JFD = dmatrix(0, ntotal - 1, 0, maxcol - 1);
cols = imatrix(0, ntotal - 1, 0, maxcol - 1);
ncols = ivector(0, ntotal - 1);
F_of_v(nvar, n1, n2, n3, v, F, u);
SetMatrix_JFD(nvar, n1, n2, n3, u, ncols, cols, JFD);
/* temporary storage */
r = dvector(0, ntotal - 1);
p = dvector(0, ntotal - 1);
allocate_derivs(&ph, ntotal);
/* ph = dvector(0, ntotal-1);*/
rt = dvector(0, ntotal - 1);
s = dvector(0, ntotal - 1);
allocate_derivs(&sh, ntotal);
/* sh = dvector(0, ntotal-1);*/
t = dvector(0, ntotal - 1);
vv = dvector(0, ntotal - 1);
/* check */
if (output == 1)
{
printf("bicgstab: itmax %d, tol %e\n", itmax, (double)tol);
fflush(stdout);
}
/* compute initial residual rt = r = F - J*dv */
J_times_dv(nvar, n1, n2, n3, dv, r, u);
for (int j = 0; j < ntotal; j++)
rt[j] = r[j] = F[j] - r[j];
*normres = norm2(r, ntotal);
if (output == 1)
{
printf("bicgstab: %5d %10.3e\n", 0, (double)*normres);
fflush(stdout);
}
if (*normres <= tol)
return 0;
/* cgs iteration */
for (ii = 0; ii < itmax; ii++)
{
rho = scalarproduct(rt, r, ntotal);
if (fabs(rho) < rhotol)
break;
/* compute direction vector p */
if (ii == 0)
{
for (int j = 0; j < ntotal; j++)
p[j] = r[j];
}
else
{
beta = (rho / rho1) * (alpha / omega);
for (int j = 0; j < ntotal; j++)
p[j] = r[j] + beta * (p[j] - omega * vv[j]);
}
/* compute direction adjusting vector ph and scalar alpha */
for (int j = 0; j < ntotal; j++)
ph.d0[j] = 0;
for (int j = 0; j < NRELAX; j++) /* solves JFD*ph = p by relaxation*/
relax_omp(ph.d0, nvar, n1, n2, n3, p, ncols, cols, JFD);
J_times_dv(nvar, n1, n2, n3, ph, vv, u); /* vv=J*ph*/
alpha = rho / scalarproduct(rt, vv, ntotal);
for (int j = 0; j < ntotal; j++)
s[j] = r[j] - alpha * vv[j];
/* early check of tolerance */
*normres = norm2(s, ntotal);
if (*normres <= tol)
{
for (int j = 0; j < ntotal; j++)
dv.d0[j] += alpha * ph.d0[j];
if (output == 1)
{
printf("bicgstab: %5d %10.3e %10.3e %10.3e %10.3e\n",
ii + 1, (double)*normres, (double)alpha, (double)beta, (double)omega);
fflush(stdout);
}
break;
}
/* compute stabilizer vector sh and scalar omega */
for (int j = 0; j < ntotal; j++)
sh.d0[j] = 0;
for (int j = 0; j < NRELAX; j++) /* solves JFD*sh = s by relaxation*/
relax_omp(sh.d0, nvar, n1, n2, n3, s, ncols, cols, JFD);
J_times_dv(nvar, n1, n2, n3, sh, t, u); /* t=J*sh*/
omega = scalarproduct(t, s, ntotal) / scalarproduct(t, t, ntotal);
/* compute new solution approximation */
for (int j = 0; j < ntotal; j++)
{
dv.d0[j] += alpha * ph.d0[j] + omega * sh.d0[j];
r[j] = s[j] - omega * t[j];
}
/* are we done? */
*normres = norm2(r, ntotal);
if (output == 1)
{
printf("bicgstab: %5d %10.3e %10.3e %10.3e %10.3e\n",
ii + 1, (double)*normres, (double)alpha, (double)beta, (double)omega);
fflush(stdout);
}
if (*normres <= tol)
break;
rho1 = rho;
if (fabs(omega) < omegatol)
break;
}
/* free temporary storage */
free_dvector(r, 0, ntotal - 1);
free_dvector(p, 0, ntotal - 1);
/* free_dvector(ph, 0, ntotal-1);*/
free_derivs(&ph, ntotal);
free_dvector(rt, 0, ntotal - 1);
free_dvector(s, 0, ntotal - 1);
/* free_dvector(sh, 0, ntotal-1);*/
free_derivs(&sh, ntotal);
free_dvector(t, 0, ntotal - 1);
free_dvector(vv, 0, ntotal - 1);
free_dvector(F, 0, ntotal - 1);
free_derivs(&u, ntotal);
free_dmatrix(JFD, 0, ntotal - 1, 0, maxcol - 1);
free_imatrix(cols, 0, ntotal - 1, 0, maxcol - 1);
free_ivector(ncols, 0, ntotal - 1);
/* iteration failed */
if (ii > itmax)
return -1;
/* breakdown */
if (fabs(rho) < rhotol)
return -10;
if (fabs(omega) < omegatol)
return -11;
/* success! */
return ii + 1;
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::allocate_derivs(derivs *v, int n)
{
int m = n - 1;
(*v).d0 = dvector(0, m);
(*v).d1 = dvector(0, m);
(*v).d2 = dvector(0, m);
(*v).d3 = dvector(0, m);
(*v).d11 = dvector(0, m);
(*v).d12 = dvector(0, m);
(*v).d13 = dvector(0, m);
(*v).d22 = dvector(0, m);
(*v).d23 = dvector(0, m);
(*v).d33 = dvector(0, m);
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::free_derivs(derivs *v, int n)
{
int m = n - 1;
free_dvector((*v).d0, 0, m);
free_dvector((*v).d1, 0, m);
free_dvector((*v).d2, 0, m);
free_dvector((*v).d3, 0, m);
free_dvector((*v).d11, 0, m);
free_dvector((*v).d12, 0, m);
free_dvector((*v).d13, 0, m);
free_dvector((*v).d22, 0, m);
free_dvector((*v).d23, 0, m);
free_dvector((*v).d33, 0, m);
}
/* --------------------------------------------------------------------------*/
int TwoPunctures::Index(int ivar, int i, int j, int k, int nvar, int n1, int n2, int n3)
{
int i1 = i, j1 = j, k1 = k;
if (i1 < 0)
i1 = -(i1 + 1);
if (i1 >= n1)
i1 = 2 * n1 - (i1 + 1);
if (j1 < 0)
j1 = -(j1 + 1);
if (j1 >= n2)
j1 = 2 * n2 - (j1 + 1);
if (k1 < 0)
k1 = k1 + n3;
if (k1 >= n3)
k1 = k1 - n3;
return ivar + nvar * (i1 + n1 * (j1 + n2 * k1));
}
/*-----------------------------------------------------------*/
/******** Nonlinear Equations ***********/
/*-----------------------------------------------------------*/
void TwoPunctures::NonLinEquations(double rho_adm,
double A, double B, double X, double R,
double x, double r, double phi,
double y, double z, derivs U, double *values)
{
double r_plus, r_minus, psi, psi2, psi4, psi7;
double mu;
r_plus = sqrt((x - par_b) * (x - par_b) + y * y + z * z);
r_minus = sqrt((x + par_b) * (x + par_b) + y * y + z * z);
psi = 1. + 0.5 * par_m_plus / r_plus + 0.5 * par_m_minus / r_minus + U.d0[0];
psi2 = psi * psi;
psi4 = psi2 * psi2;
psi7 = psi * psi2 * psi4;
values[0] = U.d11[0] + U.d22[0] + U.d33[0] + 0.125 * BY_KKofxyz(x, y, z) / psi7 + 2.0 * Pi / psi2 / psi * rho_adm;
}
double TwoPunctures::BY_KKofxyz(double x, double y, double z)
{
int i, j;
double r_plus, r2_plus, r3_plus, r_minus, r2_minus, r3_minus, np_Pp, nm_Pm,
Aij, AijAij, n_plus[3], n_minus[3], np_Sp[3], nm_Sm[3];
r2_plus = (x - par_b) * (x - par_b) + y * y + z * z;
r2_minus = (x + par_b) * (x + par_b) + y * y + z * z;
r_plus = sqrt(r2_plus);
r_minus = sqrt(r2_minus);
r3_plus = r_plus * r2_plus;
r3_minus = r_minus * r2_minus;
n_plus[0] = (x - par_b) / r_plus;
n_minus[0] = (x + par_b) / r_minus;
n_plus[1] = y / r_plus;
n_minus[1] = y / r_minus;
n_plus[2] = z / r_plus;
n_minus[2] = z / r_minus;
/* dot product: np_Pp = (n_+).(P_+); nm_Pm = (n_-).(P_-) */
np_Pp = 0;
nm_Pm = 0;
for (i = 0; i < 3; i++)
{
np_Pp += n_plus[i] * par_P_plus[i];
nm_Pm += n_minus[i] * par_P_minus[i];
}
/* cross product: np_Sp[i] = [(n_+) x (S_+)]_i; nm_Sm[i] = [(n_-) x (S_-)]_i*/
np_Sp[0] = n_plus[1] * par_S_plus[2] - n_plus[2] * par_S_plus[1];
np_Sp[1] = n_plus[2] * par_S_plus[0] - n_plus[0] * par_S_plus[2];
np_Sp[2] = n_plus[0] * par_S_plus[1] - n_plus[1] * par_S_plus[0];
nm_Sm[0] = n_minus[1] * par_S_minus[2] - n_minus[2] * par_S_minus[1];
nm_Sm[1] = n_minus[2] * par_S_minus[0] - n_minus[0] * par_S_minus[2];
nm_Sm[2] = n_minus[0] * par_S_minus[1] - n_minus[1] * par_S_minus[0];
AijAij = 0;
for (i = 0; i < 3; i++)
{
for (j = 0; j < 3; j++)
{ /* Bowen-York-Curvature :*/
Aij =
+1.5 * (par_P_plus[i] * n_plus[j] + par_P_plus[j] * n_plus[i] + np_Pp * n_plus[i] * n_plus[j]) / r2_plus + 1.5 * (par_P_minus[i] * n_minus[j] + par_P_minus[j] * n_minus[i] + nm_Pm * n_minus[i] * n_minus[j]) / r2_minus - 3.0 * (np_Sp[i] * n_plus[j] + np_Sp[j] * n_plus[i]) / r3_plus - 3.0 * (nm_Sm[i] * n_minus[j] + nm_Sm[j] * n_minus[i]) / r3_minus;
if (i == j)
Aij -= +1.5 * (np_Pp / r2_plus + nm_Pm / r2_minus);
AijAij += Aij * Aij;
}
}
return AijAij;
}
void TwoPunctures::SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u,
int *ncols, int **cols, double **Matrix)
{
int column, row, mcol;
int i, i1, i_0, i_1, j, j1, j_0, j_1, k, k1, k_0, k_1, N1, N2, N3,
ivar, ivar1, ntotal = nvar * n1 * n2 * n3;
double *values;
derivs dv;
values = dvector(0, nvar - 1);
allocate_derivs(&dv, ntotal);
N1 = n1 - 1;
N2 = n2 - 1;
N3 = n3 - 1;
for (i = 0; i < n1; i++)
{
for (j = 0; j < n2; j++)
{
for (k = 0; k < n3; k++)
{
for (ivar = 0; ivar < nvar; ivar++)
{
row = Index(ivar, i, j, k, nvar, n1, n2, n3);
ncols[row] = 0;
dv.d0[row] = 0;
}
}
}
}
for (i = 0; i < n1; i++)
{
for (j = 0; j < n2; j++)
{
for (k = 0; k < n3; k++)
{
for (ivar = 0; ivar < nvar; ivar++)
{
column = Index(ivar, i, j, k, nvar, n1, n2, n3);
dv.d0[column] = 1;
i_0 = maximum2(0, i - 1);
i_1 = minimum2(N1, i + 1);
j_0 = maximum2(0, j - 1);
j_1 = minimum2(N2, j + 1);
k_0 = k - 1;
k_1 = k + 1;
/* i_0 = 0;
i_1 = N1;
j_0 = 0;
j_1 = N2;
k_0 = 0;
k_1 = N3;*/
for (i1 = i_0; i1 <= i_1; i1++)
{
for (j1 = j_0; j1 <= j_1; j1++)
{
for (k1 = k_0; k1 <= k_1; k1++)
{
JFD_times_dv(i1, j1, k1, nvar, n1, n2, n3,
dv, u, values);
for (ivar1 = 0; ivar1 < nvar; ivar1++)
{
if (values[ivar1] != 0)
{
row = Index(ivar1, i1, j1, k1, nvar, n1, n2, n3);
mcol = ncols[row];
cols[row][mcol] = column;
Matrix[row][mcol] = values[ivar1];
ncols[row] += 1;
}
}
}
}
}
dv.d0[column] = 0;
}
}
}
}
free_derivs(&dv, ntotal);
free_dvector(values, 0, nvar - 1);
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u)
{ /* Calculates the left hand sides of the non-linear equations F_m(v_n)=0*/
/* and the function u (u.d0[]) as well as its derivatives*/
/* (u.d1[], u.d2[], u.d3[], u.d11[], u.d12[], u.d13[], u.d22[], u.d23[], u.d33[])*/
/* at interior points and at the boundaries "+/-"*/
int i, j, k, ivar, indx;
double al, be, A, B, X, R, x, r, phi, y, z, Am1;
Derivatives_AB3_MatMul(nvar, n1, n2, n3, dv);
#pragma omp parallel for schedule(static) \
private(j, k, ivar, indx, al, be, A, B, X, R, x, r, phi, y, z, Am1)
for (i = 0; i < n1; i++)
{
// Thread-local derivs on stack (nvar=1)
double l_val[1];
double l_dU_d0[1], l_dU_d1[1], l_dU_d2[1], l_dU_d3[1];
double l_dU_d11[1], l_dU_d12[1], l_dU_d13[1], l_dU_d22[1], l_dU_d23[1], l_dU_d33[1];
double l_U_d0[1], l_U_d1[1], l_U_d2[1], l_U_d3[1];
double l_U_d11[1], l_U_d12[1], l_U_d13[1], l_U_d22[1], l_U_d23[1], l_U_d33[1];
derivs l_dU, l_U;
l_dU.d0=l_dU_d0; l_dU.d1=l_dU_d1; l_dU.d2=l_dU_d2; l_dU.d3=l_dU_d3;
l_dU.d11=l_dU_d11; l_dU.d12=l_dU_d12; l_dU.d13=l_dU_d13;
l_dU.d22=l_dU_d22; l_dU.d23=l_dU_d23; l_dU.d33=l_dU_d33;
l_U.d0=l_U_d0; l_U.d1=l_U_d1; l_U.d2=l_U_d2; l_U.d3=l_U_d3;
l_U.d11=l_U_d11; l_U.d12=l_U_d12; l_U.d13=l_U_d13;
l_U.d22=l_U_d22; l_U.d23=l_U_d23; l_U.d33=l_U_d33;
for (j = 0; j < n2; j++)
{
for (k = 0; k < n3; k++)
{
al = Pih * (2 * i + 1) / n1;
A = -cos(al);
be = Pih * (2 * j + 1) / n2;
B = -cos(be);
phi = 2. * Pi * k / n3;
Am1 = A - 1;
for (ivar = 0; ivar < nvar; ivar++)
{
indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
l_dU.d0[ivar] = Am1 * dv.d0[indx];
l_dU.d1[ivar] = dv.d0[indx] + Am1 * dv.d1[indx];
l_dU.d2[ivar] = Am1 * dv.d2[indx];
l_dU.d3[ivar] = Am1 * dv.d3[indx];
l_dU.d11[ivar] = 2 * dv.d1[indx] + Am1 * dv.d11[indx];
l_dU.d12[ivar] = dv.d2[indx] + Am1 * dv.d12[indx];
l_dU.d13[ivar] = dv.d3[indx] + Am1 * dv.d13[indx];
l_dU.d22[ivar] = Am1 * dv.d22[indx];
l_dU.d23[ivar] = Am1 * dv.d23[indx];
l_dU.d33[ivar] = Am1 * dv.d33[indx];
l_U.d0[ivar] = u.d0[indx];
l_U.d1[ivar] = u.d1[indx];
l_U.d2[ivar] = u.d2[indx];
l_U.d3[ivar] = u.d3[indx];
l_U.d11[ivar] = u.d11[indx];
l_U.d12[ivar] = u.d12[indx];
l_U.d13[ivar] = u.d13[indx];
l_U.d22[ivar] = u.d22[indx];
l_U.d23[ivar] = u.d23[indx];
l_U.d33[ivar] = u.d33[indx];
}
AB_To_XR(nvar, A, B, &X, &R, l_dU);
C_To_c(nvar, X, R, &x, &r, l_dU);
rx3_To_xyz(nvar, x, r, phi, &y, &z, l_dU);
LinEquations(A, B, X, R, x, r, phi, y, z, l_dU, l_U, l_val);
for (ivar = 0; ivar < nvar; ivar++)
{
indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
Jdv[indx] = l_val[ivar] * sin(al) * sin(be) * sin(al) * sin(be) * sin(al) * sin(be);
}
}
}
}
}
/* --------------------------------------------------------------------------*/
/* --------------------------------------------------------------------------
* relax_omp: OpenMP-parallelized replacement for relax()
*
* Parallelism analysis:
* - The red-black ordering within each phi-plane means that
* same-parity lines in the i-direction are INDEPENDENT of each other
* (they only couple through the j-direction which is solved internally).
* - Similarly, same-parity lines in the j-direction are independent.
* - Different phi-planes (k) with same parity are independent.
*
* Strategy:
* - Parallelize the i-loop within each (k, parity) group for LineRelax_be
* - Parallelize the j-loop within each (k, parity) group for LineRelax_al
* - Each thread uses its own pre-allocated workspace (tid-indexed)
* --------------------------------------------------------------------------*/
void TwoPunctures::relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD)
{
int n;
// 偶数k平面
for (n = 0; n < N_PlaneRelax; n++)
{
// 偶数i线所有偶数k —— 不同k平面完全独立
int n_even_k = (n3 + 1) / 2; // 偶数k的数量
int n_even_i = (n1 - 2 + 1) / 2; // i=2,4,...的数量
int total_tasks = n_even_k * n_even_i;
#pragma omp parallel for schedule(static)
for (int task = 0; task < total_tasks; task++) {
int tid = omp_get_thread_num();
int ki = task / n_even_i;
int ii = task % n_even_i;
int k = ki * 2;
int i = 2 + ii * 2;
LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
}
// 奇数i线所有偶数k
int n_odd_i = n1 / 2; // i=1,3,...的数量
total_tasks = n_even_k * n_odd_i;
#pragma omp parallel for schedule(static)
for (int task = 0; task < total_tasks; task++) {
int tid = omp_get_thread_num();
int ki = task / n_odd_i;
int ii = task % n_odd_i;
int k = ki * 2;
int i = 1 + ii * 2;
LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
}
// 奇数j线所有偶数k
int n_odd_j = (n2 - 1 + 1) / 2;
total_tasks = n_even_k * n_odd_j;
#pragma omp parallel for schedule(static)
for (int task = 0; task < total_tasks; task++) {
int tid = omp_get_thread_num();
int ki = task / n_odd_j;
int ji = task % n_odd_j;
int k = ki * 2;
int j = 1 + ji * 2;
LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
}
// 偶数j线所有偶数k
int n_even_j = (n2 + 1) / 2;
total_tasks = n_even_k * n_even_j;
#pragma omp parallel for schedule(static)
for (int task = 0; task < total_tasks; task++) {
int tid = omp_get_thread_num();
int ki = task / n_even_j;
int ji = task % n_even_j;
int k = ki * 2;
int j = ji * 2;
LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
}
// 奇数k平面 — 同样的四步
int n_odd_k = n3 / 2;
// 偶数i线所有奇数k
n_even_i = (n1 + 1) / 2; // i=0,2,...
total_tasks = n_odd_k * n_even_i;
#pragma omp parallel for schedule(static)
for (int task = 0; task < total_tasks; task++) {
int tid = omp_get_thread_num();
int ki = task / n_even_i;
int ii = task % n_even_i;
int k = 1 + ki * 2;
int i = ii * 2;
LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
}
// 奇数i线所有奇数k
total_tasks = n_odd_k * n_odd_i;
#pragma omp parallel for schedule(static)
for (int task = 0; task < total_tasks; task++) {
int tid = omp_get_thread_num();
int ki = task / n_odd_i;
int ii = task % n_odd_i;
int k = 1 + ki * 2;
int i = 1 + ii * 2;
LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
}
// 奇数j线所有奇数k
total_tasks = n_odd_k * n_odd_j;
#pragma omp parallel for schedule(static)
for (int task = 0; task < total_tasks; task++) {
int tid = omp_get_thread_num();
int ki = task / n_odd_j;
int ji = task % n_odd_j;
int k = 1 + ki * 2;
int j = 1 + ji * 2;
LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
}
// 偶数j线所有奇数k
total_tasks = n_odd_k * n_even_j;
#pragma omp parallel for schedule(static)
for (int task = 0; task < total_tasks; task++) {
int tid = omp_get_thread_num();
int ki = task / n_even_j;
int ji = task % n_even_j;
int k = 1 + ki * 2;
int j = ji * 2;
LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
}
}
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::LineRelax_be_omp(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD, int tid)
{
int j, m, Ic, Ip, Im, col, ivar;
// Use pre-allocated per-thread workspace instead of new/delete
double *diag = ws_diag_be[tid];
double *e = ws_e_be[tid];
double *f = ws_f_be[tid];
double *b = ws_b_be[tid];
double *x = ws_x_be[tid];
for (ivar = 0; ivar < nvar; ivar++)
{
for (j = 0; j < n2 - 1; j++)
{
diag[j] = e[j] = f[j] = 0;
}
diag[n2 - 1] = 0;
// gsl_vector_set_zero(diag);
// gsl_vector_set_zero(e);
// gsl_vector_set_zero(f);
for (j = 0; j < n2; j++)
{
Ip = Index(ivar, i, j + 1, k, nvar, n1, n2, n3);
Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
Im = Index(ivar, i, j - 1, k, nvar, n1, n2, n3);
b[j] = rhs[Ic];
// gsl_vector_set(b,j,rhs[Ic]);
for (m = 0; m < ncols[Ic]; m++)
{
col = cols[Ic][m];
if (col != Ip && col != Ic && col != Im)
b[j] -= JFD[Ic][m] * dv[col];
// *gsl_vector_ptr(b, j) -= JFD[Ic][m] * dv[col];
else
{
if (col == Im && j > 0)
f[j - 1] = JFD[Ic][m];
// gsl_vector_set(f,j-1,JFD[Ic][m]);
if (col == Ic)
diag[j] = JFD[Ic][m];
// gsl_vector_set(diag,j,JFD[Ic][m]);
if (col == Ip && j < n2 - 1)
e[j] = JFD[Ic][m];
// gsl_vector_set(e,j,JFD[Ic][m]);
}
}
}
ThomasAlgorithm_ws(n2, f, diag, e, x, b,
ws_l_be[tid], ws_u_be[tid], ws_d_be[tid], ws_y_be[tid]);
for (j = 0; j < n2; j++)
{
Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
dv[Ic] = x[j];
// dv[Ic] = gsl_vector_get(x, j);
}
}
// No delete — workspace is persistent
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
int n3, derivs dv, derivs u, double *values)
{ /* Calculates rows of the vector 'J(FD)*dv'.*/
/* First row to be calculated: row = Index(0, i, j, k; nvar, n1, n2, n3)*/
/* Last row to be calculated: row = Index(nvar-1, i, j, k; nvar, n1, n2, n3)*/
/* These rows are stored in the vector JFDdv[0] ... JFDdv[nvar-1].*/
int ivar, indx;
double al, be, A, B, X, R, x, r, phi, y, z, Am1;
double sin_al, sin_al_i1, sin_al_i2, sin_al_i3, cos_al;
double sin_be, sin_be_i1, sin_be_i2, sin_be_i3, cos_be;
double dV0, dV1, dV2, dV3, dV11, dV12, dV13, dV22, dV23, dV33,
ha, ga, ga2, hb, gb, gb2, hp, gp, gp2, gagb, gagp, gbgp;
// Stack-allocated derivs (nvar=1) — no malloc/free!
double dU_d0[1], dU_d1[1], dU_d2[1], dU_d3[1];
double dU_d11[1], dU_d12[1], dU_d13[1], dU_d22[1], dU_d23[1], dU_d33[1];
double U_d0[1], U_d1[1], U_d2[1], U_d3[1];
double U_d11[1], U_d12[1], U_d13[1], U_d22[1], U_d23[1], U_d33[1];
derivs dU, U;
dU.d0=dU_d0; dU.d1=dU_d1; dU.d2=dU_d2; dU.d3=dU_d3;
dU.d11=dU_d11; dU.d12=dU_d12; dU.d13=dU_d13;
dU.d22=dU_d22; dU.d23=dU_d23; dU.d33=dU_d33;
U.d0=U_d0; U.d1=U_d1; U.d2=U_d2; U.d3=U_d3;
U.d11=U_d11; U.d12=U_d12; U.d13=U_d13;
U.d22=U_d22; U.d23=U_d23; U.d33=U_d33;
if (k < 0)
k = k + n3;
if (k >= n3)
k = k - n3;
ha = Pi / n1; /* ha: Stepsize with respect to (al)*/
al = ha * (i + 0.5);
A = -cos(al);
ga = 1 / ha;
ga2 = ga * ga;
hb = Pi / n2; /* hb: Stepsize with respect to (be)*/
be = hb * (j + 0.5);
B = -cos(be);
gb = 1 / hb;
gb2 = gb * gb;
gagb = ga * gb;
hp = 2 * Pi / n3; /* hp: Stepsize with respect to (phi)*/
phi = hp * j;
gp = 1 / hp;
gp2 = gp * gp;
gagp = ga * gp;
gbgp = gb * gp;
sin_al = sin(al);
sin_be = sin(be);
sin_al_i1 = 1 / sin_al;
sin_be_i1 = 1 / sin_be;
sin_al_i2 = sin_al_i1 * sin_al_i1;
sin_be_i2 = sin_be_i1 * sin_be_i1;
sin_al_i3 = sin_al_i1 * sin_al_i2;
sin_be_i3 = sin_be_i1 * sin_be_i2;
cos_al = -A;
cos_be = -B;
Am1 = A - 1;
for (ivar = 0; ivar < nvar; ivar++)
{
int iccc = Index(ivar, i, j, k, nvar, n1, n2, n3),
ipcc = Index(ivar, i + 1, j, k, nvar, n1, n2, n3),
imcc = Index(ivar, i - 1, j, k, nvar, n1, n2, n3),
icpc = Index(ivar, i, j + 1, k, nvar, n1, n2, n3),
icmc = Index(ivar, i, j - 1, k, nvar, n1, n2, n3),
iccp = Index(ivar, i, j, k + 1, nvar, n1, n2, n3),
iccm = Index(ivar, i, j, k - 1, nvar, n1, n2, n3),
icpp = Index(ivar, i, j + 1, k + 1, nvar, n1, n2, n3),
icmp = Index(ivar, i, j - 1, k + 1, nvar, n1, n2, n3),
icpm = Index(ivar, i, j + 1, k - 1, nvar, n1, n2, n3),
icmm = Index(ivar, i, j - 1, k - 1, nvar, n1, n2, n3),
ipcp = Index(ivar, i + 1, j, k + 1, nvar, n1, n2, n3),
imcp = Index(ivar, i - 1, j, k + 1, nvar, n1, n2, n3),
ipcm = Index(ivar, i + 1, j, k - 1, nvar, n1, n2, n3),
imcm = Index(ivar, i - 1, j, k - 1, nvar, n1, n2, n3),
ippc = Index(ivar, i + 1, j + 1, k, nvar, n1, n2, n3),
impc = Index(ivar, i - 1, j + 1, k, nvar, n1, n2, n3),
ipmc = Index(ivar, i + 1, j - 1, k, nvar, n1, n2, n3),
immc = Index(ivar, i - 1, j - 1, k, nvar, n1, n2, n3);
/* Derivatives of (dv) w.r.t. (al,be,phi):*/
dV0 = dv.d0[iccc];
dV1 = 0.5 * ga * (dv.d0[ipcc] - dv.d0[imcc]);
dV2 = 0.5 * gb * (dv.d0[icpc] - dv.d0[icmc]);
dV3 = 0.5 * gp * (dv.d0[iccp] - dv.d0[iccm]);
dV11 = ga2 * (dv.d0[ipcc] + dv.d0[imcc] - 2 * dv.d0[iccc]);
dV22 = gb2 * (dv.d0[icpc] + dv.d0[icmc] - 2 * dv.d0[iccc]);
dV33 = gp2 * (dv.d0[iccp] + dv.d0[iccm] - 2 * dv.d0[iccc]);
dV12 = 0.25 * gagb * (dv.d0[ippc] - dv.d0[ipmc] + dv.d0[immc] - dv.d0[impc]);
dV13 = 0.25 * gagp * (dv.d0[ipcp] - dv.d0[imcp] + dv.d0[imcm] - dv.d0[ipcm]);
dV23 = 0.25 * gbgp * (dv.d0[icpp] - dv.d0[icpm] + dv.d0[icmm] - dv.d0[icmp]);
/* Derivatives of (dv) w.r.t. (A,B,phi):*/
dV11 = sin_al_i3 * (sin_al * dV11 - cos_al * dV1);
dV12 = sin_al_i1 * sin_be_i1 * dV12;
dV13 = sin_al_i1 * dV13;
dV22 = sin_be_i3 * (sin_be * dV22 - cos_be * dV2);
dV23 = sin_be_i1 * dV23;
dV1 = sin_al_i1 * dV1;
dV2 = sin_be_i1 * dV2;
/* Derivatives of (dU) w.r.t. (A,B,phi):*/
dU.d0[ivar] = Am1 * dV0;
dU.d1[ivar] = dV0 + Am1 * dV1;
dU.d2[ivar] = Am1 * dV2;
dU.d3[ivar] = Am1 * dV3;
dU.d11[ivar] = 2 * dV1 + Am1 * dV11;
dU.d12[ivar] = dV2 + Am1 * dV12;
dU.d13[ivar] = dV3 + Am1 * dV13;
dU.d22[ivar] = Am1 * dV22;
dU.d23[ivar] = Am1 * dV23;
dU.d33[ivar] = Am1 * dV33;
indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
U.d0[ivar] = u.d0[indx]; /* U */
U.d1[ivar] = u.d1[indx]; /* U_x*/
U.d2[ivar] = u.d2[indx]; /* U_y*/
U.d3[ivar] = u.d3[indx]; /* U_z*/
U.d11[ivar] = u.d11[indx]; /* U_xx*/
U.d12[ivar] = u.d12[indx]; /* U_xy*/
U.d13[ivar] = u.d13[indx]; /* U_xz*/
U.d22[ivar] = u.d22[indx]; /* U_yy*/
U.d23[ivar] = u.d23[indx]; /* U_yz*/
U.d33[ivar] = u.d33[indx]; /* U_zz*/
}
/* Calculation of (X,R) and*/
/* (dU_X, dU_R, dU_3, dU_XX, dU_XR, dU_X3, dU_RR, dU_R3, dU_33)*/
AB_To_XR(nvar, A, B, &X, &R, dU);
/* Calculation of (x,r) and*/
/* (dU, dU_x, dU_r, dU_3, dU_xx, dU_xr, dU_x3, dU_rr, dU_r3, dU_33)*/
C_To_c(nvar, X, R, &x, &r, dU);
/* Calculation of (y,z) and*/
/* (dU, dU_x, dU_y, dU_z, dU_xx, dU_xy, dU_xz, dU_yy, dU_yz, dU_zz)*/
rx3_To_xyz(nvar, x, r, phi, &y, &z, dU);
LinEquations(A, B, X, R, x, r, phi, y, z, dU, U, values);
double FAC_val = sin_al * sin_be * sin_al * sin_be * sin_al * sin_be;
for (ivar = 0; ivar < nvar; ivar++)
values[ivar] *= FAC_val;
// No free_derivs needed — everything is on the stack
}
#undef FAC
/*-----------------------------------------------------------*/
/******** Linear Equations ***********/
/*-----------------------------------------------------------*/
void TwoPunctures::LinEquations(double A, double B, double X, double R,
double x, double r, double phi,
double y, double z, derivs dU, derivs U, double *values)
{
double r_plus, r_minus, psi, psi2, psi4, psi8;
r_plus = sqrt((x - par_b) * (x - par_b) + y * y + z * z);
r_minus = sqrt((x + par_b) * (x + par_b) + y * y + z * z);
psi =
1. + 0.5 * par_m_plus / r_plus + 0.5 * par_m_minus / r_minus + U.d0[0];
psi2 = psi * psi;
psi4 = psi2 * psi2;
psi8 = psi4 * psi4;
values[0] = dU.d11[0] + dU.d22[0] + dU.d33[0] - 0.875 * BY_KKofxyz(x, y, z) / psi8 * dU.d0[0];
}
/* -------------------------------------------------------------------------*/
void TwoPunctures::LineRelax_al_omp(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD, int tid)
{
int i, m, Ic, Ip, Im, col, ivar;
double *diag = ws_diag_al[tid];
double *e = ws_e_al[tid];
double *f = ws_f_al[tid];
double *b = ws_b_al[tid];
double *x = ws_x_al[tid];
for (ivar = 0; ivar < nvar; ivar++)
{
for (i = 0; i < n1 - 1; i++)
{
diag[i] = e[i] = f[i] = 0;
}
diag[n1 - 1] = 0;
// gsl_vector_set_zero(diag);
// gsl_vector_set_zero(e);
// gsl_vector_set_zero(f);
for (i = 0; i < n1; i++)
{
Ip = Index(ivar, i + 1, j, k, nvar, n1, n2, n3);
Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
Im = Index(ivar, i - 1, j, k, nvar, n1, n2, n3);
b[i] = rhs[Ic];
// gsl_vector_set(b,i,rhs[Ic]);
for (m = 0; m < ncols[Ic]; m++)
{
col = cols[Ic][m];
if (col != Ip && col != Ic && col != Im)
b[i] -= JFD[Ic][m] * dv[col];
// *gsl_vector_ptr(b, i) -= JFD[Ic][m] * dv[col];
else
{
if (col == Im && i > 0)
f[i - 1] = JFD[Ic][m];
// gsl_vector_set(f,i-1,JFD[Ic][m]);
if (col == Ic)
diag[i] = JFD[Ic][m];
// gsl_vector_set(diag,i,JFD[Ic][m]);
if (col == Ip && i < n1 - 1)
e[i] = JFD[Ic][m];
// gsl_vector_set(e,i,JFD[Ic][m]);
}
}
}
ThomasAlgorithm_ws(n1, f, diag, e, x, b,
ws_l_al[tid], ws_u_al[tid], ws_d_al[tid], ws_y_al[tid]);
for (i = 0; i < n1; i++)
{
Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
dv[Ic] = x[i];
// dv[Ic] = gsl_vector_get(x, i);
}
}
}
/* -------------------------------------------------------------------------*/
// a[N], b[N-1], c[N-1], x[N], q[N]
// A x = q
// A = ( a_0 c_0 0 0 )
// ( b_0 a_1 c_1 0 )
// ( 0 b_1 a_2 c_2 )
// ( 0 0 b_2 a_3 )
//"Parallel Scientific Computing in C++ and MPI" P361
void TwoPunctures::ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q)
{
int i;
double *l, *u, *d, *y;
l = new double[N - 1];
u = new double[N - 1];
d = new double[N];
y = new double[N];
/* LU Decomposition */
d[0] = a[0];
u[0] = c[0];
for (i = 0; i < N - 2; i++)
{
l[i] = b[i] / d[i];
d[i + 1] = a[i + 1] - l[i] * u[i];
u[i + 1] = c[i + 1];
}
l[N - 2] = b[N - 2] / d[N - 2];
d[N - 1] = a[N - 1] - l[N - 2] * u[N - 2];
/* Forward Substitution [L][y] = [q] */
y[0] = q[0];
for (i = 1; i < N; i++)
y[i] = q[i] - l[i - 1] * y[i - 1];
/* Backward Substitution [U][x] = [y] */
x[N - 1] = y[N - 1] / d[N - 1];
for (i = N - 2; i >= 0; i--)
x[i] = (y[i] - u[i] * x[i + 1]) / d[i];
delete[] l;
delete[] u;
delete[] d;
delete[] y;
return;
}
// ThomasAlgorithm with pre-allocated workspace (no new/delete)
// l[N-1], u_ws[N-1], d[N], y[N] are caller-provided workspace
void TwoPunctures::ThomasAlgorithm_ws(int N, double *b, double *a, double *c,
double *x, double *q,
double *l, double *u_ws, double *d, double *y)
{
/* LU Decomposition */
d[0] = a[0];
u_ws[0] = c[0];
for (int i = 0; i < N - 2; i++) {
l[i] = b[i] / d[i];
d[i + 1] = a[i + 1] - l[i] * u_ws[i];
u_ws[i + 1] = c[i + 1];
}
l[N - 2] = b[N - 2] / d[N - 2];
d[N - 1] = a[N - 1] - l[N - 2] * u_ws[N - 2];
/* Forward Substitution [L][y] = [q] */
y[0] = q[0];
for (int i = 1; i < N; i++)
y[i] = q[i] - l[i - 1] * y[i - 1];
/* Backward Substitution [U][x] = [y] */
x[N - 1] = y[N - 1] / d[N - 1];
for (int i = N - 2; i >= 0; i--)
x[i] = (y[i] - u_ws[i] * x[i + 1]) / d[i];
}
// --------------------------------------------------------------------------*/
// Calculates the value of v at an arbitrary position (x,y,z) if the spectral coefficients are know*/*/
/* --------------------------------------------------------------------------*/
/* Calculates the value of v at an arbitrary position (A,B,phi)*/
double TwoPunctures::Spec_IntPolABphiFast(parameters par, double *v, int ivar, double A, double B, double phi)
{
int i, j, k, N;
double *p, *values1, **values2, result;
int nvar = par.nvar;
int n1 = par.n1;
int n2 = par.n2;
int n3 = par.n3;
N = maximum3(n1, n2, n3);
p = dvector(0, N);
values1 = dvector(0, N);
values2 = dmatrix(0, N, 0, N);
for (k = 0; k < n3; k++)
{
for (j = 0; j < n2; j++)
{
for (i = 0; i < n1; i++)
p[i] = v[ivar + nvar * (i + n1 * (j + n2 * k))];
// chebft_Zeros (p, n1, 0);
values2[j][k] = chebev(-1, 1, p, n1, A);
}
}
for (k = 0; k < n3; k++)
{
for (j = 0; j < n2; j++)
p[j] = values2[j][k];
// chebft_Zeros (p, n2, 0);
values1[k] = chebev(-1, 1, p, n2, B);
}
// fourft (values1, n3, 0);
result = fourev(values1, n3, phi);
free_dvector(p, 0, N);
free_dvector(values1, 0, N);
free_dmatrix(values2, 0, N, 0, N);
return result;
// */
// return 0.;
}
/* Calculates the value of v at an arbitrary position (x,y,z) given the spectral coefficients*/
double TwoPunctures::Spec_IntPolFast(parameters par, int ivar, double *v, double x, double y, double z)
{
double xs, ys, zs, rs2, phi, X, R, A, B, aux1, aux2, result, Ui;
int nvar = par.nvar;
int n1 = par.n1;
int n2 = par.n2;
int n3 = par.n3;
double par_b = par.b;
xs = x / par.b;
ys = y / par.b;
zs = z / par.b;
rs2 = ys * ys + zs * zs;
phi = atan2(z, y);
if (phi < 0)
phi += 2. * Pi;
aux1 = 0.5 * (xs * xs + rs2 - 1.);
aux2 = sqrt(aux1 * aux1 + rs2);
// Note from YT: aux2-aux1 can be equal to 1. When that happens, numerical
// truncation may make it slightly larger than 1. This makes
// R NAN! I also worry that aux2-aux1 and aux1+axu2 may become negative due to
// truncation error, which gives rise to NAN for X and R.
// The following few lines attempt to fix these.
double aux2_plus_aux1, aux2_minus_aux1;
if (aux1 < 0)
{
aux2_plus_aux1 = rs2 / (aux2 - aux1);
aux2_minus_aux1 = aux2 - aux1;
}
else
{
aux2_plus_aux1 = aux2 + aux1;
aux2_minus_aux1 = rs2 / (aux2 + aux1);
}
if (fabs(aux1) + fabs(aux2) < 1.e-20)
{
aux2_plus_aux1 = 0.0;
aux2_minus_aux1 = 0.0;
}
double sqrt_aux2_minus_aux1 = sqrt(fabs(aux2_minus_aux1));
// Note from YT: The following two lines have replaced by the 6 lines belows.
// X = asinhd(sqrt(aux1+aux2));
// R = asin(sqrt(fabs(-aux1+aux2)));
X = asinh(sqrt(aux2_plus_aux1));
if (sqrt_aux2_minus_aux1 > 1.0)
{
R = 0.5 * Pi;
}
else
{
R = asin(sqrt_aux2_minus_aux1);
}
if (x < 0)
R = Pi - R;
A = 2. * tanh(0.5 * X) - 1.;
B = tan(0.5 * R - Piq);
result = Spec_IntPolABphiFast(par, v, ivar, A, B, phi);
Ui = (A - 1) * result;
return Ui;
}
// Evaluates the spectral expansion coefficients of v
void TwoPunctures::SpecCoef(parameters par, int ivar, double *v, double *cf)
{
// Here v is a pointer to the values of the variable v at the collocation points
int i, j, k, N, n, l, m;
double *p, ***values3, ***values4;
int nvar = par.nvar;
int n1 = par.n1;
int n2 = par.n2;
int n3 = par.n3;
N = maximum3(n1, n2, n3);
p = dvector(0, N);
values3 = d3tensor(0, n1, 0, n2, 0, n3);
values4 = d3tensor(0, n1, 0, n2, 0, n3);
// Caclulate values3[n,j,k] = a_n^{j,k} = (sum_i^(n1-1) f(A_i,B_j,phi_k) Tn(-A_i))/k_n , k_n = N/2 or N
for (k = 0; k < n3; k++)
{
for (j = 0; j < n2; j++)
{
for (i = 0; i < n1; i++)
p[i] = v[ivar + (i + n1 * (j + n2 * k))];
chebft_Zeros(p, n1, 0);
for (n = 0; n < n1; n++)
{
values3[n][j][k] = p[n];
}
}
}
// Caclulate values4[n,l,k] = a_{n,l}^{k} = (sum_j^(n2-1) a_n^{j,k} Tn(B_j))/k_l , k_l = N/2 or N
for (n = 0; n < n1; n++)
{
for (k = 0; k < n3; k++)
{
for (j = 0; j < n2; j++)
p[j] = values3[n][j][k];
chebft_Zeros(p, n2, 0);
for (l = 0; l < n2; l++)
{
values4[n][l][k] = p[l];
}
}
}
// Caclulate coefficients a_{n,l,m} = (sum_k^(n3-1) a_{n,m}^{k} fourier(phi_k))/k_m , k_m = N/2 or N
for (i = 0; i < n1; i++)
{
for (j = 0; j < n2; j++)
{
for (k = 0; k < n3; k++)
p[k] = values4[i][j][k];
fourft(p, n3, 0);
for (k = 0; k < n3; k++)
{
cf[ivar + (i + n1 * (j + n2 * k))] = p[k];
}
}
}
free_dvector(p, 0, N);
free_d3tensor(values3, 0, n1, 0, n2, 0, n3);
free_d3tensor(values4, 0, n1, 0, n2, 0, n3);
}
void TwoPunctures::allocate_workspace()
{
int n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi;
max_threads = omp_get_max_threads();
printf("Allocating workspace for %d threads\n", max_threads);
// LineRelax_be workspace: arrays of size n2, per thread
ws_diag_be = new double*[max_threads];
ws_e_be = new double*[max_threads];
ws_f_be = new double*[max_threads];
ws_b_be = new double*[max_threads];
ws_x_be = new double*[max_threads];
ws_l_be = new double*[max_threads];
ws_u_be = new double*[max_threads];
ws_d_be = new double*[max_threads];
ws_y_be = new double*[max_threads];
// LineRelax_al workspace: arrays of size n1, per thread
ws_diag_al = new double*[max_threads];
ws_e_al = new double*[max_threads];
ws_f_al = new double*[max_threads];
ws_b_al = new double*[max_threads];
ws_x_al = new double*[max_threads];
ws_l_al = new double*[max_threads];
ws_u_al = new double*[max_threads];
ws_d_al = new double*[max_threads];
ws_y_al = new double*[max_threads];
int N = (n1 > n2) ? n1 : n2; // max of n1, n2
for (int t = 0; t < max_threads; t++) {
ws_diag_be[t] = new double[n2];
ws_e_be[t] = new double[n2];
ws_f_be[t] = new double[n2];
ws_b_be[t] = new double[n2];
ws_x_be[t] = new double[n2];
ws_l_be[t] = new double[n2];
ws_u_be[t] = new double[n2];
ws_d_be[t] = new double[n2];
ws_y_be[t] = new double[n2];
ws_diag_al[t] = new double[n1];
ws_e_al[t] = new double[n1];
ws_f_al[t] = new double[n1];
ws_b_al[t] = new double[n1];
ws_x_al[t] = new double[n1];
ws_l_al[t] = new double[n1];
ws_u_al[t] = new double[n1];
ws_d_al[t] = new double[n1];
ws_y_al[t] = new double[n1];
}
}
void TwoPunctures::free_workspace()
{
for (int t = 0; t < max_threads; t++) {
delete[] ws_diag_be[t]; delete[] ws_e_be[t]; delete[] ws_f_be[t];
delete[] ws_b_be[t]; delete[] ws_x_be[t];
delete[] ws_l_be[t]; delete[] ws_u_be[t];
delete[] ws_d_be[t]; delete[] ws_y_be[t];
delete[] ws_diag_al[t]; delete[] ws_e_al[t]; delete[] ws_f_al[t];
delete[] ws_b_al[t]; delete[] ws_x_al[t];
delete[] ws_l_al[t]; delete[] ws_u_al[t];
delete[] ws_d_al[t]; delete[] ws_y_al[t];
}
delete[] ws_diag_be; delete[] ws_e_be; delete[] ws_f_be;
delete[] ws_b_be; delete[] ws_x_be;
delete[] ws_l_be; delete[] ws_u_be;
delete[] ws_d_be; delete[] ws_y_be;
delete[] ws_diag_al; delete[] ws_e_al; delete[] ws_f_al;
delete[] ws_b_al; delete[] ws_x_al;
delete[] ws_l_al; delete[] ws_u_al;
delete[] ws_d_al; delete[] ws_y_al;
}
/*==========================================================================
* Precomputed Spectral Derivative Matrices
*
* Mathematical equivalence proof:
*
* Original algorithm (per-line):
* 1. Forward Chebyshev transform: c = T * f (where T is the DCT matrix)
* 2. Spectral derivative: c' = Dhat * c (recurrence relation)
* 3. Inverse transform: f' = T^{-1} * c'
* Combined: f' = T^{-1} * Dhat * T * f = D * f
*
* The matrix D = T^{-1} * Dhat * T is precomputed once.
* Similarly D2 = T^{-1} * Dhat^2 * T for second derivatives.
*
* For Fourier: same idea with DFT matrices and frequency-domain derivatives.
*
* This converts n2*n3 separate O(n1^2) transforms into a single
* (n1 x n1) * (n1 x n2*n3) DGEMM call, which is BLAS Level-3
* and thus optimally parallelized by MKL.
*=========================================================================*/
void TwoPunctures::build_cheb_deriv_matrices(int n, double *D1, double *D2)
{
/* Build the physical-space derivative matrices for Chebyshev Zeros grid.
*
* Grid points: x_i = -cos(pi*(2i+1)/(2n)), i=0,...,n-1
*
* Method: Construct T (forward transform), Dhat (spectral derivative),
* T^{-1} (inverse transform), then D1 = T^{-1} * Dhat * T,
* D2 = T^{-1} * Dhat^2 * T.
*
* All matrices are n x n, stored in row-major order: M[i*n+j]
*/
double *T_fwd = new double[n * n]; // Forward transform matrix
double *T_inv = new double[n * n]; // Inverse transform matrix
double *Dhat = new double[n * n]; // Spectral derivative operator
double *Dhat2 = new double[n * n]; // Spectral second derivative operator
double *tmp1 = new double[n * n]; // Temporary
double *tmp2 = new double[n * n]; // Temporary
double Pion = Pi / n;
// Build forward Chebyshev transform matrix T
// c_j = (2/n) * (-1)^j * sum_k f_k * cos(pi*j*(k+0.5)/n)
// So T[j][k] = (2/n) * (-1)^j * cos(pi*j*(k+0.5)/n)
for (int j = 0; j < n; j++) {
double fac = (2.0 / n) * ((j % 2 == 0) ? 1.0 : -1.0);
for (int k = 0; k < n; k++) {
T_fwd[j * n + k] = fac * cos(Pion * j * (k + 0.5));
}
}
// Build inverse Chebyshev transform matrix T^{-1}
// f_j = sum_k c_k * cos(pi*(j+0.5)*k/n) * (-1)^k - 0.5*c_0
// But the -0.5*c_0 term is part of the sum when we write it as:
// f_j = -0.5*c_0 + sum_{k=0}^{n-1} c_k * cos(pi*(j+0.5)*k) * (-1)^k
// T_inv[j][k] = cos(pi*(j+0.5)*k/n) * (-1)^k, with k=0 term having extra -0.5
for (int j = 0; j < n; j++) {
for (int k = 0; k < n; k++) {
double sign_k = (k % 2 == 0) ? 1.0 : -1.0;
T_inv[j * n + k] = cos(Pion * (j + 0.5) * k) * sign_k;
}
// The k=0 term needs adjustment: the sum includes c_0*1 but we need -0.5*c_0 + c_0*1 = 0.5*c_0
// Wait, let me re-examine chebft_Zeros with inv=1:
// sum = -0.5 * u[0];
// for k: sum += u[k] * cos(Pion*(j+0.5)*k) * isignum; isignum alternates starting from 1
// So: c[j] = -0.5*u[0] + sum_{k=0}^{n-1} u[k]*cos(...)*(-1)^k
// = -0.5*u[0] + u[0]*1*1 + sum_{k=1} ...
// = 0.5*u[0] + sum_{k=1} u[k]*cos(...)*(-1)^k
// Equivalently: T_inv[j][0] = 0.5, T_inv[j][k] = cos(...)*(-1)^k for k>=1
// But cos(0) = 1 and (-1)^0 = 1, so the formula gives T_inv[j][0] = 1.0
// We need it to be 0.5. Fix:
T_inv[j * n + 0] = 0.5; // This accounts for the -0.5*u[0] + u[0]*cos(0)*1 = 0.5*u[0]
}
// Build spectral derivative matrix Dhat (in coefficient space)
// The recurrence: cder[n-1] = 0, cder[n-2] = 0,
// cder[j] = cder[j+2] + 2*(j+1)*c[j+1] for j = n-3,...,0
// This means cder = Dhat * c, where Dhat is upper triangular-ish.
// Dhat[j][k] = coefficient of c[k] contributing to cder[j]
//
// From the recurrence: cder[j] = sum_{k=j+1, k-j odd}^{n-1} 2*k * c[k]
// (with the factor 2k, summing over k > j where k-j is odd)
// Exception: cder[0] gets an extra factor of 0.5 since c[0] has the 2/n prefactor
// Actually no: the chder function is:
// cder[n] = cder[n-1] = 0
// cder[j] = cder[j+2] + 2*(j+1)*c[j+1]
// Unrolling: cder[j] = 2*(j+1)*c[j+1] + 2*(j+3)*c[j+3] + ...
// So Dhat[j][k] = 2*k if k > j and (k-j) is odd, else 0
for (int j = 0; j < n; j++)
for (int k = 0; k < n; k++)
Dhat[j * n + k] = 0.0;
for (int j = 0; j < n; j++) {
for (int k = j + 1; k < n; k++) {
if ((k - j) % 2 == 1) {
Dhat[j * n + k] = 2.0 * k;
}
}
}
// Build Dhat^2 = Dhat * Dhat
// D1 = T_inv * Dhat * T_fwd
// D2 = T_inv * Dhat^2 * T_fwd
// tmp1 = Dhat * T_fwd
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n, n, n, 1.0, Dhat, n, T_fwd, n, 0.0, tmp1, n);
// D1 = T_inv * tmp1
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n, n, n, 1.0, T_inv, n, tmp1, n, 0.0, D1, n);
// tmp2 = Dhat * Dhat (Dhat^2 in spectral space)
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n, n, n, 1.0, Dhat, n, Dhat, n, 0.0, tmp2, n);
// tmp1 = Dhat^2 * T_fwd
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n, n, n, 1.0, tmp2, n, T_fwd, n, 0.0, tmp1, n);
// D2 = T_inv * tmp1
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n, n, n, 1.0, T_inv, n, tmp1, n, 0.0, D2, n);
delete[] T_fwd;
delete[] T_inv;
delete[] Dhat;
delete[] Dhat2;
delete[] tmp1;
delete[] tmp2;
}
void TwoPunctures::build_fourier_deriv_matrices(int N, double *DF1, double *DF2)
{
/* Build Fourier derivative matrices in physical space.
*
* Grid: phi_k = 2*pi*k/N, k=0,...,N-1
*
* The Fourier interpolant derivative at grid points can be expressed as
* a matrix multiply. We build it by:
* 1. Forward Fourier transform matrix F
* 2. Frequency-domain derivative (multiply by il for first, -l^2 for second)
* 3. Inverse Fourier transform matrix F^{-1}
* DF1 = F^{-1} * diag(il) * F, DF2 = F^{-1} * diag(-l^2) * F
*
* But since fourft/fourev use a real representation (a_l, b_l),
* we construct directly in physical space.
*/
int M = N / 2;
double Pi_fac = Pi / M; // = 2*Pi/N
// DF1[j][k] = d/dphi of the interpolant at phi_j, due to value at phi_k
// Using the representation:
// f(phi) = 0.5*(a_0 + a_M*cos(M*phi)) + sum_{l=1}^{M-1} (a_l*cos(l*phi) + b_l*sin(l*phi))
// where a_l = (2/N)*sum_k f_k*cos(l*phi_k), b_l = (2/N)*sum_k f_k*sin(l*phi_k)
//
// f'(phi) = -0.5*a_M*M*sin(M*phi) + sum_{l=1}^{M-1} l*(-a_l*sin(l*phi) + b_l*cos(l*phi))
//
// Substituting a_l, b_l and evaluating at phi_j:
// f'(phi_j) = sum_k f_k * K(j,k)
// where K(j,k) = (2/N) * sum_{l=1}^{M-1} l * (-cos(l*phi_k)*sin(l*phi_j) + sin(l*phi_k)*cos(l*phi_j))
// + (2/N) * (-M/2) * sin(M*phi_j) * cos(M*phi_k) [a_M term, note a_M has no factor 2]
// = (2/N) * sum_{l=1}^{M-1} l * sin(l*(phi_k - phi_j))
// - (1/N) * M * sin(M*phi_j) * cos(M*phi_k)
//
// But the a_M coefficient in fourft has factor 1/M (not 2/M), so:
// Actually re-examining fourft: a[l] = fac * sum_k u[k]*cos(x), fac=1/M
// and a_M is stored as a[M] with same fac. The inverse uses:
// u[k] = 0.5*(a[0] + a[M]*iy) + sum_{l=1}^{M-1}(a[l]*cos + b[l]*sin)
// So the full expression needs care. Let me just compute it numerically.
// Numerical approach: for each k, set f = delta_k, compute derivative at all j
double *p = new double[N];
double *dp = new double[N];
for (int k = 0; k < N; k++) {
// Set delta function at k
for (int i = 0; i < N; i++)
p[i] = (i == k) ? 1.0 : 0.0;
// Forward Fourier transform (using existing fourft)
fourft(p, N, 0);
// Derivative in spectral space
fourder(p, dp, N);
// Inverse Fourier transform
fourft(dp, N, 1);
// dp[j] = derivative of delta_k interpolant at phi_j
// So DF1[j][k] = dp[j]
for (int j = 0; j < N; j++)
DF1[j * N + k] = dp[j];
}
// Second derivative
for (int k = 0; k < N; k++) {
for (int i = 0; i < N; i++)
p[i] = (i == k) ? 1.0 : 0.0;
fourft(p, N, 0);
fourder2(p, dp, N);
fourft(dp, N, 1);
for (int j = 0; j < N; j++)
DF2[j * N + k] = dp[j];
}
delete[] p;
delete[] dp;
}
void TwoPunctures::precompute_derivative_matrices()
{
int n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi;
// Allocate matrices
D1_A = new double[n1 * n1];
D2_A = new double[n1 * n1];
D1_B = new double[n2 * n2];
D2_B = new double[n2 * n2];
DF1_phi = new double[n3 * n3];
DF2_phi = new double[n3 * n3];
// Build Chebyshev derivative matrices
build_cheb_deriv_matrices(n1, D1_A, D2_A);
build_cheb_deriv_matrices(n2, D1_B, D2_B);
// Build Fourier derivative matrices
build_fourier_deriv_matrices(n3, DF1_phi, DF2_phi);
printf("Precomputed derivative matrices: A(%d), B(%d), phi(%d)\n", n1, n2, n3);
}
/* --------------------------------------------------------------------------
* Derivatives_AB3_MatMul: Drop-in replacement for Derivatives_AB3
*
* Uses precomputed derivative matrices and DGEMM to compute all spectral
* derivatives in batch. Mathematically equivalent to the original
* Derivatives_AB3.
*
* Memory layout of v.d0[Index(ivar,i,j,k)] = v.d0[ivar + nvar*(i + n1*(j + n2*k))]
*
* For A-direction derivatives (fixed j,k, varying i):
* We need to apply D1_A and D2_A to "pencils" along the i-direction.
* Collect all pencils into a matrix and use DGEMM.
*
* For B-direction derivatives (fixed i,k, varying j):
* Similarly with D1_B, D2_B.
*
* For phi-direction (fixed i,j, varying k):
* Similarly with DF1_phi, DF2_phi.
* --------------------------------------------------------------------------*/
void TwoPunctures::Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v)
{
int total_pencils;
double *data_in, *data_out;
/*=====================================================
* STEP 1: A-direction derivatives (Chebyshev, D1_A, D2_A)
*
* For each (ivar, j, k), we have a pencil of length n1:
* f[i] = v.d0[Index(ivar, i, j, k, nvar, n1, n2, n3)]
*
* We want: v.d1[...] = D1_A * f, v.d11[...] = D2_A * f
*
* Collect all n2*n3*nvar pencils as columns of a matrix:
* data_in[i, col] where col = ivar + nvar*(j + n2*k)
* Then: data_out = D1_A * data_in (DGEMM: n1 x n1 times n1 x total_pencils)
*=====================================================*/
total_pencils = nvar * n2 * n3;
data_in = new double[n1 * total_pencils];
data_out = new double[n1 * total_pencils];
// Gather: data_in[i * total_pencils + col] = v.d0[Index(ivar,i,j,k,...)]
// where col = ivar + nvar * (j + n2 * k)
for (int ivar = 0; ivar < nvar; ivar++) {
for (int k = 0; k < n3; k++) {
for (int j = 0; j < n2; j++) {
int col = ivar + nvar * (j + n2 * k);
for (int i = 0; i < n1; i++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
data_in[i * total_pencils + col] = v.d0[indx];
}
}
}
}
// First derivative: data_out = D1_A * data_in
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n1, total_pencils, n1,
1.0, D1_A, n1, data_in, total_pencils,
0.0, data_out, total_pencils);
// Scatter to v.d1
for (int ivar = 0; ivar < nvar; ivar++) {
for (int k = 0; k < n3; k++) {
for (int j = 0; j < n2; j++) {
int col = ivar + nvar * (j + n2 * k);
for (int i = 0; i < n1; i++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
v.d1[indx] = data_out[i * total_pencils + col];
}
}
}
}
// Second derivative: data_out = D2_A * data_in
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n1, total_pencils, n1,
1.0, D2_A, n1, data_in, total_pencils,
0.0, data_out, total_pencils);
// Scatter to v.d11
for (int ivar = 0; ivar < nvar; ivar++) {
for (int k = 0; k < n3; k++) {
for (int j = 0; j < n2; j++) {
int col = ivar + nvar * (j + n2 * k);
for (int i = 0; i < n1; i++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
v.d11[indx] = data_out[i * total_pencils + col];
}
}
}
}
delete[] data_in;
delete[] data_out;
/*=====================================================
* STEP 2: B-direction derivatives (Chebyshev, D1_B, D2_B)
*
* Pencils along j for each (ivar, i, k).
* Also compute mixed derivative v.d12 = D1_B applied to v.d1
*=====================================================*/
total_pencils = nvar * n1 * n3;
data_in = new double[n2 * total_pencils];
data_out = new double[n2 * total_pencils];
double *data_in2 = new double[n2 * total_pencils];
double *data_out2 = new double[n2 * total_pencils];
// Gather v.d0 along B-direction AND v.d1 for mixed derivative
for (int ivar = 0; ivar < nvar; ivar++) {
for (int k = 0; k < n3; k++) {
for (int i = 0; i < n1; i++) {
int col = ivar + nvar * (i + n1 * k);
for (int j = 0; j < n2; j++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
data_in[j * total_pencils + col] = v.d0[indx];
data_in2[j * total_pencils + col] = v.d1[indx]; // for d/dB of (dv/dA)
}
}
}
}
// v.d2 = D1_B * v.d0 (along B)
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n2, total_pencils, n2,
1.0, D1_B, n2, data_in, total_pencils,
0.0, data_out, total_pencils);
for (int ivar = 0; ivar < nvar; ivar++) {
for (int k = 0; k < n3; k++) {
for (int i = 0; i < n1; i++) {
int col = ivar + nvar * (i + n1 * k);
for (int j = 0; j < n2; j++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
v.d2[indx] = data_out[j * total_pencils + col];
}
}
}
}
// v.d22 = D2_B * v.d0
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n2, total_pencils, n2,
1.0, D2_B, n2, data_in, total_pencils,
0.0, data_out, total_pencils);
for (int ivar = 0; ivar < nvar; ivar++) {
for (int k = 0; k < n3; k++) {
for (int i = 0; i < n1; i++) {
int col = ivar + nvar * (i + n1 * k);
for (int j = 0; j < n2; j++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
v.d22[indx] = data_out[j * total_pencils + col];
}
}
}
}
// v.d12 = D1_B * v.d1 (mixed: d/dB of dv/dA)
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n2, total_pencils, n2,
1.0, D1_B, n2, data_in2, total_pencils,
0.0, data_out2, total_pencils);
for (int ivar = 0; ivar < nvar; ivar++) {
for (int k = 0; k < n3; k++) {
for (int i = 0; i < n1; i++) {
int col = ivar + nvar * (i + n1 * k);
for (int j = 0; j < n2; j++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
v.d12[indx] = data_out2[j * total_pencils + col];
}
}
}
}
delete[] data_in;
delete[] data_out;
delete[] data_in2;
delete[] data_out2;
/*=====================================================
* STEP 3: phi-direction derivatives (Fourier, DF1_phi, DF2_phi)
*
* Pencils along k for each (ivar, i, j).
* Also compute mixed derivatives v.d13, v.d23
*=====================================================*/
total_pencils = nvar * n1 * n2;
data_in = new double[n3 * total_pencils];
data_out = new double[n3 * total_pencils];
data_in2 = new double[n3 * total_pencils]; // for v.d1 → v.d13
data_out2 = new double[n3 * total_pencils];
double *data_in3 = new double[n3 * total_pencils]; // for v.d2 → v.d23
double *data_out3 = new double[n3 * total_pencils];
// Gather v.d0, v.d1, v.d2 along phi-direction
for (int ivar = 0; ivar < nvar; ivar++) {
for (int i = 0; i < n1; i++) {
for (int j = 0; j < n2; j++) {
int col = ivar + nvar * (i + n1 * j);
for (int k = 0; k < n3; k++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
data_in[k * total_pencils + col] = v.d0[indx];
data_in2[k * total_pencils + col] = v.d1[indx];
data_in3[k * total_pencils + col] = v.d2[indx];
}
}
}
}
// v.d3 = DF1_phi * v.d0
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n3, total_pencils, n3,
1.0, DF1_phi, n3, data_in, total_pencils,
0.0, data_out, total_pencils);
for (int ivar = 0; ivar < nvar; ivar++) {
for (int i = 0; i < n1; i++) {
for (int j = 0; j < n2; j++) {
int col = ivar + nvar * (i + n1 * j);
for (int k = 0; k < n3; k++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
v.d3[indx] = data_out[k * total_pencils + col];
}
}
}
}
// v.d33 = DF2_phi * v.d0
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n3, total_pencils, n3,
1.0, DF2_phi, n3, data_in, total_pencils,
0.0, data_out, total_pencils);
for (int ivar = 0; ivar < nvar; ivar++) {
for (int i = 0; i < n1; i++) {
for (int j = 0; j < n2; j++) {
int col = ivar + nvar * (i + n1 * j);
for (int k = 0; k < n3; k++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
v.d33[indx] = data_out[k * total_pencils + col];
}
}
}
}
// v.d13 = DF1_phi * v.d1 (mixed: d/dphi of dv/dA)
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n3, total_pencils, n3,
1.0, DF1_phi, n3, data_in2, total_pencils,
0.0, data_out2, total_pencils);
for (int ivar = 0; ivar < nvar; ivar++) {
for (int i = 0; i < n1; i++) {
for (int j = 0; j < n2; j++) {
int col = ivar + nvar * (i + n1 * j);
for (int k = 0; k < n3; k++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
v.d13[indx] = data_out2[k * total_pencils + col];
}
}
}
}
// v.d23 = DF1_phi * v.d2 (mixed: d/dphi of dv/dB)
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n3, total_pencils, n3,
1.0, DF1_phi, n3, data_in3, total_pencils,
0.0, data_out3, total_pencils);
for (int ivar = 0; ivar < nvar; ivar++) {
for (int i = 0; i < n1; i++) {
for (int j = 0; j < n2; j++) {
int col = ivar + nvar * (i + n1 * j);
for (int k = 0; k < n3; k++) {
int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
v.d23[indx] = data_out3[k * total_pencils + col];
}
}
}
}
delete[] data_in;
delete[] data_out;
delete[] data_in2;
delete[] data_out2;
delete[] data_in3;
delete[] data_out3;
}