* /Erik 001109
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
+
+#include "gromacs/legacyheaders/pme.h"
+#include "config.h"
+
+#include <assert.h>
+#include <math.h>
#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
-#include <math.h>
-#include <assert.h>
-#include "typedefs.h"
-#include "txtdump.h"
-#include "vec.h"
-#include "smalloc.h"
-#include "coulomb.h"
-#include "gmx_fatal.h"
-#include "pme.h"
-#include "network.h"
-#include "physics.h"
-#include "nrnb.h"
-#include "macros.h"
#include "gromacs/fft/parallel_3dfft.h"
-#include "gromacs/fileio/futil.h"
#include "gromacs/fileio/pdbio.h"
+#include "gromacs/legacyheaders/coulomb.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec.h"
#include "gromacs/math/gmxcomplex.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/futil.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/gmxomp.h"
+#include "gromacs/utility/smalloc.h"
/* Include the SIMD macro file and then check for support */
-#include "gromacs/simd/macros.h"
-#if defined GMX_HAVE_SIMD_MACROS
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#ifdef GMX_SIMD_HAVE_REAL
/* Turn on arbitrary width SIMD intrinsics for PME solve */
-#define PME_SIMD_SOLVE
+# define PME_SIMD_SOLVE
#endif
#define PME_GRID_QA 0 /* Gridindex for A-state for Q */
/* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
-/* Include the 4-wide SIMD macro file */
-#include "gromacs/simd/four_wide_macros.h"
/* Check if we have 4-wide SIMD macro support */
-#ifdef GMX_HAVE_SIMD4_MACROS
+#if (defined GMX_SIMD4_HAVE_REAL)
/* Do PME spread and gather with 4-wide SIMD.
* NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
*/
-#define PME_SIMD4_SPREAD_GATHER
+# define PME_SIMD4_SPREAD_GATHER
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
+# if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
/* With PME-order=4 on x86, unaligned load+store is slightly faster
* than doubling all SIMD operations when using aligned load+store.
*/
-#define PME_SIMD4_UNALIGNED
-#endif
+# define PME_SIMD4_UNALIGNED
+# endif
#endif
#define DFT_TOL 1e-7
#endif
#ifdef PME_SIMD4_SPREAD_GATHER
-#define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
+# define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
#else
/* We can use any alignment, apart from 0, so we use 4 reals */
-#define SIMD4_ALIGNMENT (4*sizeof(real))
+# define SIMD4_ALIGNMENT (4*sizeof(real))
#endif
/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
int n;
int nalloc;
rvec *x;
- real *q;
+ real *coefficient;
rvec *f;
gmx_bool bSpread; /* These coordinates are used for spreading */
int pme_order;
* the lower cell boundary
*/
int nthread;
- int *thread_idx; /* Which thread should spread which charge */
+ int *thread_idx; /* Which thread should spread which coefficient */
thread_plist_t *thread_plist;
splinedata_t *spline;
} pme_atomcomm_t;
typedef struct {
#ifdef PME_SIMD4_SPREAD_GATHER
/* Masks for 4-wide SIMD aligned spreading and gathering */
- gmx_simd4_pb mask_S0[6], mask_S1[6];
+ gmx_simd4_bool_t mask_S0[6], mask_S1[6];
#else
- int dummy; /* C89 requires that struct has at least one member */
+ int dummy; /* C89 requires that struct has at least one member */
#endif
} pme_spline_work_t;
int pme_order;
real epsilon_r;
+ int ljpme_combination_rule; /* Type of combination rule in LJ-PME */
+
int ngrids; /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/
pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
* This can probably be done in a better way
* but this simple hack works for now
*/
- /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
+ /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
int pmegrid_nx, pmegrid_ny, pmegrid_nz;
/* pmegrid_nz might be larger than strictly necessary to ensure
* memory alignment, pmegrid_nz_base gives the real base size.
splinevec bsp_mod;
/* Buffers to store data for local atoms for L-B combination rule
* calculations in LJ-PME. lb_buf1 stores either the coefficients
- * for spreading/gathering (in serial), or the C6 parameters for
+ * for spreading/gathering (in serial), or the C6 coefficient for
* local atoms (in parallel). lb_buf2 is only used in parallel,
* and stores the sigma values for local atoms. */
real *lb_buf1, *lb_buf2;
/* thread local work data for solve_pme */
pme_work_t *work;
- /* Work data for PME_redist */
- gmx_bool redist_init;
- int * scounts;
- int * rcounts;
- int * sdispls;
- int * rdispls;
- int * sidx;
- int * idxa;
- real * redist_buf;
- int redist_buf_nalloc;
-
/* Work data for sum_qgrid */
real * sum_qgrid_tmp;
real * sum_qgrid_dd_tmp;
} t_gmx_pme;
static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
- int start, int end, int thread)
+ int start, int grid_index, int end, int thread)
{
int i;
int *idxptr, tix, tiy, tiz;
rzy = pme->recipbox[ZZ][YY];
rzz = pme->recipbox[ZZ][ZZ];
- g2tx = pme->pmegrid[PME_GRID_QA].g2t[XX];
- g2ty = pme->pmegrid[PME_GRID_QA].g2t[YY];
- g2tz = pme->pmegrid[PME_GRID_QA].g2t[ZZ];
+ g2tx = pme->pmegrid[grid_index].g2t[XX];
+ g2ty = pme->pmegrid[grid_index].g2t[YY];
+ g2tz = pme->pmegrid[grid_index].g2t[ZZ];
bThreads = (atc->nthread > 1);
if (bThreads)
if (atc->nslab > 1)
{
srenew(atc->x, atc->nalloc);
- srenew(atc->q, atc->nalloc);
+ srenew(atc->coefficient, atc->nalloc);
srenew(atc->f, atc->nalloc);
for (i = nalloc_old; i < atc->nalloc; i++)
{
}
}
-static void pmeredist_pd(gmx_pme_t pme, gmx_bool gmx_unused forw,
- int n, gmx_bool gmx_unused bXF, rvec gmx_unused *x_f,
- real gmx_unused *charge, pme_atomcomm_t *atc)
-/* Redistribute particle data for PME calculation */
-/* domain decomposition by x coordinate */
-{
- int *idxa;
- int i, ii;
-
- if (FALSE == pme->redist_init)
- {
- snew(pme->scounts, atc->nslab);
- snew(pme->rcounts, atc->nslab);
- snew(pme->sdispls, atc->nslab);
- snew(pme->rdispls, atc->nslab);
- snew(pme->sidx, atc->nslab);
- pme->redist_init = TRUE;
- }
- if (n > pme->redist_buf_nalloc)
- {
- pme->redist_buf_nalloc = over_alloc_dd(n);
- srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
- }
-
- pme->idxa = atc->pd;
-
-#ifdef GMX_MPI
- if (forw && bXF)
- {
- /* forward, redistribution from pp to pme */
-
- /* Calculate send counts and exchange them with other nodes */
- for (i = 0; (i < atc->nslab); i++)
- {
- pme->scounts[i] = 0;
- }
- for (i = 0; (i < n); i++)
- {
- pme->scounts[pme->idxa[i]]++;
- }
- MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
-
- /* Calculate send and receive displacements and index into send
- buffer */
- pme->sdispls[0] = 0;
- pme->rdispls[0] = 0;
- pme->sidx[0] = 0;
- for (i = 1; i < atc->nslab; i++)
- {
- pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
- pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
- pme->sidx[i] = pme->sdispls[i];
- }
- /* Total # of particles to be received */
- atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
-
- pme_realloc_atomcomm_things(atc);
-
- /* Copy particle coordinates into send buffer and exchange*/
- for (i = 0; (i < n); i++)
- {
- ii = DIM*pme->sidx[pme->idxa[i]];
- pme->sidx[pme->idxa[i]]++;
- pme->redist_buf[ii+XX] = x_f[i][XX];
- pme->redist_buf[ii+YY] = x_f[i][YY];
- pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
- }
- MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
- pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
- pme->rvec_mpi, atc->mpi_comm);
- }
- if (forw)
- {
- /* Copy charge into send buffer and exchange*/
- for (i = 0; i < atc->nslab; i++)
- {
- pme->sidx[i] = pme->sdispls[i];
- }
- for (i = 0; (i < n); i++)
- {
- ii = pme->sidx[pme->idxa[i]];
- pme->sidx[pme->idxa[i]]++;
- pme->redist_buf[ii] = charge[i];
- }
- MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
- atc->q, pme->rcounts, pme->rdispls, mpi_type,
- atc->mpi_comm);
- }
- else /* backward, redistribution from pme to pp */
- {
- MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
- pme->redist_buf, pme->scounts, pme->sdispls,
- pme->rvec_mpi, atc->mpi_comm);
-
- /* Copy data from receive buffer */
- for (i = 0; i < atc->nslab; i++)
- {
- pme->sidx[i] = pme->sdispls[i];
- }
- for (i = 0; (i < n); i++)
- {
- ii = DIM*pme->sidx[pme->idxa[i]];
- x_f[i][XX] += pme->redist_buf[ii+XX];
- x_f[i][YY] += pme->redist_buf[ii+YY];
- x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
- pme->sidx[pme->idxa[i]]++;
- }
- }
-#endif
-}
-
static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
gmx_bool gmx_unused bBackward, int gmx_unused shift,
void gmx_unused *buf_s, int gmx_unused nbyte_s,
#endif
}
-static void dd_pmeredist_x_q(gmx_pme_t pme,
- int n, gmx_bool bX, rvec *x, real *charge,
- pme_atomcomm_t *atc)
+static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
+ int n, gmx_bool bX, rvec *x, real *data,
+ pme_atomcomm_t *atc)
{
int *commnode, *buf_index;
int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
{
if (atc->count[atc->nodeid] + nsend != n)
{
- gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
+ gmx_fatal(FARGS, "%d particles communicated to PME rank %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
"This usually means that your system is not well equilibrated.",
n - (atc->count[atc->nodeid] + nsend),
pme->nodeid, 'x'+atc->dimind);
/* Communicate the count */
if (debug)
{
- fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
+ fprintf(debug, "dimind %d PME rank %d send to rank %d: %d\n",
atc->dimind, atc->nodeid, commnode[i], scount);
}
pme_dd_sendrecv(atc, FALSE, i,
{
copy_rvec(x[i], atc->x[local_pos]);
}
- atc->q[local_pos] = charge[i];
+ atc->coefficient[local_pos] = data[i];
local_pos++;
}
else
{
copy_rvec(x[i], pme->bufv[buf_index[node]]);
}
- pme->bufr[buf_index[node]] = charge[i];
+ pme->bufr[buf_index[node]] = data[i];
buf_index[node]++;
}
}
pme->bufv[buf_pos], scount*sizeof(rvec),
atc->x[local_pos], rcount*sizeof(rvec));
}
- /* Communicate the charges */
+ /* Communicate the coefficients */
pme_dd_sendrecv(atc, FALSE, i,
pme->bufr+buf_pos, scount*sizeof(real),
- atc->q+local_pos, rcount*sizeof(real));
+ atc->coefficient+local_pos, rcount*sizeof(real));
buf_pos += scount;
local_pos += atc->rcount[i];
}
/* Since we have already (un)wrapped the overlap in the z-dimension,
* we only have to communicate 0 to nkz (not pmegrid_nz).
*/
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
send_id = overlap->send_id[ipulse];
recv_id = overlap->recv_id[ipulse];
/* Copy data to contiguous send buffer */
if (debug)
{
- fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, send_id,
pme->pmegrid_start_iy,
send_index0-pme->pmegrid_start_iy,
/* Get data from contiguous recv buffer */
if (debug)
{
- fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, recv_id,
pme->pmegrid_start_iy,
recv_index0-pme->pmegrid_start_iy,
for (k = 0; k < pme->nkz; k++)
{
iz = k;
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
}
for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
{
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
send_id = overlap->send_id[ipulse];
recv_id = overlap->recv_id[ipulse];
if (debug)
{
- fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, send_id,
pme->pmegrid_start_ix,
send_index0-pme->pmegrid_start_ix,
send_index0-pme->pmegrid_start_ix+send_nindex);
- fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, recv_id,
pme->pmegrid_start_ix,
recv_index0-pme->pmegrid_start_ix,
overlap->mpi_comm, &stat);
/* ADD data from contiguous recv buffer */
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
for (i = 0; i < recv_nindex*datasize; i++)
{
#ifdef DEBUG_PME
FILE *fp, *fp2;
- char fn[STRLEN], format[STRLEN];
+ char fn[STRLEN];
real val;
sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
- fp = ffopen(fn, "w");
+ fp = gmx_ffopen(fn, "w");
sprintf(fn, "pmegrid%d.txt", pme->nodeid);
- fp2 = ffopen(fn, "w");
- sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
+ fp2 = gmx_ffopen(fn, "w");
#endif
for (ix = 0; ix < local_fft_ndata[XX]; ix++)
val = 100*pmegrid[pmeidx];
if (pmegrid[pmeidx] != 0)
{
- fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
- 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
+ gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
+ 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
}
if (pmegrid[pmeidx] != 0)
{
}
}
#ifdef DEBUG_PME
- ffclose(fp);
- ffclose(fp2);
+ gmx_ffclose(fp);
+ gmx_ffclose(fp2);
#endif
}
return 0;
for (ithx = 0; (ithx < order); ithx++) \
{ \
index_x = (i0+ithx)*pny*pnz; \
- valx = qn*thx[ithx]; \
+ valx = coefficient*thx[ithx]; \
\
for (ithy = 0; (ithy < order); ithy++) \
{ \
}
-static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
- pme_atomcomm_t *atc,
- splinedata_t *spline,
- pme_spline_work_t gmx_unused *work)
+static void spread_coefficients_bsplines_thread(pmegrid_t *pmegrid,
+ pme_atomcomm_t *atc,
+ splinedata_t *spline,
+ pme_spline_work_t gmx_unused *work)
{
- /* spread charges from home atoms to local grid */
+ /* spread coefficients from home atoms to local grid */
real *grid;
pme_overlap_t *ol;
int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
int * idxptr;
int order, norder, index_x, index_xy, index_xyz;
- real valx, valxy, qn;
+ real valx, valxy, coefficient;
real *thx, *thy, *thz;
int localsize, bndsize;
int pnx, pny, pnz, ndatatot;
int offx, offy, offz;
#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
- real thz_buffer[12], *thz_aligned;
+ real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
- thz_aligned = gmx_simd4_align_real(thz_buffer);
+ thz_aligned = gmx_simd4_align_r(thz_buffer);
#endif
pnx = pmegrid->s[XX];
for (nn = 0; nn < spline->n; nn++)
{
- n = spline->ind[nn];
- qn = atc->q[n];
+ n = spline->ind[nn];
+ coefficient = atc->coefficient[n];
- if (qn != 0)
+ if (coefficient != 0)
{
idxptr = atc->idx[n];
norder = nn*order;
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_BSPLINE(4);
#endif
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_BSPLINE(5);
#endif
* elements at the end for padding.
*/
#ifdef PME_SIMD_SOLVE
- simd_width = GMX_SIMD_WIDTH_HERE;
+ simd_width = GMX_SIMD_REAL_WIDTH;
#else
/* We can use any alignment, apart from 0, so we use 4 */
simd_width = 4;
}
-#if defined PME_SIMD_SOLVE && defined GMX_SIMD_HAVE_EXP
+#if defined PME_SIMD_SOLVE
/* Calculate exponentials through SIMD */
-inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
+gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
{
{
- const gmx_mm_pr two = gmx_set1_pr(2.0);
- gmx_mm_pr f_simd;
- gmx_mm_pr lu;
- gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
+ const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
+ gmx_simd_real_t f_simd;
+ gmx_simd_real_t lu;
+ gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
int kx;
- f_simd = gmx_set1_pr(f);
+ f_simd = gmx_simd_set1_r(f);
/* We only need to calculate from start. But since start is 0 or 1
* and we want to use aligned loads/stores, we always start from 0.
*/
- for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
+ for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
{
- tmp_d1 = gmx_load_pr(d_aligned+kx);
- d_inv = gmx_inv_pr(tmp_d1);
- tmp_r = gmx_load_pr(r_aligned+kx);
- tmp_r = gmx_exp_pr(tmp_r);
- tmp_e = gmx_mul_pr(f_simd, d_inv);
- tmp_e = gmx_mul_pr(tmp_e, tmp_r);
- gmx_store_pr(e_aligned+kx, tmp_e);
+ tmp_d1 = gmx_simd_load_r(d_aligned+kx);
+ d_inv = gmx_simd_inv_r(tmp_d1);
+ tmp_r = gmx_simd_load_r(r_aligned+kx);
+ tmp_r = gmx_simd_exp_r(tmp_r);
+ tmp_e = gmx_simd_mul_r(f_simd, d_inv);
+ tmp_e = gmx_simd_mul_r(tmp_e, tmp_r);
+ gmx_simd_store_r(e_aligned+kx, tmp_e);
}
}
}
#else
-inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
+gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
{
int kx;
for (kx = start; kx < end; kx++)
}
#endif
-#if defined PME_SIMD_SOLVE && defined GMX_SIMD_HAVE_ERFC
+#if defined PME_SIMD_SOLVE
/* Calculate exponentials through SIMD */
-inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
+gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
{
- gmx_mm_pr tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
- const gmx_mm_pr sqr_PI = gmx_sqrt_pr(gmx_set1_pr(M_PI));
+ gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
+ const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
int kx;
- for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
+ for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
{
/* We only need to calculate from start. But since start is 0 or 1
* and we want to use aligned loads/stores, we always start from 0.
*/
- tmp_d = gmx_load_pr(d_aligned+kx);
- d_inv = gmx_inv_pr(tmp_d);
- gmx_store_pr(d_aligned+kx, d_inv);
- tmp_r = gmx_load_pr(r_aligned+kx);
- tmp_r = gmx_exp_pr(tmp_r);
- gmx_store_pr(r_aligned+kx, tmp_r);
- tmp_mk = gmx_load_pr(factor_aligned+kx);
- tmp_fac = gmx_mul_pr(sqr_PI, gmx_mul_pr(tmp_mk, gmx_erfc_pr(tmp_mk)));
- gmx_store_pr(factor_aligned+kx, tmp_fac);
+ tmp_d = gmx_simd_load_r(d_aligned+kx);
+ d_inv = gmx_simd_inv_r(tmp_d);
+ gmx_simd_store_r(d_aligned+kx, d_inv);
+ tmp_r = gmx_simd_load_r(r_aligned+kx);
+ tmp_r = gmx_simd_exp_r(tmp_r);
+ gmx_simd_store_r(r_aligned+kx, tmp_r);
+ tmp_mk = gmx_simd_load_r(factor_aligned+kx);
+ tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
+ gmx_simd_store_r(factor_aligned+kx, tmp_fac);
}
}
#else
-inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
+gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
{
int kx;
real mk;
work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;
+
/* This energy should be corrected for a charged system */
work->energy_lj = 0.5*energy;
}
int index_x, index_xy;
int nx, ny, nz, pnx, pny, pnz;
int * idxptr;
- real tx, ty, dx, dy, qn;
+ real tx, ty, dx, dy, coefficient;
real fx, fy, fz, gval;
real fxy1, fz1;
real *thx, *thy, *thz, *dthx, *dthy, *dthz;
pme_spline_work_t *work;
#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
- real thz_buffer[12], *thz_aligned;
- real dthz_buffer[12], *dthz_aligned;
+ real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
+ real dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
- thz_aligned = gmx_simd4_align_real(thz_buffer);
- dthz_aligned = gmx_simd4_align_real(dthz_buffer);
+ thz_aligned = gmx_simd4_align_r(thz_buffer);
+ dthz_aligned = gmx_simd4_align_r(dthz_buffer);
#endif
work = pme->spline_work;
for (nn = 0; nn < spline->n; nn++)
{
- n = spline->ind[nn];
- qn = scale*atc->q[n];
+ n = spline->ind[nn];
+ coefficient = scale*atc->coefficient[n];
if (bClearF)
{
atc->f[n][YY] = 0;
atc->f[n][ZZ] = 0;
}
- if (qn != 0)
+ if (coefficient != 0)
{
fx = 0;
fy = 0;
#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_FSPLINE(4);
#endif
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_FSPLINE(5);
#endif
break;
}
- atc->f[n][XX] += -qn*( fx*nx*rxx );
- atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
- atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
+ atc->f[n][XX] += -coefficient*( fx*nx*rxx );
+ atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
+ atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
}
}
/* Since the energy and not forces are interpolated
int n, ithx, ithy, ithz, i0, j0, k0;
int index_x, index_xy;
int * idxptr;
- real energy, pot, tx, ty, qn, gval;
+ real energy, pot, tx, ty, coefficient, gval;
real *thx, *thy, *thz;
int norder;
int order;
energy = 0;
for (n = 0; (n < atc->n); n++)
{
- qn = atc->q[n];
+ coefficient = atc->coefficient[n];
- if (qn != 0)
+ if (coefficient != 0)
{
idxptr = atc->idx[n];
norder = n*order;
}
}
- energy += pot*qn;
+ energy += pot*coefficient;
}
}
}
void make_bsplines(splinevec theta, splinevec dtheta, int order,
- rvec fractx[], int nr, int ind[], real charge[],
+ rvec fractx[], int nr, int ind[], real coefficient[],
gmx_bool bDoSplines)
{
/* construct splines for local atoms */
for (i = 0; i < nr; i++)
{
- /* With free energy we do not use the charge check.
+ /* With free energy we do not use the coefficient check.
* In most cases this will be more efficient than calling make_bsplines
- * twice, since usually more than half the particles have charges.
+ * twice, since usually more than half the particles have non-zero coefficients.
*/
ii = ind[i];
- if (bDoSplines || charge[ii] != 0.0)
+ if (bDoSplines || coefficient[ii] != 0.0)
{
xptr = fractx[ii];
switch (order)
if (atc->nslab > 1)
{
- /* These three allocations are not required for particle decomp. */
snew(atc->node_dest, atc->nslab);
snew(atc->node_src, atc->nslab);
setup_coordinate_communication(atc);
{
/* s2g0 the local interpolation grid start.
* s2g1 the local interpolation grid end.
- * Because grid overlap communication only goes forward,
- * the grid the slabs for fft's should be rounded down.
+ * Since in calc_pidx we divide particles, and not grid lines,
+ * spatially uniform along dimension x or y, we need to round
+ * s2g0 down and s2g1 up.
*/
ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
pme_spline_work_t *work;
#ifdef PME_SIMD4_SPREAD_GATHER
- real tmp[12], *tmp_aligned;
- gmx_simd4_pr zero_S;
- gmx_simd4_pr real_mask_S0, real_mask_S1;
- int of, i;
+ real tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
+ gmx_simd4_real_t zero_S;
+ gmx_simd4_real_t real_mask_S0, real_mask_S1;
+ int of, i;
snew_aligned(work, 1, SIMD4_ALIGNMENT);
- tmp_aligned = gmx_simd4_align_real(tmp);
+ tmp_aligned = gmx_simd4_align_r(tmp);
- zero_S = gmx_simd4_setzero_pr();
+ zero_S = gmx_simd4_setzero_r();
/* Generate bit masks to mask out the unused grid entries,
* as we only operate on order of the 8 grid entries that are
* load into 2 SIMD registers.
*/
- for (of = 0; of < 8-(order-1); of++)
+ for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
{
- for (i = 0; i < 8; i++)
+ for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
{
tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
}
- real_mask_S0 = gmx_simd4_load_pr(tmp_aligned);
- real_mask_S1 = gmx_simd4_load_pr(tmp_aligned+4);
- work->mask_S0[of] = gmx_simd4_cmplt_pr(real_mask_S0, zero_S);
- work->mask_S1[of] = gmx_simd4_cmplt_pr(real_mask_S1, zero_S);
+ real_mask_S0 = gmx_simd4_load_r(tmp_aligned);
+ real_mask_S1 = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
+ work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
+ work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
}
#else
work = NULL;
*bValidSettings = FALSE;
return;
}
- gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
+ gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
nkx/(double)nnodes_major, pme_order);
}
}
snew(pme, 1);
- pme->redist_init = FALSE;
pme->sum_qgrid_tmp = NULL;
pme->sum_qgrid_dd_tmp = NULL;
pme->buf_nalloc = 0;
- pme->redist_buf_nalloc = 0;
pme->nnodes = 1;
pme->bPPnode = TRUE;
MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
if (pme->nnodes != nnodes_major*nnodes_minor)
{
- gmx_incons("PME node count mismatch");
+ gmx_incons("PME rank count mismatch");
}
}
else
{
if (pme->nnodes % nnodes_major != 0)
{
- gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
+ gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
}
pme->ndecompdim = 2;
pme->nkz = ir->nkz;
pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
pme->pme_order = ir->pme_order;
+
+ /* Always constant electrostatics coefficients */
pme->epsilon_r = ir->epsilon_r;
+ /* Always constant LJ coefficients */
+ pme->ljpme_combination_rule = ir->ljpme_combination_rule;
+
/* If we violate restrictions, generate a fatal error here */
gmx_pme_check_restrictions(pme->pme_order,
pme->nkx, pme->nky, pme->nkz,
MPI_Type_commit(&(pme->rvec_mpi));
#endif
- /* Note that the charge spreading and force gathering, which usually
+ /* Note that the coefficient spreading and force gathering, which usually
* takes about the same amount of time as FFT+solve_pme,
* is always fully load balanced
- * (unless the charge distribution is inhomogeneous).
+ * (unless the coefficient distribution is inhomogeneous).
*/
imbal = pme_load_imbalance(pme);
"\n"
"NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
" For optimal PME load balancing\n"
- " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
- " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
+ " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
+ " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
"\n",
(int)((imbal-1)*100 + 0.5),
pme->nkx, pme->nky, pme->nnodes_major,
ndata[0] = pme->nkx;
ndata[1] = pme->nky;
ndata[2] = pme->nkz;
- pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
+ /* It doesn't matter if we allocate too many grids here,
+ * we only allocate and use the ones we need.
+ */
+ if (EVDW_PME(ir->vdwtype))
+ {
+ pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
+ }
+ else
+ {
+ pme->ngrids = DO_Q;
+ }
snew(pme->fftgrid, pme->ngrids);
snew(pme->cfftgrid, pme->ngrids);
snew(pme->pfft_setup, pme->ngrids);
for (i = 0; i < pme->ngrids; ++i)
{
- if (((ir->ljpme_combination_rule == eljpmeLB) && i >= 2) || i % 2 == 0 || bFreeEnergy_q || bFreeEnergy_lj)
+ if ((i < DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
+ bFreeEnergy_q)) ||
+ (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
+ bFreeEnergy_lj ||
+ ir->ljpme_combination_rule == eljpmeLB)))
{
pmegrids_init(&pme->pmegrid[i],
pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
}
-static void copy_local_grid(gmx_pme_t pme,
- pmegrids_t *pmegrids, int thread, real *fftgrid)
+static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
+ int grid_index, int thread, real *fftgrid)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
int fft_my, fft_mz;
pmegrid_t *pmegrid;
real *grid_th;
- gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
static void
reduce_threadgrid_overlap(gmx_pme_t pme,
const pmegrids_t *pmegrids, int thread,
- real *fftgrid, real *commbuf_x, real *commbuf_y)
+ real *fftgrid, real *commbuf_x, real *commbuf_y,
+ int grid_index)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
int fft_nx, fft_ny, fft_nz;
int fft_my, fft_mz;
int buf_my = -1;
int nsx, nsy, nsz;
- ivec ne;
+ ivec localcopy_end;
int offx, offy, offz, x, y, z, i0, i0t;
int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
const real *grid_th;
real *commbuf = NULL;
- gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
for (d = 0; d < DIM; d++)
{
- ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
- local_fft_ndata[d]);
+ /* Determine up to where our thread needs to copy from the
+ * thread-local charge spreading grid to the rank-local FFT grid.
+ * This is up to our spreading grid end minus order-1 and
+ * not beyond the local FFT grid.
+ */
+ localcopy_end[d] =
+ min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+ local_fft_ndata[d]);
}
offx = pmegrid->offset[XX];
/* Now loop over all the thread data blocks that contribute
* to the grid region we (our thread) are operating on.
*/
- /* Note that ffy_nx/y is equal to the number of grid points
+ /* Note that fft_nx/y is equal to the number of grid points
* between the first point of our node grid and the one of the next node.
*/
for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
}
pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
ox += pmegrid_g->offset[XX];
+ /* Determine the end of our part of the source grid */
if (!bCommX)
{
- tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
+ /* Use our thread local source grid and target grid part */
+ tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
}
else
{
+ /* Use our thread local source grid and the spreading range */
tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
}
}
pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
oy += pmegrid_g->offset[YY];
+ /* Determine the end of our part of the source grid */
if (!bCommY)
{
- ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
+ /* Use our thread local source grid and target grid part */
+ ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
}
else
{
+ /* Use our thread local source grid and the spreading range */
ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
}
}
pmegrid_g = &pmegrids->grid_th[fz];
oz += pmegrid_g->offset[ZZ];
- tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
+ tz1 = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
if (sx == 0 && sy == 0 && sz == 0)
{
if (bCommY)
{
commbuf = commbuf_y;
- buf_my = ty1 - offy;
+ /* The y-size of the communication buffer is set by
+ * the overlap of the grid part of our local slab
+ * with the part starting at the next slab.
+ */
+ buf_my =
+ pme->overlap[1].s2g1[pme->nodeid_minor] -
+ pme->overlap[1].s2g0[pme->nodeid_minor+1];
if (bCommX)
{
/* We index commbuf modulo the local grid size */
}
-static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
+static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
pme_overlap_t *overlap;
int x, y, z, indg, indb;
/* Note that this routine is only used for forward communication.
- * Since the force gathering, unlike the charge spreading,
+ * Since the force gathering, unlike the coefficient spreading,
* can be trivially parallelized over the particles,
* the backwards process is much simpler and can use the "old"
* communication setup.
*/
- gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
recvptr = overlap->recvbuf;
+ if (debug != NULL)
+ {
+ fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
+ local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
+ }
+
#ifdef GMX_MPI
MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
send_id, ipulse,
if (debug != NULL)
{
- fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+ fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
}
static void spread_on_grid(gmx_pme_t pme,
pme_atomcomm_t *atc, pmegrids_t *grids,
gmx_bool bCalcSplines, gmx_bool bSpread,
- real *fftgrid, gmx_bool bDoSplines )
+ real *fftgrid, gmx_bool bDoSplines, int grid_index)
{
int nthread, thread;
#ifdef PME_TIME_THREADS
/* Compute fftgrid index for all atoms,
* with help of some extra variables.
*/
- calc_interpolation_idx(pme, atc, start, end, thread);
+ calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
}
}
#ifdef PME_TIME_THREADS
if (grids->nthread == 1)
{
- /* One thread, we operate on all charges */
+ /* One thread, we operate on all coefficients */
spline->n = atc->n;
}
else
if (bCalcSplines)
{
make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
- atc->fractx, spline->n, spline->ind, atc->q, bDoSplines);
+ atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
}
if (bSpread)
#ifdef PME_TIME_SPREAD
ct1a = omp_cyc_start();
#endif
- spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
+ spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
if (pme->bUseThreads)
{
- copy_local_grid(pme, grids, thread, fftgrid);
+ copy_local_grid(pme, grids, grid_index, thread, fftgrid);
}
#ifdef PME_TIME_SPREAD
ct1a = omp_cyc_end(ct1a);
reduce_threadgrid_overlap(pme, grids, thread,
fftgrid,
pme->overlap[0].sendbuf,
- pme->overlap[1].sendbuf);
+ pme->overlap[1].sendbuf,
+ grid_index);
}
#ifdef PME_TIME_THREADS
c3 = omp_cyc_end(c3);
* For this communication call we need to check pme->bUseThreads
* to have all ranks communicate here, regardless of pme->nthread.
*/
- sum_fftgrid_dd(pme, fftgrid);
+ sum_fftgrid_dd(pme, fftgrid, grid_index);
}
}
{
gmx_incons("gmx_pme_calc_energy called in parallel");
}
- if (pme->bFEP > 1)
+ if (pme->bFEP_q > 1)
{
gmx_incons("gmx_pme_calc_energy with free energy");
}
atc->pme_order = pme->pme_order;
atc->n = n;
pme_realloc_atomcomm_things(atc);
- atc->x = x;
- atc->q = q;
+ atc->x = x;
+ atc->coefficient = q;
/* We only use the A-charges grid */
grid = &pme->pmegrid[PME_GRID_QA];
/* Only calculate the spline coefficients, don't actually spread */
- spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE);
+ spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
*V = gather_energy_bsplines(pme, grid->grid.grid, atc);
}
do
{
/* Domain decomposition */
- ret = gmx_pme_recv_params_coords(pme_pp,
+ ret = gmx_pme_recv_coeffs_coords(pme_pp,
&natoms,
&chargeA, &chargeB,
&c6A, &c6B,
{
real sigma4;
- sigma4 = local_sigma[i];
- sigma4 = sigma4*sigma4;
- sigma4 = sigma4*sigma4;
- pme->atc[0].q[i] = local_c6[i] / sigma4;
+ sigma4 = local_sigma[i];
+ sigma4 = sigma4*sigma4;
+ sigma4 = sigma4*sigma4;
+ pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
}
}
for (i = 0; i < pme->atc[0].n; ++i)
{
- pme->atc[0].q[i] *= local_sigma[i];
+ pme->atc[0].coefficient[i] *= local_sigma[i];
}
}
static void
-do_redist_x_q(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
- gmx_bool bFirst, rvec x[], real *data)
+do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
+ gmx_bool bFirst, rvec x[], real *data)
{
int d;
pme_atomcomm_t *atc;
{
int n_d;
rvec *x_d;
- real *q_d;
+ real *param_d;
if (d == pme->ndecompdim - 1)
{
- n_d = homenr;
- x_d = x + start;
- q_d = data;
+ n_d = homenr;
+ x_d = x + start;
+ param_d = data;
}
else
{
- n_d = pme->atc[d + 1].n;
- x_d = atc->x;
- q_d = atc->q;
+ n_d = pme->atc[d + 1].n;
+ x_d = atc->x;
+ param_d = atc->coefficient;
}
atc = &pme->atc[d];
atc->npd = n_d;
/* Redistribute x (only once) and qA/c6A or qB/c6B */
if (DOMAINDECOMP(cr))
{
- dd_pmeredist_x_q(pme, n_d, bFirst, x_d, q_d, atc);
- }
- else
- {
- pmeredist_pd(pme, TRUE, n_d, bFirst, x_d, q_d, atc);
+ dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
}
}
}
-/* TODO: when adding free-energy support, sigmaB will no longer be
- * unused */
int gmx_pme_do(gmx_pme_t pme,
int start, int homenr,
rvec x[], rvec f[],
real *chargeA, real *chargeB,
real *c6A, real *c6B,
- real *sigmaA, real gmx_unused *sigmaB,
+ real *sigmaA, real *sigmaB,
matrix box, t_commrec *cr,
int maxshift_x, int maxshift_y,
t_nrnb *nrnb, gmx_wallcycle_t wcycle,
real *dvdlambda_q, real *dvdlambda_lj,
int flags)
{
- int q, d, i, j, ntot, npme, qmax;
+ int d, i, j, k, ntot, npme, grid_index, max_grid_index;
int nx, ny, nz;
int n_d, local_ny;
pme_atomcomm_t *atc = NULL;
real *grid = NULL;
real *ptr;
rvec *x_d, *f_d;
- real *charge = NULL, *q_d;
+ real *coefficient = NULL;
real energy_AB[4];
matrix vir_AB[4];
real scale, lambda;
t_complex * cfftgrid;
int thread;
gmx_bool bFirst, bDoSplines;
+ int fep_state;
+ int fep_states_lj = pme->bFEP_lj ? 2 : 1;
const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
/* We need a maximum of four separate PME calculations:
- * q=0: Coulomb PME with charges from state A
- * q=1: Coulomb PME with charges from state B
- * q=2: LJ PME with C6 from state A
- * q=3: LJ PME with C6 from state B
+ * grid_index=0: Coulomb PME with charges from state A
+ * grid_index=1: Coulomb PME with charges from state B
+ * grid_index=2: LJ PME with C6 from state A
+ * grid_index=3: LJ PME with C6 from state B
* For Lorentz-Berthelot combination rules, a separate loop is used to
* calculate all the terms
*/
/* If we are doing LJ-PME with LB, we only do Q here */
- qmax = (flags & GMX_PME_LJ_LB) ? DO_Q : DO_Q_AND_LJ;
+ max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
- for (q = 0; q < qmax; ++q)
+ for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
{
- /* Check if we should do calculations at this q
- * If q is odd we should be doing FEP
- * If q < 2 we should be doing electrostatic PME
- * If q >= 2 we should be doing LJ-PME
+ /* Check if we should do calculations at this grid_index
+ * If grid_index is odd we should be doing FEP
+ * If grid_index < 2 we should be doing electrostatic PME
+ * If grid_index >= 2 we should be doing LJ-PME
*/
- if ((!pme->bFEP_q && q == 1)
- || (!pme->bFEP_lj && q == 3)
- || (!(flags & GMX_PME_DO_COULOMB) && q < 2)
- || (!(flags & GMX_PME_DO_LJ) && q >= 2))
+ if ((grid_index < DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
+ (grid_index == 1 && !pme->bFEP_q))) ||
+ (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
+ (grid_index == 3 && !pme->bFEP_lj))))
{
continue;
}
/* Unpack structure */
- pmegrid = &pme->pmegrid[q];
- fftgrid = pme->fftgrid[q];
- cfftgrid = pme->cfftgrid[q];
- pfft_setup = pme->pfft_setup[q];
- switch (q)
+ pmegrid = &pme->pmegrid[grid_index];
+ fftgrid = pme->fftgrid[grid_index];
+ cfftgrid = pme->cfftgrid[grid_index];
+ pfft_setup = pme->pfft_setup[grid_index];
+ switch (grid_index)
{
- case 0: charge = chargeA + start; break;
- case 1: charge = chargeB + start; break;
- case 2: charge = c6A + start; break;
- case 3: charge = c6B + start; break;
+ case 0: coefficient = chargeA + start; break;
+ case 1: coefficient = chargeB + start; break;
+ case 2: coefficient = c6A + start; break;
+ case 3: coefficient = c6B + start; break;
}
grid = pmegrid->grid.grid;
if (debug)
{
- fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
+ fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
cr->nnodes, cr->nodeid);
fprintf(debug, "Grid = %p\n", (void*)grid);
if (grid == NULL)
if (pme->nnodes == 1)
{
- atc->q = charge;
+ atc->coefficient = coefficient;
}
else
{
wallcycle_start(wcycle, ewcPME_REDISTXF);
- do_redist_x_q(pme, cr, start, homenr, bFirst, x, charge);
+ do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
where();
wallcycle_stop(wcycle, ewcPME_REDISTXF);
if (debug)
{
- fprintf(debug, "Node= %6d, pme local particles=%6d\n",
+ fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
cr->nodeid, atc->n);
}
- if (flags & GMX_PME_SPREAD_Q)
+ if (flags & GMX_PME_SPREAD)
{
wallcycle_start(wcycle, ewcPME_SPREADGATHER);
- /* Spread the charges on a grid */
- spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines);
+ /* Spread the coefficients on a grid */
+ spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
if (bFirst)
{
inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
}
- inc_nrnb(nrnb, eNR_SPREADQBSP,
+ inc_nrnb(nrnb, eNR_SPREADBSP,
pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
if (!pme->bUseThreads)
#ifdef GMX_MPI
if (pme->nnodes > 1)
{
- gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
where();
}
#endif
- copy_pmegrid_to_fftgrid(pme, grid, fftgrid, q);
+ copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
}
wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
/* solve in k-space for our local cells */
if (thread == 0)
{
- wallcycle_start(wcycle, (q < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+ wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
}
- if (q < DO_Q)
+ if (grid_index < DO_Q)
{
loop_count =
solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
if (thread == 0)
{
- wallcycle_stop(wcycle, (q < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+ wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
where();
inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
}
inc_nrnb(nrnb, eNR_FFT, 2*npme);
}
+ /* Note: this wallcycle region is closed below
+ outside an OpenMP region, so take care if
+ refactoring code here. */
wallcycle_start(wcycle, ewcPME_SPREADGATHER);
}
- copy_fftgrid_to_pmegrid(pme, fftgrid, grid, q, pme->nthread, thread);
+ copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
}
}
/* End of thread parallel section.
#ifdef GMX_MPI
if (pme->nnodes > 1)
{
- gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
}
#endif
where();
* atc->f is the actual force array, not a buffer,
* therefore we should not clear it.
*/
- lambda = q < DO_Q ? lambda_q : lambda_lj;
+ lambda = grid_index < DO_Q ? lambda_q : lambda_lj;
bClearF = (bFirst && PAR(cr));
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
for (thread = 0; thread < pme->nthread; thread++)
{
gather_f_bsplines(pme, grid, bClearF, atc,
&atc->spline[thread],
- pme->bFEP ? (q % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
+ pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
}
where();
inc_nrnb(nrnb, eNR_GATHERFBSP,
pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+ /* Note: this wallcycle region is opened above inside an OpenMP
+ region, so take care if refactoring code here. */
wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
}
/* This should only be called on the master thread
* and after the threads have synchronized.
*/
- if (q < 2)
+ if (grid_index < 2)
{
- get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
+ get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
}
else
{
- get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
+ get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
}
}
bFirst = FALSE;
- } /* of q-loop */
+ } /* of grid_index-loop */
/* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
* seven terms. */
- if (flags & GMX_PME_LJ_LB)
+ if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
{
- real *local_c6 = NULL, *local_sigma = NULL;
- if (pme->nnodes == 1)
- {
- if (pme->lb_buf1 == NULL)
- {
- pme->lb_buf_nalloc = pme->atc[0].n;
- snew(pme->lb_buf1, pme->lb_buf_nalloc);
- }
- pme->atc[0].q = pme->lb_buf1;
- local_c6 = c6A;
- local_sigma = sigmaA;
- }
- else
+ /* Loop over A- and B-state if we are doing FEP */
+ for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
{
- atc = &pme->atc[0];
-
- wallcycle_start(wcycle, ewcPME_REDISTXF);
-
- do_redist_x_q(pme, cr, start, homenr, bFirst, x, c6A);
- if (pme->lb_buf_nalloc < atc->n)
+ real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
+ if (pme->nnodes == 1)
{
- pme->lb_buf_nalloc = atc->nalloc;
- srenew(pme->lb_buf1, pme->lb_buf_nalloc);
- srenew(pme->lb_buf2, pme->lb_buf_nalloc);
- }
- local_c6 = pme->lb_buf1;
- for (i = 0; i < atc->n; ++i)
- {
- local_c6[i] = atc->q[i];
- }
- where();
-
- do_redist_x_q(pme, cr, start, homenr, FALSE, x, sigmaA);
- local_sigma = pme->lb_buf2;
- for (i = 0; i < atc->n; ++i)
- {
- local_sigma[i] = atc->q[i];
+ if (pme->lb_buf1 == NULL)
+ {
+ pme->lb_buf_nalloc = pme->atc[0].n;
+ snew(pme->lb_buf1, pme->lb_buf_nalloc);
+ }
+ pme->atc[0].coefficient = pme->lb_buf1;
+ switch (fep_state)
+ {
+ case 0:
+ local_c6 = c6A;
+ local_sigma = sigmaA;
+ break;
+ case 1:
+ local_c6 = c6B;
+ local_sigma = sigmaB;
+ break;
+ default:
+ gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+ }
}
- where();
-
- wallcycle_stop(wcycle, ewcPME_REDISTXF);
- }
- calc_initial_lb_coeffs(pme, local_c6, local_sigma);
-
- /*Seven terms in LJ-PME with LB, q < 2 reserved for electrostatics*/
- for (q = 2; q < 9; ++q)
- {
- /* Unpack structure */
- pmegrid = &pme->pmegrid[q];
- fftgrid = pme->fftgrid[q];
- cfftgrid = pme->cfftgrid[q];
- pfft_setup = pme->pfft_setup[q];
- calc_next_lb_coeffs(pme, local_sigma);
- grid = pmegrid->grid.grid;
- where();
-
- if (flags & GMX_PME_SPREAD_Q)
+ else
{
- wallcycle_start(wcycle, ewcPME_SPREADGATHER);
- /* Spread the charges on a grid */
- spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines);
+ atc = &pme->atc[0];
+ switch (fep_state)
+ {
+ case 0:
+ RedistC6 = c6A;
+ RedistSigma = sigmaA;
+ break;
+ case 1:
+ RedistC6 = c6B;
+ RedistSigma = sigmaB;
+ break;
+ default:
+ gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+ }
+ wallcycle_start(wcycle, ewcPME_REDISTXF);
- if (bFirst)
+ do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
+ if (pme->lb_buf_nalloc < atc->n)
{
- inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
+ pme->lb_buf_nalloc = atc->nalloc;
+ srenew(pme->lb_buf1, pme->lb_buf_nalloc);
+ srenew(pme->lb_buf2, pme->lb_buf_nalloc);
}
- inc_nrnb(nrnb, eNR_SPREADQBSP,
- pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
- if (pme->nthread == 1)
+ local_c6 = pme->lb_buf1;
+ for (i = 0; i < atc->n; ++i)
{
- wrap_periodic_pmegrid(pme, grid);
+ local_c6[i] = atc->coefficient[i];
+ }
+ where();
- /* sum contributions to local grid from other nodes */
-#ifdef GMX_MPI
- if (pme->nnodes > 1)
- {
- gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
- where();
- }
-#endif
- copy_pmegrid_to_fftgrid(pme, grid, fftgrid, q);
+ do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
+ local_sigma = pme->lb_buf2;
+ for (i = 0; i < atc->n; ++i)
+ {
+ local_sigma[i] = atc->coefficient[i];
}
+ where();
- wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+ wallcycle_stop(wcycle, ewcPME_REDISTXF);
}
+ calc_initial_lb_coeffs(pme, local_c6, local_sigma);
- /*Here we start a large thread parallel region*/
-#pragma omp parallel num_threads(pme->nthread) private(thread)
+ /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
+ for (grid_index = 2; grid_index < 9; ++grid_index)
{
- thread = gmx_omp_get_thread_num();
- if (flags & GMX_PME_SOLVE)
+ /* Unpack structure */
+ pmegrid = &pme->pmegrid[grid_index];
+ fftgrid = pme->fftgrid[grid_index];
+ cfftgrid = pme->cfftgrid[grid_index];
+ pfft_setup = pme->pfft_setup[grid_index];
+ calc_next_lb_coeffs(pme, local_sigma);
+ grid = pmegrid->grid.grid;
+ where();
+
+ if (flags & GMX_PME_SPREAD)
{
- /* do 3d-fft */
- if (thread == 0)
+ wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+ /* Spread the c6 on a grid */
+ spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
+
+ if (bFirst)
{
- wallcycle_start(wcycle, ewcPME_FFT);
+ inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
}
- gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
- thread, wcycle);
- if (thread == 0)
+ inc_nrnb(nrnb, eNR_SPREADBSP,
+ pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
+ if (pme->nthread == 1)
{
- wallcycle_stop(wcycle, ewcPME_FFT);
+ wrap_periodic_pmegrid(pme, grid);
+ /* sum contributions to local grid from other nodes */
+#ifdef GMX_MPI
+ if (pme->nnodes > 1)
+ {
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
+ where();
+ }
+#endif
+ copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
}
- where();
+ wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
}
- }
- bFirst = FALSE;
- }
- if (flags & GMX_PME_SOLVE)
- {
- int loop_count;
- /* solve in k-space for our local cells */
+ /*Here we start a large thread parallel region*/
#pragma omp parallel num_threads(pme->nthread) private(thread)
- {
- thread = gmx_omp_get_thread_num();
- if (thread == 0)
{
- wallcycle_start(wcycle, ewcLJPME);
- }
+ thread = gmx_omp_get_thread_num();
+ if (flags & GMX_PME_SOLVE)
+ {
+ /* do 3d-fft */
+ if (thread == 0)
+ {
+ wallcycle_start(wcycle, ewcPME_FFT);
+ }
- loop_count =
- solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
- box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
- bCalcEnerVir,
- pme->nthread, thread);
- if (thread == 0)
- {
- wallcycle_stop(wcycle, ewcLJPME);
- where();
- inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+ gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
+ thread, wcycle);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle, ewcPME_FFT);
+ }
+ where();
+ }
}
+ bFirst = FALSE;
}
- }
-
- if (bCalcEnerVir)
- {
- /* This should only be called on the master thread and
- * after the threads have synchronized.
- */
- get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2], vir_AB[2]);
- }
-
- if (bCalcF)
- {
- bFirst = !(flags & GMX_PME_DO_COULOMB);
- calc_initial_lb_coeffs(pme, local_c6, local_sigma);
- for (q = 8; q >= 2; --q)
+ if (flags & GMX_PME_SOLVE)
{
- /* Unpack structure */
- pmegrid = &pme->pmegrid[q];
- fftgrid = pme->fftgrid[q];
- cfftgrid = pme->cfftgrid[q];
- pfft_setup = pme->pfft_setup[q];
- grid = pmegrid->grid.grid;
- calc_next_lb_coeffs(pme, local_sigma);
- where();
+ /* solve in k-space for our local cells */
#pragma omp parallel num_threads(pme->nthread) private(thread)
{
+ int loop_count;
thread = gmx_omp_get_thread_num();
- /* do 3d-invfft */
if (thread == 0)
{
- where();
- wallcycle_start(wcycle, ewcPME_FFT);
+ wallcycle_start(wcycle, ewcLJPME);
}
- gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
- thread, wcycle);
+ loop_count =
+ solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
+ box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+ bCalcEnerVir,
+ pme->nthread, thread);
if (thread == 0)
{
- wallcycle_stop(wcycle, ewcPME_FFT);
-
+ wallcycle_stop(wcycle, ewcLJPME);
where();
+ inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+ }
+ }
+ }
- if (pme->nodeid == 0)
+ if (bCalcEnerVir)
+ {
+ /* This should only be called on the master thread and
+ * after the threads have synchronized.
+ */
+ get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
+ }
+
+ if (bCalcF)
+ {
+ bFirst = !(flags & GMX_PME_DO_COULOMB);
+ calc_initial_lb_coeffs(pme, local_c6, local_sigma);
+ for (grid_index = 8; grid_index >= 2; --grid_index)
+ {
+ /* Unpack structure */
+ pmegrid = &pme->pmegrid[grid_index];
+ fftgrid = pme->fftgrid[grid_index];
+ cfftgrid = pme->cfftgrid[grid_index];
+ pfft_setup = pme->pfft_setup[grid_index];
+ grid = pmegrid->grid.grid;
+ calc_next_lb_coeffs(pme, local_sigma);
+ where();
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+ {
+ thread = gmx_omp_get_thread_num();
+ /* do 3d-invfft */
+ if (thread == 0)
{
- ntot = pme->nkx*pme->nky*pme->nkz;
- npme = ntot*log((real)ntot)/log(2.0);
- inc_nrnb(nrnb, eNR_FFT, 2*npme);
+ where();
+ wallcycle_start(wcycle, ewcPME_FFT);
}
- wallcycle_start(wcycle, ewcPME_SPREADGATHER);
- }
- copy_fftgrid_to_pmegrid(pme, fftgrid, grid, q, pme->nthread, thread);
+ gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
+ thread, wcycle);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle, ewcPME_FFT);
+
+ where();
- } /*#pragma omp parallel*/
+ if (pme->nodeid == 0)
+ {
+ ntot = pme->nkx*pme->nky*pme->nkz;
+ npme = ntot*log((real)ntot)/log(2.0);
+ inc_nrnb(nrnb, eNR_FFT, 2*npme);
+ }
+ wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+ }
- /* distribute local grid to all nodes */
+ copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
+
+ } /*#pragma omp parallel*/
+
+ /* distribute local grid to all nodes */
#ifdef GMX_MPI
- if (pme->nnodes > 1)
- {
- gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
- }
+ if (pme->nnodes > 1)
+ {
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
+ }
#endif
- where();
+ where();
- unwrap_periodic_pmegrid(pme, grid);
+ unwrap_periodic_pmegrid(pme, grid);
- /* interpolate forces for our local atoms */
- where();
- bClearF = (bFirst && PAR(cr));
- scale = pme->bFEP ? (q < 9 ? 1.0-lambda_lj : lambda_lj) : 1.0;
- scale *= lb_scale_factor[q-2];
+ /* interpolate forces for our local atoms */
+ where();
+ bClearF = (bFirst && PAR(cr));
+ scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
+ scale *= lb_scale_factor[grid_index-2];
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
- for (thread = 0; thread < pme->nthread; thread++)
- {
- gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
- &pme->atc[0].spline[thread],
- scale);
- }
- where();
+ for (thread = 0; thread < pme->nthread; thread++)
+ {
+ gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
+ &pme->atc[0].spline[thread],
+ scale);
+ }
+ where();
- inc_nrnb(nrnb, eNR_GATHERFBSP,
- pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
- wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+ inc_nrnb(nrnb, eNR_GATHERFBSP,
+ pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+ wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
- bFirst = FALSE;
- } /*for (q = 8; q >= 2; --q)*/
- } /*if (bCalcF)*/
- } /*if (flags & GMX_PME_LJ_LB)*/
+ bFirst = FALSE;
+ } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
+ } /* if (bCalcF) */
+ } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
+ } /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
if (bCalcF && pme->nnodes > 1)
{
dd_pmeredist_f(pme, atc, n_d, f_d,
d == pme->ndecompdim-1 && pme->bPPnode);
}
- else
- {
- pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
- }
}
wallcycle_stop(wcycle, ewcPME_REDISTXF);