-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
*
- *
- * This source code is part of
- *
- * G R O M A C S
- *
- * GROningen MAchine for Chemical Simulations
- *
- * VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
*
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
*
- * And Hey:
- * GROwing Monsters And Cloning Shrimps
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
*/
/* IMPORTANT FOR DEVELOPERS:
*
* /Erik 001109
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
-#ifdef GMX_LIB_MPI
-#include <mpi.h>
-#endif
-#ifdef GMX_THREAD_MPI
-#include "tmpi.h"
-#endif
+#include "gromacs/legacyheaders/pme.h"
+
+#include "config.h"
+#include <assert.h>
+#include <math.h>
#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
-#include <math.h>
-#include <assert.h>
-#include "typedefs.h"
-#include "txtdump.h"
-#include "vec.h"
-#include "gmxcomplex.h"
-#include "smalloc.h"
-#include "futil.h"
-#include "coulomb.h"
-#include "gmx_fatal.h"
-#include "pme.h"
-#include "network.h"
-#include "physics.h"
-#include "nrnb.h"
-#include "copyrite.h"
-#include "gmx_wallcycle.h"
-#include "gmx_parallel_3dfft.h"
-#include "pdbio.h"
-#include "gmx_cyclecounter.h"
-#include "gmx_omp.h"
-#include "macros.h"
-
-/* Single precision, with SSE2 or higher available */
-#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
-
-#include "gmx_x86_simd_single.h"
-
-#define PME_SSE
-/* Some old AMD processors could have problems with unaligned loads+stores */
-#ifndef GMX_FAHCORE
-#define PME_SSE_UNALIGNED
+
+#include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/fileio/pdbio.h"
+#include "gromacs/legacyheaders/coulomb.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/math/gmxcomplex.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/timing/cyclecounter.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/futil.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/gmxomp.h"
+#include "gromacs/utility/smalloc.h"
+
+/* Include the SIMD macro file and then check for support */
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#ifdef GMX_SIMD_HAVE_REAL
+/* Turn on arbitrary width SIMD intrinsics for PME solve */
+# define PME_SIMD_SOLVE
#endif
+
+#define PME_GRID_QA 0 /* Gridindex for A-state for Q */
+#define PME_GRID_C6A 2 /* Gridindex for A-state for LJ */
+#define DO_Q 2 /* Electrostatic grids have index q<2 */
+#define DO_Q_AND_LJ 4 /* non-LB LJ grids have index 2 <= q < 4 */
+#define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
+
+/* Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
+const real lb_scale_factor[] = {
+ 1.0/64, 6.0/64, 15.0/64, 20.0/64,
+ 15.0/64, 6.0/64, 1.0/64
+};
+
+/* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
+const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
+
+/* Check if we have 4-wide SIMD macro support */
+#if (defined GMX_SIMD4_HAVE_REAL)
+/* Do PME spread and gather with 4-wide SIMD.
+ * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
+ */
+# define PME_SIMD4_SPREAD_GATHER
+
+# if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
+/* With PME-order=4 on x86, unaligned load+store is slightly faster
+ * than doubling all SIMD operations when using aligned load+store.
+ */
+# define PME_SIMD4_UNALIGNED
+# endif
#endif
#define DFT_TOL 1e-7
#define mpi_type MPI_FLOAT
#endif
-/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
+#ifdef PME_SIMD4_SPREAD_GATHER
+# define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
+#else
+/* We can use any alignment, apart from 0, so we use 4 reals */
+# define SIMD4_ALIGNMENT (4*sizeof(real))
+#endif
+
+/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
+ * to preserve alignment.
+ */
#define GMX_CACHE_SEP 64
/* We only define a maximum to be able to use local arrays without allocation.
int n;
int nalloc;
rvec *x;
- real *q;
+ real *coefficient;
rvec *f;
gmx_bool bSpread; /* These coordinates are used for spreading */
int pme_order;
ivec *idx;
- rvec *fractx; /* Fractional coordinate relative to the
- * lower cell boundary
+ rvec *fractx; /* Fractional coordinate relative to
+ * the lower cell boundary
*/
int nthread;
- int *thread_idx; /* Which thread should spread which charge */
+ int *thread_idx; /* Which thread should spread which coefficient */
thread_plist_t *thread_plist;
splinedata_t *spline;
} pme_atomcomm_t;
ivec nthread_comm; /* The number of threads to communicate with */
} pmegrids_t;
-
typedef struct {
-#ifdef PME_SSE
- /* Masks for SSE aligned spreading and gathering */
- __m128 mask_SSE0[6], mask_SSE1[6];
+#ifdef PME_SIMD4_SPREAD_GATHER
+ /* Masks for 4-wide SIMD aligned spreading and gathering */
+ gmx_simd4_bool_t mask_S0[6], mask_S1[6];
#else
- int dummy; /* C89 requires that struct has at least one member */
+ int dummy; /* C89 requires that struct has at least one member */
#endif
} pme_spline_work_t;
real * denom;
real * tmp1_alloc;
real * tmp1;
+ real * tmp2;
real * eterm;
real * m2inv;
- real energy;
- matrix vir;
+ real energy_q;
+ matrix vir_q;
+ real energy_lj;
+ matrix vir_lj;
} pme_work_t;
typedef struct gmx_pme {
MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
#endif
- int nthread; /* The number of threads doing PME */
+ gmx_bool bUseThreads; /* Does any of the PME ranks have nthread>1 ? */
+ int nthread; /* The number of threads doing PME on our rank */
gmx_bool bPPnode; /* Node also does particle-particle forces */
gmx_bool bFEP; /* Compute Free energy contribution */
+ gmx_bool bFEP_q;
+ gmx_bool bFEP_lj;
int nkx, nky, nkz; /* Grid dimensions */
gmx_bool bP3M; /* Do P3M: optimize the influence function */
int pme_order;
real epsilon_r;
- pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
- pmegrids_t pmegridB;
- /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
+ int ljpme_combination_rule; /* Type of combination rule in LJ-PME */
+
+ int ngrids; /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/
+
+ pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
+ * includes overlap Grid indices are ordered as
+ * follows:
+ * 0: Coloumb PME, state A
+ * 1: Coloumb PME, state B
+ * 2-8: LJ-PME
+ * This can probably be done in a better way
+ * but this simple hack works for now
+ */
+ /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
int pmegrid_nx, pmegrid_ny, pmegrid_nz;
/* pmegrid_nz might be larger than strictly necessary to ensure
* memory alignment, pmegrid_nz_base gives the real base size.
int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
/* Work data for spreading and gathering */
- pme_spline_work_t *spline_work;
+ pme_spline_work_t *spline_work;
- real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
- real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
- int fftgrid_nx, fftgrid_ny, fftgrid_nz;
+ real **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
+ /* inside the interpolation grid, but separate for 2D PME decomp. */
+ int fftgrid_nx, fftgrid_ny, fftgrid_nz;
- t_complex *cfftgridA; /* Grids for complex FFT data */
- t_complex *cfftgridB;
- int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
+ t_complex **cfftgrid; /* Grids for complex FFT data */
- gmx_parallel_3dfft_t pfft_setupA;
- gmx_parallel_3dfft_t pfft_setupB;
+ int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
- int *nnx, *nny, *nnz;
- real *fshx, *fshy, *fshz;
+ gmx_parallel_3dfft_t *pfft_setup;
- pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
- matrix recipbox;
- splinevec bsp_mod;
+ int *nnx, *nny, *nnz;
+ real *fshx, *fshy, *fshz;
- pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
+ pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
+ matrix recipbox;
+ splinevec bsp_mod;
+ /* Buffers to store data for local atoms for L-B combination rule
+ * calculations in LJ-PME. lb_buf1 stores either the coefficients
+ * for spreading/gathering (in serial), or the C6 coefficient for
+ * local atoms (in parallel). lb_buf2 is only used in parallel,
+ * and stores the sigma values for local atoms. */
+ real *lb_buf1, *lb_buf2;
+ int lb_buf_nalloc; /* Allocation size for the above buffers. */
- pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
+ pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
- rvec *bufv; /* Communication buffer */
- real *bufr; /* Communication buffer */
- int buf_nalloc; /* The communication buffer size */
+ pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
+
+ rvec *bufv; /* Communication buffer */
+ real *bufr; /* Communication buffer */
+ int buf_nalloc; /* The communication buffer size */
/* thread local work data for solve_pme */
pme_work_t *work;
- /* Work data for PME_redist */
- gmx_bool redist_init;
- int * scounts;
- int * rcounts;
- int * sdispls;
- int * rdispls;
- int * sidx;
- int * idxa;
- real * redist_buf;
- int redist_buf_nalloc;
-
/* Work data for sum_qgrid */
real * sum_qgrid_tmp;
real * sum_qgrid_dd_tmp;
} t_gmx_pme;
-
static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
- int start, int end, int thread)
+ int start, int grid_index, int end, int thread)
{
int i;
int *idxptr, tix, tiy, tiz;
rzy = pme->recipbox[ZZ][YY];
rzz = pme->recipbox[ZZ][ZZ];
- g2tx = pme->pmegridA.g2t[XX];
- g2ty = pme->pmegridA.g2t[YY];
- g2tz = pme->pmegridA.g2t[ZZ];
+ g2tx = pme->pmegrid[grid_index].g2t[XX];
+ g2ty = pme->pmegrid[grid_index].g2t[YY];
+ g2tz = pme->pmegrid[grid_index].g2t[ZZ];
bThreads = (atc->nthread > 1);
if (bThreads)
srenew(th[XX], nalloc);
srenew(th[YY], nalloc);
- /* In z we add padding, this is only required for the aligned SSE code */
- srenew(*ptr_z, nalloc+2*padding);
+ /* In z we add padding, this is only required for the aligned SIMD code */
+ sfree_aligned(*ptr_z);
+ snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
th[ZZ] = *ptr_z + padding;
for (i = 0; i < padding; i++)
if (atc->nslab > 1)
{
srenew(atc->x, atc->nalloc);
- srenew(atc->q, atc->nalloc);
+ srenew(atc->coefficient, atc->nalloc);
srenew(atc->f, atc->nalloc);
for (i = nalloc_old; i < atc->nalloc; i++)
{
}
}
-static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
- int n, gmx_bool bXF, rvec *x_f, real *charge,
- pme_atomcomm_t *atc)
-/* Redistribute particle data for PME calculation */
-/* domain decomposition by x coordinate */
-{
- int *idxa;
- int i, ii;
-
- if (FALSE == pme->redist_init)
- {
- snew(pme->scounts, atc->nslab);
- snew(pme->rcounts, atc->nslab);
- snew(pme->sdispls, atc->nslab);
- snew(pme->rdispls, atc->nslab);
- snew(pme->sidx, atc->nslab);
- pme->redist_init = TRUE;
- }
- if (n > pme->redist_buf_nalloc)
- {
- pme->redist_buf_nalloc = over_alloc_dd(n);
- srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
- }
-
- pme->idxa = atc->pd;
-
-#ifdef GMX_MPI
- if (forw && bXF)
- {
- /* forward, redistribution from pp to pme */
-
- /* Calculate send counts and exchange them with other nodes */
- for (i = 0; (i < atc->nslab); i++)
- {
- pme->scounts[i] = 0;
- }
- for (i = 0; (i < n); i++)
- {
- pme->scounts[pme->idxa[i]]++;
- }
- MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
-
- /* Calculate send and receive displacements and index into send
- buffer */
- pme->sdispls[0] = 0;
- pme->rdispls[0] = 0;
- pme->sidx[0] = 0;
- for (i = 1; i < atc->nslab; i++)
- {
- pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
- pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
- pme->sidx[i] = pme->sdispls[i];
- }
- /* Total # of particles to be received */
- atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
-
- pme_realloc_atomcomm_things(atc);
-
- /* Copy particle coordinates into send buffer and exchange*/
- for (i = 0; (i < n); i++)
- {
- ii = DIM*pme->sidx[pme->idxa[i]];
- pme->sidx[pme->idxa[i]]++;
- pme->redist_buf[ii+XX] = x_f[i][XX];
- pme->redist_buf[ii+YY] = x_f[i][YY];
- pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
- }
- MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
- pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
- pme->rvec_mpi, atc->mpi_comm);
- }
- if (forw)
- {
- /* Copy charge into send buffer and exchange*/
- for (i = 0; i < atc->nslab; i++)
- {
- pme->sidx[i] = pme->sdispls[i];
- }
- for (i = 0; (i < n); i++)
- {
- ii = pme->sidx[pme->idxa[i]];
- pme->sidx[pme->idxa[i]]++;
- pme->redist_buf[ii] = charge[i];
- }
- MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
- atc->q, pme->rcounts, pme->rdispls, mpi_type,
- atc->mpi_comm);
- }
- else /* backward, redistribution from pme to pp */
- {
- MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
- pme->redist_buf, pme->scounts, pme->sdispls,
- pme->rvec_mpi, atc->mpi_comm);
-
- /* Copy data from receive buffer */
- for (i = 0; i < atc->nslab; i++)
- {
- pme->sidx[i] = pme->sdispls[i];
- }
- for (i = 0; (i < n); i++)
- {
- ii = DIM*pme->sidx[pme->idxa[i]];
- x_f[i][XX] += pme->redist_buf[ii+XX];
- x_f[i][YY] += pme->redist_buf[ii+YY];
- x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
- pme->sidx[pme->idxa[i]]++;
- }
- }
-#endif
-}
-
-static void pme_dd_sendrecv(pme_atomcomm_t *atc,
- gmx_bool bBackward, int shift,
- void *buf_s, int nbyte_s,
- void *buf_r, int nbyte_r)
+static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
+ gmx_bool gmx_unused bBackward, int gmx_unused shift,
+ void gmx_unused *buf_s, int gmx_unused nbyte_s,
+ void gmx_unused *buf_r, int gmx_unused nbyte_r)
{
#ifdef GMX_MPI
int dest, src;
#endif
}
-static void dd_pmeredist_x_q(gmx_pme_t pme,
- int n, gmx_bool bX, rvec *x, real *charge,
- pme_atomcomm_t *atc)
+static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
+ int n, gmx_bool bX, rvec *x, real *data,
+ pme_atomcomm_t *atc)
{
int *commnode, *buf_index;
int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
{
if (atc->count[atc->nodeid] + nsend != n)
{
- gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
+ gmx_fatal(FARGS, "%d particles communicated to PME rank %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
"This usually means that your system is not well equilibrated.",
n - (atc->count[atc->nodeid] + nsend),
pme->nodeid, 'x'+atc->dimind);
/* Communicate the count */
if (debug)
{
- fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
+ fprintf(debug, "dimind %d PME rank %d send to rank %d: %d\n",
atc->dimind, atc->nodeid, commnode[i], scount);
}
pme_dd_sendrecv(atc, FALSE, i,
{
copy_rvec(x[i], atc->x[local_pos]);
}
- atc->q[local_pos] = charge[i];
+ atc->coefficient[local_pos] = data[i];
local_pos++;
}
else
{
copy_rvec(x[i], pme->bufv[buf_index[node]]);
}
- pme->bufr[buf_index[node]] = charge[i];
+ pme->bufr[buf_index[node]] = data[i];
buf_index[node]++;
}
}
pme->bufv[buf_pos], scount*sizeof(rvec),
atc->x[local_pos], rcount*sizeof(rvec));
}
- /* Communicate the charges */
+ /* Communicate the coefficients */
pme_dd_sendrecv(atc, FALSE, i,
pme->bufr+buf_pos, scount*sizeof(real),
- atc->q+local_pos, rcount*sizeof(real));
+ atc->coefficient+local_pos, rcount*sizeof(real));
buf_pos += scount;
local_pos += atc->rcount[i];
}
}
#ifdef GMX_MPI
-static void
-gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
+static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
{
pme_overlap_t *overlap;
int send_index0, send_nindex;
/* Since we have already (un)wrapped the overlap in the z-dimension,
* we only have to communicate 0 to nkz (not pmegrid_nz).
*/
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
send_id = overlap->send_id[ipulse];
recv_id = overlap->recv_id[ipulse];
/* Copy data to contiguous send buffer */
if (debug)
{
- fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, send_id,
pme->pmegrid_start_iy,
send_index0-pme->pmegrid_start_iy,
/* Get data from contiguous recv buffer */
if (debug)
{
- fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, recv_id,
pme->pmegrid_start_iy,
recv_index0-pme->pmegrid_start_iy,
for (k = 0; k < pme->nkz; k++)
{
iz = k;
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
}
for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
{
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
send_id = overlap->send_id[ipulse];
recv_id = overlap->recv_id[ipulse];
if (debug)
{
- fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, send_id,
pme->pmegrid_start_ix,
send_index0-pme->pmegrid_start_ix,
send_index0-pme->pmegrid_start_ix+send_nindex);
- fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, recv_id,
pme->pmegrid_start_ix,
recv_index0-pme->pmegrid_start_ix,
overlap->mpi_comm, &stat);
/* ADD data from contiguous recv buffer */
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
for (i = 0; i < recv_nindex*datasize; i++)
#endif
-static int
-copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
+static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid, int grid_index)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
ivec local_pme_size;
int pmeidx, fftidx;
/* Dimensions should be identical for A/B grid, so we just use A here */
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
{
#ifdef DEBUG_PME
FILE *fp, *fp2;
- char fn[STRLEN], format[STRLEN];
+ char fn[STRLEN];
real val;
sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
- fp = ffopen(fn, "w");
+ fp = gmx_ffopen(fn, "w");
sprintf(fn, "pmegrid%d.txt", pme->nodeid);
- fp2 = ffopen(fn, "w");
- sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
+ fp2 = gmx_ffopen(fn, "w");
#endif
for (ix = 0; ix < local_fft_ndata[XX]; ix++)
val = 100*pmegrid[pmeidx];
if (pmegrid[pmeidx] != 0)
{
- fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
- 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
+ gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
+ 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
}
if (pmegrid[pmeidx] != 0)
{
}
}
#ifdef DEBUG_PME
- ffclose(fp);
- ffclose(fp2);
+ gmx_ffclose(fp);
+ gmx_ffclose(fp2);
#endif
}
return 0;
}
-static int
-copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
- int nthread, int thread)
+static int copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, int grid_index,
+ int nthread, int thread)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
ivec local_pme_size;
c1 = omp_cyc_start();
#endif
/* Dimensions should be identical for A/B grid, so we just use A here */
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
}
-static void
-wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+static void wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
{
int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
}
-static void
-unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
{
int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
}
}
-static void clear_grid(int nx, int ny, int nz, real *grid,
- ivec fs, int *flag,
- int fx, int fy, int fz,
- int order)
-{
- int nc, ncz;
- int fsx, fsy, fsz, gx, gy, gz, g0x, g0y, x, y, z;
- int flind;
-
- nc = 2 + (order - 2)/FLBS;
- ncz = 2 + (order - 2)/FLBSZ;
-
- for (fsx = fx; fsx < fx+nc; fsx++)
- {
- for (fsy = fy; fsy < fy+nc; fsy++)
- {
- for (fsz = fz; fsz < fz+ncz; fsz++)
- {
- flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
- if (flag[flind] == 0)
- {
- gx = fsx*FLBS;
- gy = fsy*FLBS;
- gz = fsz*FLBSZ;
- g0x = (gx*ny + gy)*nz + gz;
- for (x = 0; x < FLBS; x++)
- {
- g0y = g0x;
- for (y = 0; y < FLBS; y++)
- {
- for (z = 0; z < FLBSZ; z++)
- {
- grid[g0y+z] = 0;
- }
- g0y += nz;
- }
- g0x += ny*nz;
- }
-
- flag[flind] = 1;
- }
- }
- }
- }
-}
/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
#define DO_BSPLINE(order) \
for (ithx = 0; (ithx < order); ithx++) \
{ \
index_x = (i0+ithx)*pny*pnz; \
- valx = qn*thx[ithx]; \
+ valx = coefficient*thx[ithx]; \
\
for (ithy = 0; (ithy < order); ithy++) \
{ \
}
-static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
- pme_atomcomm_t *atc, splinedata_t *spline,
- pme_spline_work_t *work)
+static void spread_coefficients_bsplines_thread(pmegrid_t *pmegrid,
+ pme_atomcomm_t *atc,
+ splinedata_t *spline,
+ pme_spline_work_t gmx_unused *work)
{
- /* spread charges from home atoms to local grid */
+ /* spread coefficients from home atoms to local grid */
real *grid;
pme_overlap_t *ol;
int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
int * idxptr;
int order, norder, index_x, index_xy, index_xyz;
- real valx, valxy, qn;
+ real valx, valxy, coefficient;
real *thx, *thy, *thz;
int localsize, bndsize;
int pnx, pny, pnz, ndatatot;
int offx, offy, offz;
+#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
+ real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
+
+ thz_aligned = gmx_simd4_align_r(thz_buffer);
+#endif
+
pnx = pmegrid->s[XX];
pny = pmegrid->s[YY];
pnz = pmegrid->s[ZZ];
for (nn = 0; nn < spline->n; nn++)
{
- n = spline->ind[nn];
- qn = atc->q[n];
+ n = spline->ind[nn];
+ coefficient = atc->coefficient[n];
- if (qn != 0)
+ if (coefficient != 0)
{
idxptr = atc->idx[n];
norder = nn*order;
switch (order)
{
case 4:
-#ifdef PME_SSE
-#ifdef PME_SSE_UNALIGNED
-#define PME_SPREAD_SSE_ORDER4
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifdef PME_SIMD4_UNALIGNED
+#define PME_SPREAD_SIMD4_ORDER4
#else
-#define PME_SPREAD_SSE_ALIGNED
+#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_BSPLINE(4);
#endif
break;
case 5:
-#ifdef PME_SSE
-#define PME_SPREAD_SSE_ALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 5
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_BSPLINE(5);
#endif
}
}
-static void set_grid_alignment(int *pmegrid_nz, int pme_order)
+static void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
{
-#ifdef PME_SSE
+#ifdef PME_SIMD4_SPREAD_GATHER
if (pme_order == 5
-#ifndef PME_SSE_UNALIGNED
+#ifndef PME_SIMD4_UNALIGNED
|| pme_order == 4
#endif
)
#endif
}
-static void set_gridsize_alignment(int *gridsize, int pme_order)
+static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
{
-#ifdef PME_SSE
-#ifndef PME_SSE_UNALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifndef PME_SIMD4_UNALIGNED
if (pme_order == 4)
{
/* Add extra elements to ensured aligned operations do not go
{
gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
set_gridsize_alignment(&gridsize, pme_order);
- snew_aligned(grid->grid, gridsize, 16);
+ snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
}
else
{
static void pmegrids_init(pmegrids_t *grids,
int nx, int ny, int nz, int nz_base,
int pme_order,
+ gmx_bool bUseThreads,
int nthread,
int overlap_x,
int overlap_y)
make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
- if (grids->nthread > 1)
+ if (bUseThreads)
{
ivec nst;
int gridsize;
set_gridsize_alignment(&gridsize, pme_order);
snew_aligned(grids->grid_all,
grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
- 16);
+ SIMD4_ALIGNMENT);
for (x = 0; x < grids->nc[XX]; x++)
{
}
}
}
+ else
+ {
+ grids->grid_th = NULL;
+ }
snew(grids->g2t, DIM);
tfac = 1;
static void realloc_work(pme_work_t *work, int nkx)
{
+ int simd_width;
+
if (nkx > work->nalloc)
{
work->nalloc = nkx;
srenew(work->mhy, work->nalloc);
srenew(work->mhz, work->nalloc);
srenew(work->m2, work->nalloc);
- /* Allocate an aligned pointer for SSE operations, including 3 extra
- * elements at the end since SSE operates on 4 elements at a time.
+ /* Allocate an aligned pointer for SIMD operations, including extra
+ * elements at the end for padding.
*/
+#ifdef PME_SIMD_SOLVE
+ simd_width = GMX_SIMD_REAL_WIDTH;
+#else
+ /* We can use any alignment, apart from 0, so we use 4 */
+ simd_width = 4;
+#endif
sfree_aligned(work->denom);
sfree_aligned(work->tmp1);
+ sfree_aligned(work->tmp2);
sfree_aligned(work->eterm);
- snew_aligned(work->denom, work->nalloc+3, 16);
- snew_aligned(work->tmp1, work->nalloc+3, 16);
- snew_aligned(work->eterm, work->nalloc+3, 16);
+ snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
+ snew_aligned(work->tmp1, work->nalloc+simd_width, simd_width*sizeof(real));
+ snew_aligned(work->tmp2, work->nalloc+simd_width, simd_width*sizeof(real));
+ snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
srenew(work->m2inv, work->nalloc);
}
}
sfree(work->m2);
sfree_aligned(work->denom);
sfree_aligned(work->tmp1);
+ sfree_aligned(work->tmp2);
sfree_aligned(work->eterm);
sfree(work->m2inv);
}
-#ifdef PME_SSE
-/* Calculate exponentials through SSE in float precision */
-inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
+#if defined PME_SIMD_SOLVE
+/* Calculate exponentials through SIMD */
+gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
{
{
- const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
- __m128 f_sse;
- __m128 lu;
- __m128 tmp_d1, d_inv, tmp_r, tmp_e;
+ const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
+ gmx_simd_real_t f_simd;
+ gmx_simd_real_t lu;
+ gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
int kx;
- f_sse = _mm_load1_ps(&f);
- for (kx = 0; kx < end; kx += 4)
+ f_simd = gmx_simd_set1_r(f);
+ /* We only need to calculate from start. But since start is 0 or 1
+ * and we want to use aligned loads/stores, we always start from 0.
+ */
+ for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
{
- tmp_d1 = _mm_load_ps(d_aligned+kx);
- lu = _mm_rcp_ps(tmp_d1);
- d_inv = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
- tmp_r = _mm_load_ps(r_aligned+kx);
- tmp_r = gmx_mm_exp_ps(tmp_r);
- tmp_e = _mm_mul_ps(f_sse, d_inv);
- tmp_e = _mm_mul_ps(tmp_e, tmp_r);
- _mm_store_ps(e_aligned+kx, tmp_e);
+ tmp_d1 = gmx_simd_load_r(d_aligned+kx);
+ d_inv = gmx_simd_inv_r(tmp_d1);
+ tmp_r = gmx_simd_load_r(r_aligned+kx);
+ tmp_r = gmx_simd_exp_r(tmp_r);
+ tmp_e = gmx_simd_mul_r(f_simd, d_inv);
+ tmp_e = gmx_simd_mul_r(tmp_e, tmp_r);
+ gmx_simd_store_r(e_aligned+kx, tmp_e);
}
}
}
#else
-inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
+gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
{
int kx;
for (kx = start; kx < end; kx++)
}
#endif
+#if defined PME_SIMD_SOLVE
+/* Calculate exponentials through SIMD */
+gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
+{
+ gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
+ const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
+ int kx;
+ for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
+ {
+ /* We only need to calculate from start. But since start is 0 or 1
+ * and we want to use aligned loads/stores, we always start from 0.
+ */
+ tmp_d = gmx_simd_load_r(d_aligned+kx);
+ d_inv = gmx_simd_inv_r(tmp_d);
+ gmx_simd_store_r(d_aligned+kx, d_inv);
+ tmp_r = gmx_simd_load_r(r_aligned+kx);
+ tmp_r = gmx_simd_exp_r(tmp_r);
+ gmx_simd_store_r(r_aligned+kx, tmp_r);
+ tmp_mk = gmx_simd_load_r(factor_aligned+kx);
+ tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
+ gmx_simd_store_r(factor_aligned+kx, tmp_fac);
+ }
+}
+#else
+gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
+{
+ int kx;
+ real mk;
+ for (kx = start; kx < end; kx++)
+ {
+ d[kx] = 1.0/d[kx];
+ }
+
+ for (kx = start; kx < end; kx++)
+ {
+ r[kx] = exp(r[kx]);
+ }
+
+ for (kx = start; kx < end; kx++)
+ {
+ mk = tmp2[kx];
+ tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk);
+ }
+}
+#endif
static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
real ewaldcoeff, real vol,
nz = pme->nkz;
/* Dimensions should be identical for A/B grid, so we just use A here */
- gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA],
complex_order,
local_ndata,
local_offset,
p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
/* We should skip the k-space point (0,0,0) */
+ /* Note that since here x is the minor index, local_offset[XX]=0 */
if (local_offset[XX] > 0 || ky > 0 || kz > 0)
{
kxstart = local_offset[XX];
m2inv[kx] = 1.0/m2[kx];
}
- calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+ calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
for (kx = kxstart; kx < kxend; kx++, p0++)
{
tmp1[kx] = -factor*m2k;
}
- calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+ calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
for (kx = kxstart; kx < kxend; kx++, p0++)
{
* experiencing problems on semiisotropic membranes.
* IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
*/
- work->vir[XX][XX] = 0.25*virxx;
- work->vir[YY][YY] = 0.25*viryy;
- work->vir[ZZ][ZZ] = 0.25*virzz;
- work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
- work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
- work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
+ work->vir_q[XX][XX] = 0.25*virxx;
+ work->vir_q[YY][YY] = 0.25*viryy;
+ work->vir_q[ZZ][ZZ] = 0.25*virzz;
+ work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25*virxy;
+ work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25*virxz;
+ work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25*viryz;
/* This energy should be corrected for a charged system */
- work->energy = 0.5*energy;
+ work->energy_q = 0.5*energy;
}
/* Return the loop count */
return local_ndata[YY]*local_ndata[XX];
}
-static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
- real *mesh_energy, matrix vir)
+static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB,
+ real ewaldcoeff, real vol,
+ gmx_bool bEnerVir, int nthread, int thread)
{
- /* This function sums output over threads
- * and should therefore only be called after thread synchronization.
- */
- int thread;
-
- *mesh_energy = pme->work[0].energy;
- copy_mat(pme->work[0].vir, vir);
-
- for (thread = 1; thread < nthread; thread++)
- {
- *mesh_energy += pme->work[thread].energy;
- m_add(vir, pme->work[thread].vir, vir);
- }
-}
-
-#define DO_FSPLINE(order) \
- for (ithx = 0; (ithx < order); ithx++) \
- { \
- index_x = (i0+ithx)*pny*pnz; \
- tx = thx[ithx]; \
- dx = dthx[ithx]; \
- \
- for (ithy = 0; (ithy < order); ithy++) \
- { \
- index_xy = index_x+(j0+ithy)*pnz; \
- ty = thy[ithy]; \
- dy = dthy[ithy]; \
- fxy1 = fz1 = 0; \
- \
- for (ithz = 0; (ithz < order); ithz++) \
- { \
- gval = grid[index_xy+(k0+ithz)]; \
- fxy1 += thz[ithz]*gval; \
- fz1 += dthz[ithz]*gval; \
- } \
- fx += dx*ty*fxy1; \
- fy += tx*dy*fxy1; \
- fz += tx*ty*fz1; \
- } \
- }
-
-
-static void gather_f_bsplines(gmx_pme_t pme, real *grid,
- gmx_bool bClearF, pme_atomcomm_t *atc,
- splinedata_t *spline,
- real scale)
-{
- /* sum forces for local particles */
- int nn, n, ithx, ithy, ithz, i0, j0, k0;
- int index_x, index_xy;
- int nx, ny, nz, pnx, pny, pnz;
- int * idxptr;
- real tx, ty, dx, dy, qn;
- real fx, fy, fz, gval;
- real fxy1, fz1;
- real *thx, *thy, *thz, *dthx, *dthy, *dthz;
- int norder;
+ /* do recip sum over local cells in grid */
+ /* y major, z middle, x minor or continuous */
+ int ig, gcount;
+ int kx, ky, kz, maxkx, maxky, maxkz;
+ int nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
+ real mx, my, mz;
+ real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
+ real ets2, ets2vf;
+ real eterm, vterm, d1, d2, energy = 0;
+ real by, bz;
+ real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
real rxx, ryx, ryy, rzx, rzy, rzz;
- int order;
+ real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
+ real mhxk, mhyk, mhzk, m2k;
+ real mk;
+ pme_work_t *work;
+ real corner_fac;
+ ivec complex_order;
+ ivec local_ndata, local_offset, local_size;
+ nx = pme->nkx;
+ ny = pme->nky;
+ nz = pme->nkz;
- pme_spline_work_t *work;
+ /* Dimensions should be identical for A/B grid, so we just use A here */
+ gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A],
+ complex_order,
+ local_ndata,
+ local_offset,
+ local_size);
+ rxx = pme->recipbox[XX][XX];
+ ryx = pme->recipbox[YY][XX];
+ ryy = pme->recipbox[YY][YY];
+ rzx = pme->recipbox[ZZ][XX];
+ rzy = pme->recipbox[ZZ][YY];
+ rzz = pme->recipbox[ZZ][ZZ];
- work = pme->spline_work;
+ maxkx = (nx+1)/2;
+ maxky = (ny+1)/2;
+ maxkz = nz/2+1;
- order = pme->pme_order;
- thx = spline->theta[XX];
- thy = spline->theta[YY];
- thz = spline->theta[ZZ];
- dthx = spline->dtheta[XX];
- dthy = spline->dtheta[YY];
- dthz = spline->dtheta[ZZ];
- nx = pme->nkx;
- ny = pme->nky;
- nz = pme->nkz;
- pnx = pme->pmegrid_nx;
- pny = pme->pmegrid_ny;
- pnz = pme->pmegrid_nz;
+ work = &pme->work[thread];
+ mhx = work->mhx;
+ mhy = work->mhy;
+ mhz = work->mhz;
+ m2 = work->m2;
+ denom = work->denom;
+ tmp1 = work->tmp1;
+ tmp2 = work->tmp2;
- rxx = pme->recipbox[XX][XX];
- ryx = pme->recipbox[YY][XX];
- ryy = pme->recipbox[YY][YY];
- rzx = pme->recipbox[ZZ][XX];
- rzy = pme->recipbox[ZZ][YY];
- rzz = pme->recipbox[ZZ][ZZ];
+ iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
+ iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
- for (nn = 0; nn < spline->n; nn++)
+ for (iyz = iyz0; iyz < iyz1; iyz++)
{
- n = spline->ind[nn];
- qn = scale*atc->q[n];
+ iy = iyz/local_ndata[ZZ];
+ iz = iyz - iy*local_ndata[ZZ];
- if (bClearF)
+ ky = iy + local_offset[YY];
+
+ if (ky < maxky)
{
- atc->f[n][XX] = 0;
- atc->f[n][YY] = 0;
- atc->f[n][ZZ] = 0;
+ my = ky;
}
- if (qn != 0)
+ else
{
- fx = 0;
- fy = 0;
- fz = 0;
- idxptr = atc->idx[n];
- norder = nn*order;
+ my = (ky - ny);
+ }
+
+ by = 3.0*vol*pme->bsp_mod[YY][ky]
+ / (M_PI*sqrt(M_PI)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
+
+ kz = iz + local_offset[ZZ];
+
+ mz = kz;
+
+ bz = pme->bsp_mod[ZZ][kz];
+
+ /* 0.5 correction for corner points */
+ corner_fac = 1;
+ if (kz == 0 || kz == (nz+1)/2)
+ {
+ corner_fac = 0.5;
+ }
+
+ kxstart = local_offset[XX];
+ kxend = local_offset[XX] + local_ndata[XX];
+ if (bEnerVir)
+ {
+ /* More expensive inner loop, especially because of the
+ * storage of the mh elements in array's. Because x is the
+ * minor grid index, all mh elements depend on kx for
+ * triclinic unit cells.
+ */
+
+ /* Two explicit loops to avoid a conditional inside the loop */
+ for (kx = kxstart; kx < maxkx; kx++)
+ {
+ mx = kx;
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ mhx[kx] = mhxk;
+ mhy[kx] = mhyk;
+ mhz[kx] = mhzk;
+ m2[kx] = m2k;
+ denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ tmp2[kx] = sqrt(factor*m2k);
+ }
+
+ for (kx = maxkx; kx < kxend; kx++)
+ {
+ mx = (kx - nx);
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ mhx[kx] = mhxk;
+ mhy[kx] = mhyk;
+ mhz[kx] = mhzk;
+ m2[kx] = m2k;
+ denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ tmp2[kx] = sqrt(factor*m2k);
+ }
+
+ calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
+
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ m2k = factor*m2[kx];
+ eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
+ + 2.0*m2k*tmp2[kx]);
+ vterm = 3.0*(-tmp1[kx] + tmp2[kx]);
+ tmp1[kx] = eterm*denom[kx];
+ tmp2[kx] = vterm*denom[kx];
+ }
+
+ if (!bLB)
+ {
+ t_complex *p0;
+ real struct2;
+
+ p0 = grid[0] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+ for (kx = kxstart; kx < kxend; kx++, p0++)
+ {
+ d1 = p0->re;
+ d2 = p0->im;
+
+ eterm = tmp1[kx];
+ vterm = tmp2[kx];
+ p0->re = d1*eterm;
+ p0->im = d2*eterm;
+
+ struct2 = 2.0*(d1*d1+d2*d2);
+
+ tmp1[kx] = eterm*struct2;
+ tmp2[kx] = vterm*struct2;
+ }
+ }
+ else
+ {
+ real *struct2 = denom;
+ real str2;
+
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ struct2[kx] = 0.0;
+ }
+ /* Due to symmetry we only need to calculate 4 of the 7 terms */
+ for (ig = 0; ig <= 3; ++ig)
+ {
+ t_complex *p0, *p1;
+ real scale;
+
+ p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+ p1 = grid[6-ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+ scale = 2.0*lb_scale_factor_symm[ig];
+ for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
+ {
+ struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
+ }
+
+ }
+ for (ig = 0; ig <= 6; ++ig)
+ {
+ t_complex *p0;
+
+ p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+ for (kx = kxstart; kx < kxend; kx++, p0++)
+ {
+ d1 = p0->re;
+ d2 = p0->im;
+
+ eterm = tmp1[kx];
+ p0->re = d1*eterm;
+ p0->im = d2*eterm;
+ }
+ }
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ eterm = tmp1[kx];
+ vterm = tmp2[kx];
+ str2 = struct2[kx];
+ tmp1[kx] = eterm*str2;
+ tmp2[kx] = vterm*str2;
+ }
+ }
+
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ ets2 = corner_fac*tmp1[kx];
+ vterm = 2.0*factor*tmp2[kx];
+ energy += ets2;
+ ets2vf = corner_fac*vterm;
+ virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
+ virxy += ets2vf*mhx[kx]*mhy[kx];
+ virxz += ets2vf*mhx[kx]*mhz[kx];
+ viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
+ viryz += ets2vf*mhy[kx]*mhz[kx];
+ virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
+ }
+ }
+ else
+ {
+ /* We don't need to calculate the energy and the virial.
+ * In this case the triclinic overhead is small.
+ */
+
+ /* Two explicit loops to avoid a conditional inside the loop */
+
+ for (kx = kxstart; kx < maxkx; kx++)
+ {
+ mx = kx;
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ m2[kx] = m2k;
+ denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ tmp2[kx] = sqrt(factor*m2k);
+ }
+
+ for (kx = maxkx; kx < kxend; kx++)
+ {
+ mx = (kx - nx);
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ m2[kx] = m2k;
+ denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ tmp2[kx] = sqrt(factor*m2k);
+ }
+
+ calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
+
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ m2k = factor*m2[kx];
+ eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
+ + 2.0*m2k*tmp2[kx]);
+ tmp1[kx] = eterm*denom[kx];
+ }
+ gcount = (bLB ? 7 : 1);
+ for (ig = 0; ig < gcount; ++ig)
+ {
+ t_complex *p0;
+
+ p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+ for (kx = kxstart; kx < kxend; kx++, p0++)
+ {
+ d1 = p0->re;
+ d2 = p0->im;
+
+ eterm = tmp1[kx];
+
+ p0->re = d1*eterm;
+ p0->im = d2*eterm;
+ }
+ }
+ }
+ }
+ if (bEnerVir)
+ {
+ work->vir_lj[XX][XX] = 0.25*virxx;
+ work->vir_lj[YY][YY] = 0.25*viryy;
+ work->vir_lj[ZZ][ZZ] = 0.25*virzz;
+ work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
+ work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
+ work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;
+
+ /* This energy should be corrected for a charged system */
+ work->energy_lj = 0.5*energy;
+ }
+ /* Return the loop count */
+ return local_ndata[YY]*local_ndata[XX];
+}
+
+static void get_pme_ener_vir_q(const gmx_pme_t pme, int nthread,
+ real *mesh_energy, matrix vir)
+{
+ /* This function sums output over threads and should therefore
+ * only be called after thread synchronization.
+ */
+ int thread;
+
+ *mesh_energy = pme->work[0].energy_q;
+ copy_mat(pme->work[0].vir_q, vir);
+
+ for (thread = 1; thread < nthread; thread++)
+ {
+ *mesh_energy += pme->work[thread].energy_q;
+ m_add(vir, pme->work[thread].vir_q, vir);
+ }
+}
+
+static void get_pme_ener_vir_lj(const gmx_pme_t pme, int nthread,
+ real *mesh_energy, matrix vir)
+{
+ /* This function sums output over threads and should therefore
+ * only be called after thread synchronization.
+ */
+ int thread;
+
+ *mesh_energy = pme->work[0].energy_lj;
+ copy_mat(pme->work[0].vir_lj, vir);
+
+ for (thread = 1; thread < nthread; thread++)
+ {
+ *mesh_energy += pme->work[thread].energy_lj;
+ m_add(vir, pme->work[thread].vir_lj, vir);
+ }
+}
+
+
+#define DO_FSPLINE(order) \
+ for (ithx = 0; (ithx < order); ithx++) \
+ { \
+ index_x = (i0+ithx)*pny*pnz; \
+ tx = thx[ithx]; \
+ dx = dthx[ithx]; \
+ \
+ for (ithy = 0; (ithy < order); ithy++) \
+ { \
+ index_xy = index_x+(j0+ithy)*pnz; \
+ ty = thy[ithy]; \
+ dy = dthy[ithy]; \
+ fxy1 = fz1 = 0; \
+ \
+ for (ithz = 0; (ithz < order); ithz++) \
+ { \
+ gval = grid[index_xy+(k0+ithz)]; \
+ fxy1 += thz[ithz]*gval; \
+ fz1 += dthz[ithz]*gval; \
+ } \
+ fx += dx*ty*fxy1; \
+ fy += tx*dy*fxy1; \
+ fz += tx*ty*fz1; \
+ } \
+ }
+
+
+static void gather_f_bsplines(gmx_pme_t pme, real *grid,
+ gmx_bool bClearF, pme_atomcomm_t *atc,
+ splinedata_t *spline,
+ real scale)
+{
+ /* sum forces for local particles */
+ int nn, n, ithx, ithy, ithz, i0, j0, k0;
+ int index_x, index_xy;
+ int nx, ny, nz, pnx, pny, pnz;
+ int * idxptr;
+ real tx, ty, dx, dy, coefficient;
+ real fx, fy, fz, gval;
+ real fxy1, fz1;
+ real *thx, *thy, *thz, *dthx, *dthy, *dthz;
+ int norder;
+ real rxx, ryx, ryy, rzx, rzy, rzz;
+ int order;
+
+ pme_spline_work_t *work;
+
+#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
+ real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
+ real dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
+
+ thz_aligned = gmx_simd4_align_r(thz_buffer);
+ dthz_aligned = gmx_simd4_align_r(dthz_buffer);
+#endif
+
+ work = pme->spline_work;
+
+ order = pme->pme_order;
+ thx = spline->theta[XX];
+ thy = spline->theta[YY];
+ thz = spline->theta[ZZ];
+ dthx = spline->dtheta[XX];
+ dthy = spline->dtheta[YY];
+ dthz = spline->dtheta[ZZ];
+ nx = pme->nkx;
+ ny = pme->nky;
+ nz = pme->nkz;
+ pnx = pme->pmegrid_nx;
+ pny = pme->pmegrid_ny;
+ pnz = pme->pmegrid_nz;
+
+ rxx = pme->recipbox[XX][XX];
+ ryx = pme->recipbox[YY][XX];
+ ryy = pme->recipbox[YY][YY];
+ rzx = pme->recipbox[ZZ][XX];
+ rzy = pme->recipbox[ZZ][YY];
+ rzz = pme->recipbox[ZZ][ZZ];
+
+ for (nn = 0; nn < spline->n; nn++)
+ {
+ n = spline->ind[nn];
+ coefficient = scale*atc->coefficient[n];
+
+ if (bClearF)
+ {
+ atc->f[n][XX] = 0;
+ atc->f[n][YY] = 0;
+ atc->f[n][ZZ] = 0;
+ }
+ if (coefficient != 0)
+ {
+ fx = 0;
+ fy = 0;
+ fz = 0;
+ idxptr = atc->idx[n];
+ norder = nn*order;
i0 = idxptr[XX];
j0 = idxptr[YY];
switch (order)
{
case 4:
-#ifdef PME_SSE
-#ifdef PME_SSE_UNALIGNED
-#define PME_GATHER_F_SSE_ORDER4
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifdef PME_SIMD4_UNALIGNED
+#define PME_GATHER_F_SIMD4_ORDER4
#else
-#define PME_GATHER_F_SSE_ALIGNED
+#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_FSPLINE(4);
#endif
break;
case 5:
-#ifdef PME_SSE
-#define PME_GATHER_F_SSE_ALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 5
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_FSPLINE(5);
#endif
break;
}
- atc->f[n][XX] += -qn*( fx*nx*rxx );
- atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
- atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
+ atc->f[n][XX] += -coefficient*( fx*nx*rxx );
+ atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
+ atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
}
}
/* Since the energy and not forces are interpolated
int n, ithx, ithy, ithz, i0, j0, k0;
int index_x, index_xy;
int * idxptr;
- real energy, pot, tx, ty, qn, gval;
+ real energy, pot, tx, ty, coefficient, gval;
real *thx, *thy, *thz;
int norder;
int order;
energy = 0;
for (n = 0; (n < atc->n); n++)
{
- qn = atc->q[n];
+ coefficient = atc->coefficient[n];
- if (qn != 0)
+ if (coefficient != 0)
{
idxptr = atc->idx[n];
norder = n*order;
}
}
- energy += pot*qn;
+ energy += pot*coefficient;
}
}
}
void make_bsplines(splinevec theta, splinevec dtheta, int order,
- rvec fractx[], int nr, int ind[], real charge[],
- gmx_bool bFreeEnergy)
+ rvec fractx[], int nr, int ind[], real coefficient[],
+ gmx_bool bDoSplines)
{
/* construct splines for local atoms */
int i, ii;
for (i = 0; i < nr; i++)
{
- /* With free energy we do not use the charge check.
+ /* With free energy we do not use the coefficient check.
* In most cases this will be more efficient than calling make_bsplines
- * twice, since usually more than half the particles have charges.
+ * twice, since usually more than half the particles have non-zero coefficients.
*/
ii = ind[i];
- if (bFreeEnergy || charge[ii] != 0.0)
+ if (bDoSplines || coefficient[ii] != 0.0)
{
xptr = fractx[ii];
switch (order)
int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
{
- int thread;
+ int thread, i;
if (NULL != log)
{
sfree((*pmedata)->nny);
sfree((*pmedata)->nnz);
- pmegrids_destroy(&(*pmedata)->pmegridA);
-
- sfree((*pmedata)->fftgridA);
- sfree((*pmedata)->cfftgridA);
- gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
-
- if ((*pmedata)->pmegridB.grid.grid != NULL)
+ for (i = 0; i < (*pmedata)->ngrids; ++i)
{
- pmegrids_destroy(&(*pmedata)->pmegridB);
- sfree((*pmedata)->fftgridB);
- sfree((*pmedata)->cfftgridB);
- gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
+ pmegrids_destroy(&(*pmedata)->pmegrid[i]);
+ sfree((*pmedata)->fftgrid[i]);
+ sfree((*pmedata)->cfftgrid[i]);
+ gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
}
+
+ sfree((*pmedata)->lb_buf1);
+ sfree((*pmedata)->lb_buf2);
+
for (thread = 0; thread < (*pmedata)->nthread; thread++)
{
free_work(&(*pmedata)->work[thread]);
return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
}
-static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc, t_commrec *cr,
+static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
int dimind, gmx_bool bSpread)
{
int nk, k, s, thread;
if (atc->nslab > 1)
{
- /* These three allocations are not required for particle decomp. */
snew(atc->node_dest, atc->nslab);
snew(atc->node_src, atc->nslab);
setup_coordinate_communication(atc);
{
/* s2g0 the local interpolation grid start.
* s2g1 the local interpolation grid end.
- * Because grid overlap communication only goes forward,
- * the grid the slabs for fft's should be rounded down.
+ * Since in calc_pidx we divide particles, and not grid lines,
+ * spatially uniform along dimension x or y, we need to round
+ * s2g0 down and s2g1 up.
*/
ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
*fraction_shift = fsh;
}
-static pme_spline_work_t *make_pme_spline_work(int order)
+static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
{
pme_spline_work_t *work;
-#ifdef PME_SSE
- float tmp[8];
- __m128 zero_SSE;
- int of, i;
+#ifdef PME_SIMD4_SPREAD_GATHER
+ real tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
+ gmx_simd4_real_t zero_S;
+ gmx_simd4_real_t real_mask_S0, real_mask_S1;
+ int of, i;
+
+ snew_aligned(work, 1, SIMD4_ALIGNMENT);
- snew_aligned(work, 1, 16);
+ tmp_aligned = gmx_simd4_align_r(tmp);
- zero_SSE = _mm_setzero_ps();
+ zero_S = gmx_simd4_setzero_r();
/* Generate bit masks to mask out the unused grid entries,
* as we only operate on order of the 8 grid entries that are
- * load into 2 SSE float registers.
+ * load into 2 SIMD registers.
*/
- for (of = 0; of < 8-(order-1); of++)
+ for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
{
- for (i = 0; i < 8; i++)
+ for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
{
- tmp[i] = (i >= of && i < of+order ? 1 : 0);
+ tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
}
- work->mask_SSE0[of] = _mm_loadu_ps(tmp);
- work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
- work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
- work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
+ real_mask_S0 = gmx_simd4_load_r(tmp_aligned);
+ real_mask_S1 = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
+ work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
+ work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
}
#else
work = NULL;
return work;
}
-static void
-gmx_pme_check_grid_restrictions(FILE *fplog, char dim, int nnodes, int *nk)
+void gmx_pme_check_restrictions(int pme_order,
+ int nkx, int nky, int nkz,
+ int nnodes_major,
+ int nnodes_minor,
+ gmx_bool bUseThreads,
+ gmx_bool bFatal,
+ gmx_bool *bValidSettings)
{
- int nk_new;
-
- if (*nk % nnodes != 0)
+ if (pme_order > PME_ORDER_MAX)
{
- nk_new = nnodes*(*nk/nnodes + 1);
+ if (!bFatal)
+ {
+ *bValidSettings = FALSE;
+ return;
+ }
+ gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
+ pme_order, PME_ORDER_MAX);
+ }
- if (2*nk_new >= 3*(*nk))
+ if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
+ nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
+ nkz <= pme_order)
+ {
+ if (!bFatal)
{
- gmx_fatal(FARGS, "The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
- dim, *nk, dim, nnodes, dim);
+ *bValidSettings = FALSE;
+ return;
}
+ gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
+ pme_order);
+ }
- if (fplog != NULL)
+ /* Check for a limitation of the (current) sum_fftgrid_dd code.
+ * We only allow multiple communication pulses in dim 1, not in dim 0.
+ */
+ if (bUseThreads && (nkx < nnodes_major*pme_order &&
+ nkx != nnodes_major*(pme_order - 1)))
+ {
+ if (!bFatal)
{
- fprintf(fplog, "\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
- dim, *nk, dim, nnodes, dim, nk_new, dim);
+ *bValidSettings = FALSE;
+ return;
}
+ gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
+ nkx/(double)nnodes_major, pme_order);
+ }
- *nk = nk_new;
+ if (bValidSettings != NULL)
+ {
+ *bValidSettings = TRUE;
}
+
+ return;
}
int gmx_pme_init(gmx_pme_t * pmedata,
int nnodes_minor,
t_inputrec * ir,
int homenr,
- gmx_bool bFreeEnergy,
+ gmx_bool bFreeEnergy_q,
+ gmx_bool bFreeEnergy_lj,
gmx_bool bReproducible,
int nthread)
{
gmx_pme_t pme = NULL;
- pme_atomcomm_t *atc;
+ int use_threads, sum_use_threads, i;
ivec ndata;
if (debug)
}
snew(pme, 1);
- pme->redist_init = FALSE;
pme->sum_qgrid_tmp = NULL;
pme->sum_qgrid_dd_tmp = NULL;
pme->buf_nalloc = 0;
- pme->redist_buf_nalloc = 0;
pme->nnodes = 1;
pme->bPPnode = TRUE;
MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
if (pme->nnodes != nnodes_major*nnodes_minor)
{
- gmx_incons("PME node count mismatch");
+ gmx_incons("PME rank count mismatch");
}
}
else
{
if (pme->nnodes % nnodes_major != 0)
{
- gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
+ gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
}
pme->ndecompdim = 2;
pme->nthread = nthread;
+ /* Check if any of the PME MPI ranks uses threads */
+ use_threads = (pme->nthread > 1 ? 1 : 0);
+#ifdef GMX_MPI
+ if (pme->nnodes > 1)
+ {
+ MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
+ MPI_SUM, pme->mpi_comm);
+ }
+ else
+#endif
+ {
+ sum_use_threads = use_threads;
+ }
+ pme->bUseThreads = (sum_use_threads > 0);
+
if (ir->ePBC == epbcSCREW)
{
gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
}
- pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
+ pme->bFEP_q = ((ir->efep != efepNO) && bFreeEnergy_q);
+ pme->bFEP_lj = ((ir->efep != efepNO) && bFreeEnergy_lj);
+ pme->bFEP = (pme->bFEP_q || pme->bFEP_lj);
pme->nkx = ir->nkx;
pme->nky = ir->nky;
pme->nkz = ir->nkz;
pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
pme->pme_order = ir->pme_order;
- pme->epsilon_r = ir->epsilon_r;
-
- if (pme->pme_order > PME_ORDER_MAX)
- {
- gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
- pme->pme_order, PME_ORDER_MAX);
- }
- /* Currently pme.c supports only the fft5d FFT code.
- * Therefore the grid always needs to be divisible by nnodes.
- * When the old 1D code is also supported again, change this check.
- *
- * This check should be done before calling gmx_pme_init
- * and fplog should be passed iso stderr.
- *
- if (pme->ndecompdim >= 2)
- */
- if (pme->ndecompdim >= 1)
- {
- /*
- gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
- 'x',nnodes_major,&pme->nkx);
- gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
- 'y',nnodes_minor,&pme->nky);
- */
- }
+ /* Always constant electrostatics coefficients */
+ pme->epsilon_r = ir->epsilon_r;
- if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
- pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
- pme->nkz <= pme->pme_order)
- {
- gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order", pme->pme_order);
- }
+ /* Always constant LJ coefficients */
+ pme->ljpme_combination_rule = ir->ljpme_combination_rule;
+
+ /* If we violate restrictions, generate a fatal error here */
+ gmx_pme_check_restrictions(pme->pme_order,
+ pme->nkx, pme->nky, pme->nkz,
+ pme->nnodes_major,
+ pme->nnodes_minor,
+ pme->bUseThreads,
+ TRUE,
+ NULL);
if (pme->nnodes > 1)
{
MPI_Type_commit(&(pme->rvec_mpi));
#endif
- /* Note that the charge spreading and force gathering, which usually
+ /* Note that the coefficient spreading and force gathering, which usually
* takes about the same amount of time as FFT+solve_pme,
* is always fully load balanced
- * (unless the charge distribution is inhomogeneous).
+ * (unless the coefficient distribution is inhomogeneous).
*/
imbal = pme_load_imbalance(pme);
"\n"
"NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
" For optimal PME load balancing\n"
- " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
- " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
+ " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
+ " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
"\n",
(int)((imbal-1)*100 + 0.5),
pme->nkx, pme->nky, pme->nnodes_major,
pme->nky,
(div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
- /* Check for a limitation of the (current) sum_fftgrid_dd code.
- * We only allow multiple communication pulses in dim 1, not in dim 0.
+ /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
+ * Note that gmx_pme_check_restrictions checked for this already.
*/
- if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 ||
- pme->nkx < pme->nnodes_major*pme->pme_order))
+ if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
{
- gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x and should be >= pme_order (%d). To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
- pme->nkx/(double)pme->nnodes_major, pme->pme_order);
+ gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
}
snew(pme->bsp_mod[XX], pme->nkx);
pme->pmegrid_nz_base,
&pme->nnz, &pme->fshz);
- pmegrids_init(&pme->pmegridA,
- pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
- pme->pmegrid_nz_base,
- pme->pme_order,
- pme->nthread,
- pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
- pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
-
pme->spline_work = make_pme_spline_work(pme->pme_order);
- ndata[0] = pme->nkx;
- ndata[1] = pme->nky;
- ndata[2] = pme->nkz;
-
- /* This routine will allocate the grid data to fit the FFTs */
- gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
- &pme->fftgridA, &pme->cfftgridA,
- pme->mpi_comm_d,
- pme->overlap[0].s2g0, pme->overlap[1].s2g0,
- bReproducible, pme->nthread);
-
- if (bFreeEnergy)
- {
- pmegrids_init(&pme->pmegridB,
- pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
- pme->pmegrid_nz_base,
- pme->pme_order,
- pme->nthread,
- pme->nkx % pme->nnodes_major != 0,
- pme->nky % pme->nnodes_minor != 0);
-
- gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
- &pme->fftgridB, &pme->cfftgridB,
- pme->mpi_comm_d,
- pme->overlap[0].s2g0, pme->overlap[1].s2g0,
- bReproducible, pme->nthread);
+ ndata[0] = pme->nkx;
+ ndata[1] = pme->nky;
+ ndata[2] = pme->nkz;
+ /* It doesn't matter if we allocate too many grids here,
+ * we only allocate and use the ones we need.
+ */
+ if (EVDW_PME(ir->vdwtype))
+ {
+ pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
}
else
{
- pme->pmegridB.grid.grid = NULL;
- pme->fftgridB = NULL;
- pme->cfftgridB = NULL;
+ pme->ngrids = DO_Q;
+ }
+ snew(pme->fftgrid, pme->ngrids);
+ snew(pme->cfftgrid, pme->ngrids);
+ snew(pme->pfft_setup, pme->ngrids);
+
+ for (i = 0; i < pme->ngrids; ++i)
+ {
+ if ((i < DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
+ bFreeEnergy_q)) ||
+ (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
+ bFreeEnergy_lj ||
+ ir->ljpme_combination_rule == eljpmeLB)))
+ {
+ pmegrids_init(&pme->pmegrid[i],
+ pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
+ pme->pmegrid_nz_base,
+ pme->pme_order,
+ pme->bUseThreads,
+ pme->nthread,
+ pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
+ pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
+ /* This routine will allocate the grid data to fit the FFTs */
+ gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
+ &pme->fftgrid[i], &pme->cfftgrid[i],
+ pme->mpi_comm_d,
+ bReproducible, pme->nthread);
+
+ }
}
if (!pme->bP3M)
}
/* Use atc[0] for spreading */
- init_atomcomm(pme, &pme->atc[0], cr, nnodes_major > 1 ? 0 : 1, TRUE);
+ init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
if (pme->ndecompdim >= 2)
{
- init_atomcomm(pme, &pme->atc[1], cr, 1, FALSE);
+ init_atomcomm(pme, &pme->atc[1], 1, FALSE);
}
if (pme->nnodes == 1)
pme_realloc_atomcomm_things(&pme->atc[0]);
}
+ pme->lb_buf1 = NULL;
+ pme->lb_buf2 = NULL;
+ pme->lb_buf_nalloc = 0;
+
{
int thread;
sfree_aligned(new->grid.grid);
new->grid.grid = old->grid.grid;
- if (new->nthread > 1 && new->nthread == old->nthread)
+ if (new->grid_th != NULL && new->nthread == old->nthread)
{
sfree_aligned(new->grid_all);
for (t = 0; t < new->nthread; t++)
}
ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
- &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
+ &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, pme_src->nthread);
if (ret == 0)
{
/* We can easily reuse the allocated pme grids in pme_src */
- reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
+ reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
/* We would like to reuse the fft grids, but that's harder */
}
}
-static void copy_local_grid(gmx_pme_t pme,
- pmegrids_t *pmegrids, int thread, real *fftgrid)
+static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
+ int grid_index, int thread, real *fftgrid)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
int fft_my, fft_mz;
pmegrid_t *pmegrid;
real *grid_th;
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
static void
reduce_threadgrid_overlap(gmx_pme_t pme,
const pmegrids_t *pmegrids, int thread,
- real *fftgrid, real *commbuf_x, real *commbuf_y)
+ real *fftgrid, real *commbuf_x, real *commbuf_y,
+ int grid_index)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
int fft_nx, fft_ny, fft_nz;
int fft_my, fft_mz;
int buf_my = -1;
int nsx, nsy, nsz;
- ivec ne;
+ ivec localcopy_end;
int offx, offy, offz, x, y, z, i0, i0t;
int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
const real *grid_th;
real *commbuf = NULL;
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
for (d = 0; d < DIM; d++)
{
- ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
- local_fft_ndata[d]);
+ /* Determine up to where our thread needs to copy from the
+ * thread-local charge spreading grid to the rank-local FFT grid.
+ * This is up to our spreading grid end minus order-1 and
+ * not beyond the local FFT grid.
+ */
+ localcopy_end[d] =
+ min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+ local_fft_ndata[d]);
}
offx = pmegrid->offset[XX];
/* Now loop over all the thread data blocks that contribute
* to the grid region we (our thread) are operating on.
*/
- /* Note that ffy_nx/y is equal to the number of grid points
+ /* Note that fft_nx/y is equal to the number of grid points
* between the first point of our node grid and the one of the next node.
*/
for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
}
pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
ox += pmegrid_g->offset[XX];
+ /* Determine the end of our part of the source grid */
if (!bCommX)
{
- tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
+ /* Use our thread local source grid and target grid part */
+ tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
}
else
{
+ /* Use our thread local source grid and the spreading range */
tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
}
}
pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
oy += pmegrid_g->offset[YY];
+ /* Determine the end of our part of the source grid */
if (!bCommY)
{
- ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
+ /* Use our thread local source grid and target grid part */
+ ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
}
else
{
+ /* Use our thread local source grid and the spreading range */
ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
}
}
pmegrid_g = &pmegrids->grid_th[fz];
oz += pmegrid_g->offset[ZZ];
- tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
+ tz1 = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
if (sx == 0 && sy == 0 && sz == 0)
{
if (bCommY)
{
commbuf = commbuf_y;
- buf_my = ty1 - offy;
+ /* The y-size of the communication buffer is set by
+ * the overlap of the grid part of our local slab
+ * with the part starting at the next slab.
+ */
+ buf_my =
+ pme->overlap[1].s2g1[pme->nodeid_minor] -
+ pme->overlap[1].s2g0[pme->nodeid_minor+1];
if (bCommX)
{
/* We index commbuf modulo the local grid size */
}
-static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
+static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
pme_overlap_t *overlap;
int x, y, z, indg, indb;
/* Note that this routine is only used for forward communication.
- * Since the force gathering, unlike the charge spreading,
+ * Since the force gathering, unlike the coefficient spreading,
* can be trivially parallelized over the particles,
* the backwards process is much simpler and can use the "old"
* communication setup.
*/
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
recvptr = overlap->recvbuf;
+ if (debug != NULL)
+ {
+ fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
+ local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
+ }
+
#ifdef GMX_MPI
MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
send_id, ipulse,
if (debug != NULL)
{
- fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+ fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
}
static void spread_on_grid(gmx_pme_t pme,
pme_atomcomm_t *atc, pmegrids_t *grids,
gmx_bool bCalcSplines, gmx_bool bSpread,
- real *fftgrid)
+ real *fftgrid, gmx_bool bDoSplines, int grid_index)
{
int nthread, thread;
#ifdef PME_TIME_THREADS
/* Compute fftgrid index for all atoms,
* with help of some extra variables.
*/
- calc_interpolation_idx(pme, atc, start, end, thread);
+ calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
}
}
#ifdef PME_TIME_THREADS
for (thread = 0; thread < nthread; thread++)
{
splinedata_t *spline;
- pmegrid_t *grid;
+ pmegrid_t *grid = NULL;
/* make local bsplines */
- if (grids == NULL || grids->nthread == 1)
+ if (grids == NULL || !pme->bUseThreads)
{
spline = &atc->spline[0];
spline->n = atc->n;
- grid = &grids->grid;
+ if (bSpread)
+ {
+ grid = &grids->grid;
+ }
}
else
{
spline = &atc->spline[thread];
- make_thread_local_ind(atc, thread, spline);
+ if (grids->nthread == 1)
+ {
+ /* One thread, we operate on all coefficients */
+ spline->n = atc->n;
+ }
+ else
+ {
+ /* Get the indices our thread should operate on */
+ make_thread_local_ind(atc, thread, spline);
+ }
grid = &grids->grid_th[thread];
}
if (bCalcSplines)
{
make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
- atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
+ atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
}
if (bSpread)
#ifdef PME_TIME_SPREAD
ct1a = omp_cyc_start();
#endif
- spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
+ spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
- if (grids->nthread > 1)
+ if (pme->bUseThreads)
{
- copy_local_grid(pme, grids, thread, fftgrid);
+ copy_local_grid(pme, grids, grid_index, thread, fftgrid);
}
#ifdef PME_TIME_SPREAD
ct1a = omp_cyc_end(ct1a);
cs2 += (double)c2;
#endif
- if (bSpread && grids->nthread > 1)
+ if (bSpread && pme->bUseThreads)
{
#ifdef PME_TIME_THREADS
c3 = omp_cyc_start();
reduce_threadgrid_overlap(pme, grids, thread,
fftgrid,
pme->overlap[0].sendbuf,
- pme->overlap[1].sendbuf);
+ pme->overlap[1].sendbuf,
+ grid_index);
}
#ifdef PME_TIME_THREADS
c3 = omp_cyc_end(c3);
if (pme->nnodes > 1)
{
- /* Communicate the overlapping part of the fftgrid */
- sum_fftgrid_dd(pme, fftgrid);
+ /* Communicate the overlapping part of the fftgrid.
+ * For this communication call we need to check pme->bUseThreads
+ * to have all ranks communicate here, regardless of pme->nthread.
+ */
+ sum_fftgrid_dd(pme, fftgrid, grid_index);
}
}
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
local_fft_ndata,
local_fft_offset,
local_fft_size);
{
gmx_incons("gmx_pme_calc_energy called in parallel");
}
- if (pme->bFEP > 1)
+ if (pme->bFEP_q > 1)
{
gmx_incons("gmx_pme_calc_energy with free energy");
}
atc->pme_order = pme->pme_order;
atc->n = n;
pme_realloc_atomcomm_things(atc);
- atc->x = x;
- atc->q = q;
+ atc->x = x;
+ atc->coefficient = q;
/* We only use the A-charges grid */
- grid = &pme->pmegridA;
+ grid = &pme->pmegrid[PME_GRID_QA];
- spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
+ /* Only calculate the spline coefficients, don't actually spread */
+ spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
*V = gather_energy_bsplines(pme, grid->grid.grid, atc);
}
-static void reset_pmeonly_counters(t_commrec *cr, gmx_wallcycle_t wcycle,
+static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
+ gmx_walltime_accounting_t walltime_accounting,
t_nrnb *nrnb, t_inputrec *ir,
- gmx_large_int_t step)
+ gmx_int64_t step)
{
/* Reset all the counters related to performance over the run */
wallcycle_stop(wcycle, ewcRUN);
}
ir->init_step = step;
wallcycle_start(wcycle, ewcRUN);
+ walltime_accounting_start(walltime_accounting);
}
*pme_ret = (*pmedata)[ind];
}
-
int gmx_pmeonly(gmx_pme_t pme,
- t_commrec *cr, t_nrnb *nrnb,
+ t_commrec *cr, t_nrnb *mynrnb,
gmx_wallcycle_t wcycle,
- real ewaldcoeff, gmx_bool bGatherOnly,
+ gmx_walltime_accounting_t walltime_accounting,
+ real ewaldcoeff_q, real ewaldcoeff_lj,
t_inputrec *ir)
{
int npmedata;
matrix box;
rvec *x_pp = NULL, *f_pp = NULL;
real *chargeA = NULL, *chargeB = NULL;
- real lambda = 0;
+ real *c6A = NULL, *c6B = NULL;
+ real *sigmaA = NULL, *sigmaB = NULL;
+ real lambda_q = 0;
+ real lambda_lj = 0;
int maxshift_x = 0, maxshift_y = 0;
- real energy, dvdlambda;
- matrix vir;
+ real energy_q, energy_lj, dvdlambda_q, dvdlambda_lj;
+ matrix vir_q, vir_lj;
float cycles;
int count;
gmx_bool bEnerVir;
- gmx_large_int_t step, step_rel;
+ int pme_flags;
+ gmx_int64_t step, step_rel;
ivec grid_switch;
/* This data will only use with PME tuning, i.e. switching PME grids */
pme_pp = gmx_pme_pp_init(cr);
- init_nrnb(nrnb);
+ init_nrnb(mynrnb);
count = 0;
do /****** this is a quasi-loop over time steps! */
do
{
/* Domain decomposition */
- ret = gmx_pme_recv_q_x(pme_pp,
- &natoms,
- &chargeA, &chargeB, box, &x_pp, &f_pp,
- &maxshift_x, &maxshift_y,
- &pme->bFEP, &lambda,
- &bEnerVir,
- &step,
- grid_switch, &ewaldcoeff);
+ ret = gmx_pme_recv_coeffs_coords(pme_pp,
+ &natoms,
+ &chargeA, &chargeB,
+ &c6A, &c6B,
+ &sigmaA, &sigmaB,
+ box, &x_pp, &f_pp,
+ &maxshift_x, &maxshift_y,
+ &pme->bFEP_q, &pme->bFEP_lj,
+ &lambda_q, &lambda_lj,
+ &bEnerVir,
+ &pme_flags,
+ &step,
+ grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj);
if (ret == pmerecvqxSWITCHGRID)
{
if (ret == pmerecvqxRESETCOUNTERS)
{
/* Reset the cycle and flop counters */
- reset_pmeonly_counters(cr, wcycle, nrnb, ir, step);
+ reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step);
}
}
while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
if (count == 0)
{
wallcycle_start(wcycle, ewcRUN);
+ walltime_accounting_start(walltime_accounting);
}
wallcycle_start(wcycle, ewcPMEMESH);
- dvdlambda = 0;
- clear_mat(vir);
- gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
- cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
- &energy, lambda, &dvdlambda,
- GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
+ dvdlambda_q = 0;
+ dvdlambda_lj = 0;
+ clear_mat(vir_q);
+ clear_mat(vir_lj);
+
+ gmx_pme_do(pme, 0, natoms, x_pp, f_pp,
+ chargeA, chargeB, c6A, c6B, sigmaA, sigmaB, box,
+ cr, maxshift_x, maxshift_y, mynrnb, wcycle,
+ vir_q, ewaldcoeff_q, vir_lj, ewaldcoeff_lj,
+ &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
+ pme_flags | GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
cycles = wallcycle_stop(wcycle, ewcPMEMESH);
gmx_pme_send_force_vir_ener(pme_pp,
- f_pp, vir, energy, dvdlambda,
- cycles);
+ f_pp, vir_q, energy_q, vir_lj, energy_lj,
+ dvdlambda_q, dvdlambda_lj, cycles);
count++;
} /***** end of quasi-loop, we stop with the break above */
while (TRUE);
+ walltime_accounting_end(walltime_accounting);
+
return 0;
}
+static void
+calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma)
+{
+ int i;
+
+ for (i = 0; i < pme->atc[0].n; ++i)
+ {
+ real sigma4;
+
+ sigma4 = local_sigma[i];
+ sigma4 = sigma4*sigma4;
+ sigma4 = sigma4*sigma4;
+ pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
+ }
+}
+
+static void
+calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma)
+{
+ int i;
+
+ for (i = 0; i < pme->atc[0].n; ++i)
+ {
+ pme->atc[0].coefficient[i] *= local_sigma[i];
+ }
+}
+
+static void
+do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
+ gmx_bool bFirst, rvec x[], real *data)
+{
+ int d;
+ pme_atomcomm_t *atc;
+ atc = &pme->atc[0];
+
+ for (d = pme->ndecompdim - 1; d >= 0; d--)
+ {
+ int n_d;
+ rvec *x_d;
+ real *param_d;
+
+ if (d == pme->ndecompdim - 1)
+ {
+ n_d = homenr;
+ x_d = x + start;
+ param_d = data;
+ }
+ else
+ {
+ n_d = pme->atc[d + 1].n;
+ x_d = atc->x;
+ param_d = atc->coefficient;
+ }
+ atc = &pme->atc[d];
+ atc->npd = n_d;
+ if (atc->npd > atc->pd_nalloc)
+ {
+ atc->pd_nalloc = over_alloc_dd(atc->npd);
+ srenew(atc->pd, atc->pd_nalloc);
+ }
+ pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
+ where();
+ /* Redistribute x (only once) and qA/c6A or qB/c6B */
+ if (DOMAINDECOMP(cr))
+ {
+ dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
+ }
+ }
+}
+
int gmx_pme_do(gmx_pme_t pme,
int start, int homenr,
rvec x[], rvec f[],
real *chargeA, real *chargeB,
+ real *c6A, real *c6B,
+ real *sigmaA, real *sigmaB,
matrix box, t_commrec *cr,
int maxshift_x, int maxshift_y,
t_nrnb *nrnb, gmx_wallcycle_t wcycle,
- matrix vir, real ewaldcoeff,
- real *energy, real lambda,
- real *dvdlambda, int flags)
+ matrix vir_q, real ewaldcoeff_q,
+ matrix vir_lj, real ewaldcoeff_lj,
+ real *energy_q, real *energy_lj,
+ real lambda_q, real lambda_lj,
+ real *dvdlambda_q, real *dvdlambda_lj,
+ int flags)
{
- int q, d, i, j, ntot, npme;
+ int d, i, j, k, ntot, npme, grid_index, max_grid_index;
int nx, ny, nz;
int n_d, local_ny;
pme_atomcomm_t *atc = NULL;
real *grid = NULL;
real *ptr;
rvec *x_d, *f_d;
- real *charge = NULL, *q_d;
- real energy_AB[2];
- matrix vir_AB[2];
+ real *coefficient = NULL;
+ real energy_AB[4];
+ matrix vir_AB[4];
+ real scale, lambda;
gmx_bool bClearF;
gmx_parallel_3dfft_t pfft_setup;
real * fftgrid;
t_complex * cfftgrid;
int thread;
+ gmx_bool bFirst, bDoSplines;
+ int fep_state;
+ int fep_states_lj = pme->bFEP_lj ? 2 : 1;
const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
atc->pd_nalloc = over_alloc_dd(atc->npd);
srenew(atc->pd, atc->pd_nalloc);
}
- atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+ for (d = pme->ndecompdim-1; d >= 0; d--)
+ {
+ atc = &pme->atc[d];
+ atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+ }
}
else
{
+ atc = &pme->atc[0];
/* This could be necessary for TPI */
pme->atc[0].n = homenr;
+ if (DOMAINDECOMP(cr))
+ {
+ pme_realloc_atomcomm_things(atc);
+ }
+ atc->x = x;
+ atc->f = f;
}
- for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
+ m_inv_ur0(box, pme->recipbox);
+ bFirst = TRUE;
+
+ /* For simplicity, we construct the splines for all particles if
+ * more than one PME calculations is needed. Some optimization
+ * could be done by keeping track of which atoms have splines
+ * constructed, and construct new splines on each pass for atoms
+ * that don't yet have them.
+ */
+
+ bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
+
+ /* We need a maximum of four separate PME calculations:
+ * grid_index=0: Coulomb PME with charges from state A
+ * grid_index=1: Coulomb PME with charges from state B
+ * grid_index=2: LJ PME with C6 from state A
+ * grid_index=3: LJ PME with C6 from state B
+ * For Lorentz-Berthelot combination rules, a separate loop is used to
+ * calculate all the terms
+ */
+
+ /* If we are doing LJ-PME with LB, we only do Q here */
+ max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
+
+ for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
{
- if (q == 0)
+ /* Check if we should do calculations at this grid_index
+ * If grid_index is odd we should be doing FEP
+ * If grid_index < 2 we should be doing electrostatic PME
+ * If grid_index >= 2 we should be doing LJ-PME
+ */
+ if ((grid_index < DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
+ (grid_index == 1 && !pme->bFEP_q))) ||
+ (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
+ (grid_index == 3 && !pme->bFEP_lj))))
{
- pmegrid = &pme->pmegridA;
- fftgrid = pme->fftgridA;
- cfftgrid = pme->cfftgridA;
- pfft_setup = pme->pfft_setupA;
- charge = chargeA+start;
+ continue;
}
- else
+ /* Unpack structure */
+ pmegrid = &pme->pmegrid[grid_index];
+ fftgrid = pme->fftgrid[grid_index];
+ cfftgrid = pme->cfftgrid[grid_index];
+ pfft_setup = pme->pfft_setup[grid_index];
+ switch (grid_index)
{
- pmegrid = &pme->pmegridB;
- fftgrid = pme->fftgridB;
- cfftgrid = pme->cfftgridB;
- pfft_setup = pme->pfft_setupB;
- charge = chargeB+start;
+ case 0: coefficient = chargeA + start; break;
+ case 1: coefficient = chargeB + start; break;
+ case 2: coefficient = c6A + start; break;
+ case 3: coefficient = c6B + start; break;
}
+
grid = pmegrid->grid.grid;
- /* Unpack structure */
+
if (debug)
{
- fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
+ fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
cr->nnodes, cr->nodeid);
fprintf(debug, "Grid = %p\n", (void*)grid);
if (grid == NULL)
}
where();
- m_inv_ur0(box, pme->recipbox);
-
if (pme->nnodes == 1)
{
- atc = &pme->atc[0];
- if (DOMAINDECOMP(cr))
- {
- atc->n = homenr;
- pme_realloc_atomcomm_things(atc);
- }
- atc->x = x;
- atc->q = charge;
- atc->f = f;
+ atc->coefficient = coefficient;
}
else
{
wallcycle_start(wcycle, ewcPME_REDISTXF);
- for (d = pme->ndecompdim-1; d >= 0; d--)
- {
- if (d == pme->ndecompdim-1)
- {
- n_d = homenr;
- x_d = x + start;
- q_d = charge;
- }
- else
- {
- n_d = pme->atc[d+1].n;
- x_d = atc->x;
- q_d = atc->q;
- }
- atc = &pme->atc[d];
- atc->npd = n_d;
- if (atc->npd > atc->pd_nalloc)
- {
- atc->pd_nalloc = over_alloc_dd(atc->npd);
- srenew(atc->pd, atc->pd_nalloc);
- }
- atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
- pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
- where();
-
- /* Redistribute x (only once) and qA or qB */
- if (DOMAINDECOMP(cr))
- {
- dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
- }
- else
- {
- pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
- }
- }
+ do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
where();
wallcycle_stop(wcycle, ewcPME_REDISTXF);
if (debug)
{
- fprintf(debug, "Node= %6d, pme local particles=%6d\n",
+ fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
cr->nodeid, atc->n);
}
- if (flags & GMX_PME_SPREAD_Q)
+ if (flags & GMX_PME_SPREAD)
{
wallcycle_start(wcycle, ewcPME_SPREADGATHER);
- /* Spread the charges on a grid */
- spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
+ /* Spread the coefficients on a grid */
+ spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
- if (q == 0)
+ if (bFirst)
{
inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
}
- inc_nrnb(nrnb, eNR_SPREADQBSP,
+ inc_nrnb(nrnb, eNR_SPREADBSP,
pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
- if (pme->nthread == 1)
+ if (!pme->bUseThreads)
{
wrap_periodic_pmegrid(pme, grid);
#ifdef GMX_MPI
if (pme->nnodes > 1)
{
- gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
where();
}
#endif
- copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
+ copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
}
wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
wallcycle_start(wcycle, ewcPME_FFT);
}
gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
- fftgrid, cfftgrid, thread, wcycle);
+ thread, wcycle);
if (thread == 0)
{
wallcycle_stop(wcycle, ewcPME_FFT);
/* solve in k-space for our local cells */
if (thread == 0)
{
- wallcycle_start(wcycle, ewcPME_SOLVE);
+ wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+ }
+ if (grid_index < DO_Q)
+ {
+ loop_count =
+ solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
+ box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+ bCalcEnerVir,
+ pme->nthread, thread);
}
- loop_count =
- solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
- box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
- bCalcEnerVir,
- pme->nthread, thread);
+ else
+ {
+ loop_count =
+ solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
+ box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+ bCalcEnerVir,
+ pme->nthread, thread);
+ }
+
if (thread == 0)
{
- wallcycle_stop(wcycle, ewcPME_SOLVE);
+ wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
where();
inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
}
wallcycle_start(wcycle, ewcPME_FFT);
}
gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
- cfftgrid, fftgrid, thread, wcycle);
+ thread, wcycle);
if (thread == 0)
{
wallcycle_stop(wcycle, ewcPME_FFT);
inc_nrnb(nrnb, eNR_FFT, 2*npme);
}
+ /* Note: this wallcycle region is closed below
+ outside an OpenMP region, so take care if
+ refactoring code here. */
wallcycle_start(wcycle, ewcPME_SPREADGATHER);
}
- copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
+ copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
}
}
/* End of thread parallel section.
#ifdef GMX_MPI
if (pme->nnodes > 1)
{
- gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
}
#endif
where();
* atc->f is the actual force array, not a buffer,
* therefore we should not clear it.
*/
- bClearF = (q == 0 && PAR(cr));
+ lambda = grid_index < DO_Q ? lambda_q : lambda_lj;
+ bClearF = (bFirst && PAR(cr));
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
for (thread = 0; thread < pme->nthread; thread++)
{
gather_f_bsplines(pme, grid, bClearF, atc,
&atc->spline[thread],
- pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
+ pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
}
where();
inc_nrnb(nrnb, eNR_GATHERFBSP,
pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+ /* Note: this wallcycle region is opened above inside an OpenMP
+ region, so take care if refactoring code here. */
wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
}
/* This should only be called on the master thread
* and after the threads have synchronized.
*/
- get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
+ if (grid_index < 2)
+ {
+ get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
+ }
+ else
+ {
+ get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
+ }
}
- } /* of q-loop */
+ bFirst = FALSE;
+ } /* of grid_index-loop */
+
+ /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
+ * seven terms. */
+
+ if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
+ {
+ /* Loop over A- and B-state if we are doing FEP */
+ for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
+ {
+ real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
+ if (pme->nnodes == 1)
+ {
+ if (pme->lb_buf1 == NULL)
+ {
+ pme->lb_buf_nalloc = pme->atc[0].n;
+ snew(pme->lb_buf1, pme->lb_buf_nalloc);
+ }
+ pme->atc[0].coefficient = pme->lb_buf1;
+ switch (fep_state)
+ {
+ case 0:
+ local_c6 = c6A;
+ local_sigma = sigmaA;
+ break;
+ case 1:
+ local_c6 = c6B;
+ local_sigma = sigmaB;
+ break;
+ default:
+ gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+ }
+ }
+ else
+ {
+ atc = &pme->atc[0];
+ switch (fep_state)
+ {
+ case 0:
+ RedistC6 = c6A;
+ RedistSigma = sigmaA;
+ break;
+ case 1:
+ RedistC6 = c6B;
+ RedistSigma = sigmaB;
+ break;
+ default:
+ gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+ }
+ wallcycle_start(wcycle, ewcPME_REDISTXF);
+
+ do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
+ if (pme->lb_buf_nalloc < atc->n)
+ {
+ pme->lb_buf_nalloc = atc->nalloc;
+ srenew(pme->lb_buf1, pme->lb_buf_nalloc);
+ srenew(pme->lb_buf2, pme->lb_buf_nalloc);
+ }
+ local_c6 = pme->lb_buf1;
+ for (i = 0; i < atc->n; ++i)
+ {
+ local_c6[i] = atc->coefficient[i];
+ }
+ where();
+
+ do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
+ local_sigma = pme->lb_buf2;
+ for (i = 0; i < atc->n; ++i)
+ {
+ local_sigma[i] = atc->coefficient[i];
+ }
+ where();
+
+ wallcycle_stop(wcycle, ewcPME_REDISTXF);
+ }
+ calc_initial_lb_coeffs(pme, local_c6, local_sigma);
+
+ /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
+ for (grid_index = 2; grid_index < 9; ++grid_index)
+ {
+ /* Unpack structure */
+ pmegrid = &pme->pmegrid[grid_index];
+ fftgrid = pme->fftgrid[grid_index];
+ cfftgrid = pme->cfftgrid[grid_index];
+ pfft_setup = pme->pfft_setup[grid_index];
+ calc_next_lb_coeffs(pme, local_sigma);
+ grid = pmegrid->grid.grid;
+ where();
+
+ if (flags & GMX_PME_SPREAD)
+ {
+ wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+ /* Spread the c6 on a grid */
+ spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
+
+ if (bFirst)
+ {
+ inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
+ }
+
+ inc_nrnb(nrnb, eNR_SPREADBSP,
+ pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
+ if (pme->nthread == 1)
+ {
+ wrap_periodic_pmegrid(pme, grid);
+ /* sum contributions to local grid from other nodes */
+#ifdef GMX_MPI
+ if (pme->nnodes > 1)
+ {
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
+ where();
+ }
+#endif
+ copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
+ }
+ wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+ }
+ /*Here we start a large thread parallel region*/
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+ {
+ thread = gmx_omp_get_thread_num();
+ if (flags & GMX_PME_SOLVE)
+ {
+ /* do 3d-fft */
+ if (thread == 0)
+ {
+ wallcycle_start(wcycle, ewcPME_FFT);
+ }
+
+ gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
+ thread, wcycle);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle, ewcPME_FFT);
+ }
+ where();
+ }
+ }
+ bFirst = FALSE;
+ }
+ if (flags & GMX_PME_SOLVE)
+ {
+ /* solve in k-space for our local cells */
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+ {
+ int loop_count;
+ thread = gmx_omp_get_thread_num();
+ if (thread == 0)
+ {
+ wallcycle_start(wcycle, ewcLJPME);
+ }
+
+ loop_count =
+ solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
+ box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+ bCalcEnerVir,
+ pme->nthread, thread);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle, ewcLJPME);
+ where();
+ inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+ }
+ }
+ }
+
+ if (bCalcEnerVir)
+ {
+ /* This should only be called on the master thread and
+ * after the threads have synchronized.
+ */
+ get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
+ }
+
+ if (bCalcF)
+ {
+ bFirst = !(flags & GMX_PME_DO_COULOMB);
+ calc_initial_lb_coeffs(pme, local_c6, local_sigma);
+ for (grid_index = 8; grid_index >= 2; --grid_index)
+ {
+ /* Unpack structure */
+ pmegrid = &pme->pmegrid[grid_index];
+ fftgrid = pme->fftgrid[grid_index];
+ cfftgrid = pme->cfftgrid[grid_index];
+ pfft_setup = pme->pfft_setup[grid_index];
+ grid = pmegrid->grid.grid;
+ calc_next_lb_coeffs(pme, local_sigma);
+ where();
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+ {
+ thread = gmx_omp_get_thread_num();
+ /* do 3d-invfft */
+ if (thread == 0)
+ {
+ where();
+ wallcycle_start(wcycle, ewcPME_FFT);
+ }
+
+ gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
+ thread, wcycle);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle, ewcPME_FFT);
+
+ where();
+
+ if (pme->nodeid == 0)
+ {
+ ntot = pme->nkx*pme->nky*pme->nkz;
+ npme = ntot*log((real)ntot)/log(2.0);
+ inc_nrnb(nrnb, eNR_FFT, 2*npme);
+ }
+ wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+ }
+
+ copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
+
+ } /*#pragma omp parallel*/
+
+ /* distribute local grid to all nodes */
+#ifdef GMX_MPI
+ if (pme->nnodes > 1)
+ {
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
+ }
+#endif
+ where();
+
+ unwrap_periodic_pmegrid(pme, grid);
+
+ /* interpolate forces for our local atoms */
+ where();
+ bClearF = (bFirst && PAR(cr));
+ scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
+ scale *= lb_scale_factor[grid_index-2];
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ for (thread = 0; thread < pme->nthread; thread++)
+ {
+ gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
+ &pme->atc[0].spline[thread],
+ scale);
+ }
+ where();
+
+ inc_nrnb(nrnb, eNR_GATHERFBSP,
+ pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+ wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+
+ bFirst = FALSE;
+ } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
+ } /* if (bCalcF) */
+ } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
+ } /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
if (bCalcF && pme->nnodes > 1)
{
dd_pmeredist_f(pme, atc, n_d, f_d,
d == pme->ndecompdim-1 && pme->bPPnode);
}
- else
- {
- pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
- }
}
wallcycle_stop(wcycle, ewcPME_REDISTXF);
if (bCalcEnerVir)
{
- if (!pme->bFEP)
+ if (flags & GMX_PME_DO_COULOMB)
{
- *energy = energy_AB[0];
- m_add(vir, vir_AB[0], vir);
+ if (!pme->bFEP_q)
+ {
+ *energy_q = energy_AB[0];
+ m_add(vir_q, vir_AB[0], vir_q);
+ }
+ else
+ {
+ *energy_q = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
+ *dvdlambda_q += energy_AB[1] - energy_AB[0];
+ for (i = 0; i < DIM; i++)
+ {
+ for (j = 0; j < DIM; j++)
+ {
+ vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
+ lambda_q*vir_AB[1][i][j];
+ }
+ }
+ }
+ if (debug)
+ {
+ fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
+ }
}
else
{
- *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
- *dvdlambda += energy_AB[1] - energy_AB[0];
- for (i = 0; i < DIM; i++)
+ *energy_q = 0;
+ }
+
+ if (flags & GMX_PME_DO_LJ)
+ {
+ if (!pme->bFEP_lj)
+ {
+ *energy_lj = energy_AB[2];
+ m_add(vir_lj, vir_AB[2], vir_lj);
+ }
+ else
{
- for (j = 0; j < DIM; j++)
+ *energy_lj = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
+ *dvdlambda_lj += energy_AB[3] - energy_AB[2];
+ for (i = 0; i < DIM; i++)
{
- vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
- lambda*vir_AB[1][i][j];
+ for (j = 0; j < DIM; j++)
+ {
+ vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
+ }
}
}
+ if (debug)
+ {
+ fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
+ }
+ }
+ else
+ {
+ *energy_lj = 0;
}
}
- else
- {
- *energy = 0;
- }
-
- if (debug)
- {
- fprintf(debug, "PME mesh energy: %g\n", *energy);
- }
-
return 0;
}