-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
*
- *
- * This source code is part of
- *
- * G R O M A C S
- *
- * GROningen MAchine for Chemical Simulations
- *
- * VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
*
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
*
- * And Hey:
- * GROwing Monsters And Cloning Shrimps
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
*/
/* IMPORTANT FOR DEVELOPERS:
*
* /Erik 001109
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
+
+#include "gromacs/legacyheaders/pme.h"
+#include "config.h"
+
+#include <assert.h>
+#include <math.h>
#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
-#include <math.h>
-#include <assert.h>
-#include "typedefs.h"
-#include "txtdump.h"
-#include "vec.h"
-#include "gmxcomplex.h"
-#include "smalloc.h"
-#include "coulomb.h"
-#include "gmx_fatal.h"
-#include "pme.h"
-#include "network.h"
-#include "physics.h"
-#include "nrnb.h"
-#include "macros.h"
#include "gromacs/fft/parallel_3dfft.h"
-#include "gromacs/fileio/futil.h"
#include "gromacs/fileio/pdbio.h"
+#include "gromacs/legacyheaders/coulomb.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/math/gmxcomplex.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/futil.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/gmxomp.h"
+#include "gromacs/utility/smalloc.h"
/* Include the SIMD macro file and then check for support */
-#include "gromacs/simd/macros.h"
-#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#ifdef GMX_SIMD_HAVE_REAL
/* Turn on arbitrary width SIMD intrinsics for PME solve */
-#define PME_SIMD
+# define PME_SIMD_SOLVE
#endif
-/* Include the 4-wide SIMD macro file */
-#include "gromacs/simd/four_wide_macros.h"
+#define PME_GRID_QA 0 /* Grid index for A-state for Q */
+#define PME_GRID_C6A 2 /* Grid index for A-state for LJ */
+#define DO_Q 2 /* Electrostatic grids have index q<2 */
+#define DO_Q_AND_LJ 4 /* non-LB LJ grids have index 2 <= q < 4 */
+#define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
+
+/* Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
+const real lb_scale_factor[] = {
+ 1.0/64, 6.0/64, 15.0/64, 20.0/64,
+ 15.0/64, 6.0/64, 1.0/64
+};
+
+/* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
+const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
+
/* Check if we have 4-wide SIMD macro support */
-#ifdef GMX_HAVE_SIMD4_MACROS
+#if (defined GMX_SIMD4_HAVE_REAL)
/* Do PME spread and gather with 4-wide SIMD.
* NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
*/
-#define PME_SIMD4_SPREAD_GATHER
+# define PME_SIMD4_SPREAD_GATHER
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
+# if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
/* With PME-order=4 on x86, unaligned load+store is slightly faster
* than doubling all SIMD operations when using aligned load+store.
*/
-#define PME_SIMD4_UNALIGNED
-#endif
+# define PME_SIMD4_UNALIGNED
+# endif
#endif
#define DFT_TOL 1e-7
#endif
#ifdef PME_SIMD4_SPREAD_GATHER
-#define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
+# define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
#else
/* We can use any alignment, apart from 0, so we use 4 reals */
-#define SIMD4_ALIGNMENT (4*sizeof(real))
+# define SIMD4_ALIGNMENT (4*sizeof(real))
#endif
/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
int n;
int nalloc;
rvec *x;
- real *q;
+ real *coefficient;
rvec *f;
gmx_bool bSpread; /* These coordinates are used for spreading */
int pme_order;
ivec *idx;
- rvec *fractx; /* Fractional coordinate relative to the
- * lower cell boundary
+ rvec *fractx; /* Fractional coordinate relative to
+ * the lower cell boundary
*/
int nthread;
- int *thread_idx; /* Which thread should spread which charge */
+ int *thread_idx; /* Which thread should spread which coefficient */
thread_plist_t *thread_plist;
splinedata_t *spline;
} pme_atomcomm_t;
ivec nthread_comm; /* The number of threads to communicate with */
} pmegrids_t;
-
typedef struct {
#ifdef PME_SIMD4_SPREAD_GATHER
/* Masks for 4-wide SIMD aligned spreading and gathering */
- gmx_simd4_pb mask_S0[6], mask_S1[6];
+ gmx_simd4_bool_t mask_S0[6], mask_S1[6];
#else
- int dummy; /* C89 requires that struct has at least one member */
+ int dummy; /* C89 requires that struct has at least one member */
#endif
} pme_spline_work_t;
real * denom;
real * tmp1_alloc;
real * tmp1;
+ real * tmp2;
real * eterm;
real * m2inv;
- real energy;
- matrix vir;
+ real energy_q;
+ matrix vir_q;
+ real energy_lj;
+ matrix vir_lj;
} pme_work_t;
typedef struct gmx_pme {
gmx_bool bPPnode; /* Node also does particle-particle forces */
gmx_bool bFEP; /* Compute Free energy contribution */
+ gmx_bool bFEP_q;
+ gmx_bool bFEP_lj;
int nkx, nky, nkz; /* Grid dimensions */
gmx_bool bP3M; /* Do P3M: optimize the influence function */
int pme_order;
real epsilon_r;
- pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
- pmegrids_t pmegridB;
- /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
+ int ljpme_combination_rule; /* Type of combination rule in LJ-PME */
+
+ int ngrids; /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/
+
+ pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
+ * includes overlap. Grid indices are ordered as
+ * follows:
+ * 0: Coulomb PME, state A
+ * 1: Coulomb PME, state B
+ * 2-8: LJ-PME
+ * This can probably be done in a better way
+ * but this simple hack works for now
+ */
+ /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
int pmegrid_nx, pmegrid_ny, pmegrid_nz;
/* pmegrid_nz might be larger than strictly necessary to ensure
* memory alignment, pmegrid_nz_base gives the real base size.
int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
/* Work data for spreading and gathering */
- pme_spline_work_t *spline_work;
+ pme_spline_work_t *spline_work;
+
+ real **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
+ /* inside the interpolation grid, but separate for 2D PME decomp. */
+ int fftgrid_nx, fftgrid_ny, fftgrid_nz;
- real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
- real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
- int fftgrid_nx, fftgrid_ny, fftgrid_nz;
+ t_complex **cfftgrid; /* Grids for complex FFT data */
- t_complex *cfftgridA; /* Grids for complex FFT data */
- t_complex *cfftgridB;
- int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
+ int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
- gmx_parallel_3dfft_t pfft_setupA;
- gmx_parallel_3dfft_t pfft_setupB;
+ gmx_parallel_3dfft_t *pfft_setup;
- int *nnx, *nny, *nnz;
- real *fshx, *fshy, *fshz;
+ int *nnx, *nny, *nnz;
+ real *fshx, *fshy, *fshz;
- pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
- matrix recipbox;
- splinevec bsp_mod;
+ pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
+ matrix recipbox;
+ splinevec bsp_mod;
+ /* Buffers to store data for local atoms for L-B combination rule
+ * calculations in LJ-PME. lb_buf1 stores either the coefficients
+ * for spreading/gathering (in serial), or the C6 coefficient for
+ * local atoms (in parallel). lb_buf2 is only used in parallel,
+ * and stores the sigma values for local atoms. */
+ real *lb_buf1, *lb_buf2;
+ int lb_buf_nalloc; /* Allocation size for the above buffers. */
- pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
+ pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
- pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
+ pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
- rvec *bufv; /* Communication buffer */
- real *bufr; /* Communication buffer */
- int buf_nalloc; /* The communication buffer size */
+ rvec *bufv; /* Communication buffer */
+ real *bufr; /* Communication buffer */
+ int buf_nalloc; /* The communication buffer size */
/* thread local work data for solve_pme */
pme_work_t *work;
- /* Work data for PME_redist */
- gmx_bool redist_init;
- int * scounts;
- int * rcounts;
- int * sdispls;
- int * rdispls;
- int * sidx;
- int * idxa;
- real * redist_buf;
- int redist_buf_nalloc;
-
/* Work data for sum_qgrid */
real * sum_qgrid_tmp;
real * sum_qgrid_dd_tmp;
} t_gmx_pme;
-
static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
- int start, int end, int thread)
+ int start, int grid_index, int end, int thread)
{
int i;
int *idxptr, tix, tiy, tiz;
rzy = pme->recipbox[ZZ][YY];
rzz = pme->recipbox[ZZ][ZZ];
- g2tx = pme->pmegridA.g2t[XX];
- g2ty = pme->pmegridA.g2t[YY];
- g2tz = pme->pmegridA.g2t[ZZ];
+ g2tx = pme->pmegrid[grid_index].g2t[XX];
+ g2ty = pme->pmegrid[grid_index].g2t[YY];
+ g2tz = pme->pmegrid[grid_index].g2t[ZZ];
bThreads = (atc->nthread > 1);
if (bThreads)
if (atc->nslab > 1)
{
srenew(atc->x, atc->nalloc);
- srenew(atc->q, atc->nalloc);
+ srenew(atc->coefficient, atc->nalloc);
srenew(atc->f, atc->nalloc);
for (i = nalloc_old; i < atc->nalloc; i++)
{
}
}
-static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
- int n, gmx_bool bXF, rvec *x_f, real *charge,
- pme_atomcomm_t *atc)
-/* Redistribute particle data for PME calculation */
-/* domain decomposition by x coordinate */
-{
- int *idxa;
- int i, ii;
-
- if (FALSE == pme->redist_init)
- {
- snew(pme->scounts, atc->nslab);
- snew(pme->rcounts, atc->nslab);
- snew(pme->sdispls, atc->nslab);
- snew(pme->rdispls, atc->nslab);
- snew(pme->sidx, atc->nslab);
- pme->redist_init = TRUE;
- }
- if (n > pme->redist_buf_nalloc)
- {
- pme->redist_buf_nalloc = over_alloc_dd(n);
- srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
- }
-
- pme->idxa = atc->pd;
-
-#ifdef GMX_MPI
- if (forw && bXF)
- {
- /* forward, redistribution from pp to pme */
-
- /* Calculate send counts and exchange them with other nodes */
- for (i = 0; (i < atc->nslab); i++)
- {
- pme->scounts[i] = 0;
- }
- for (i = 0; (i < n); i++)
- {
- pme->scounts[pme->idxa[i]]++;
- }
- MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
-
- /* Calculate send and receive displacements and index into send
- buffer */
- pme->sdispls[0] = 0;
- pme->rdispls[0] = 0;
- pme->sidx[0] = 0;
- for (i = 1; i < atc->nslab; i++)
- {
- pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
- pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
- pme->sidx[i] = pme->sdispls[i];
- }
- /* Total # of particles to be received */
- atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
-
- pme_realloc_atomcomm_things(atc);
-
- /* Copy particle coordinates into send buffer and exchange*/
- for (i = 0; (i < n); i++)
- {
- ii = DIM*pme->sidx[pme->idxa[i]];
- pme->sidx[pme->idxa[i]]++;
- pme->redist_buf[ii+XX] = x_f[i][XX];
- pme->redist_buf[ii+YY] = x_f[i][YY];
- pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
- }
- MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
- pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
- pme->rvec_mpi, atc->mpi_comm);
- }
- if (forw)
- {
- /* Copy charge into send buffer and exchange*/
- for (i = 0; i < atc->nslab; i++)
- {
- pme->sidx[i] = pme->sdispls[i];
- }
- for (i = 0; (i < n); i++)
- {
- ii = pme->sidx[pme->idxa[i]];
- pme->sidx[pme->idxa[i]]++;
- pme->redist_buf[ii] = charge[i];
- }
- MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
- atc->q, pme->rcounts, pme->rdispls, mpi_type,
- atc->mpi_comm);
- }
- else /* backward, redistribution from pme to pp */
- {
- MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
- pme->redist_buf, pme->scounts, pme->sdispls,
- pme->rvec_mpi, atc->mpi_comm);
-
- /* Copy data from receive buffer */
- for (i = 0; i < atc->nslab; i++)
- {
- pme->sidx[i] = pme->sdispls[i];
- }
- for (i = 0; (i < n); i++)
- {
- ii = DIM*pme->sidx[pme->idxa[i]];
- x_f[i][XX] += pme->redist_buf[ii+XX];
- x_f[i][YY] += pme->redist_buf[ii+YY];
- x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
- pme->sidx[pme->idxa[i]]++;
- }
- }
-#endif
-}
-
-static void pme_dd_sendrecv(pme_atomcomm_t *atc,
- gmx_bool bBackward, int shift,
- void *buf_s, int nbyte_s,
- void *buf_r, int nbyte_r)
+static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
+ gmx_bool gmx_unused bBackward, int gmx_unused shift,
+ void gmx_unused *buf_s, int gmx_unused nbyte_s,
+ void gmx_unused *buf_r, int gmx_unused nbyte_r)
{
#ifdef GMX_MPI
int dest, src;
#endif
}
-static void dd_pmeredist_x_q(gmx_pme_t pme,
- int n, gmx_bool bX, rvec *x, real *charge,
- pme_atomcomm_t *atc)
+static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
+ int n, gmx_bool bX, rvec *x, real *data,
+ pme_atomcomm_t *atc)
{
int *commnode, *buf_index;
int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
{
if (atc->count[atc->nodeid] + nsend != n)
{
- gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
+ gmx_fatal(FARGS, "%d particles communicated to PME rank %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
"This usually means that your system is not well equilibrated.",
n - (atc->count[atc->nodeid] + nsend),
pme->nodeid, 'x'+atc->dimind);
/* Communicate the count */
if (debug)
{
- fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
+ fprintf(debug, "dimind %d PME rank %d send to rank %d: %d\n",
atc->dimind, atc->nodeid, commnode[i], scount);
}
pme_dd_sendrecv(atc, FALSE, i,
{
copy_rvec(x[i], atc->x[local_pos]);
}
- atc->q[local_pos] = charge[i];
+ atc->coefficient[local_pos] = data[i];
local_pos++;
}
else
{
copy_rvec(x[i], pme->bufv[buf_index[node]]);
}
- pme->bufr[buf_index[node]] = charge[i];
+ pme->bufr[buf_index[node]] = data[i];
buf_index[node]++;
}
}
pme->bufv[buf_pos], scount*sizeof(rvec),
atc->x[local_pos], rcount*sizeof(rvec));
}
- /* Communicate the charges */
+ /* Communicate the coefficients */
pme_dd_sendrecv(atc, FALSE, i,
pme->bufr+buf_pos, scount*sizeof(real),
- atc->q+local_pos, rcount*sizeof(real));
+ atc->coefficient+local_pos, rcount*sizeof(real));
buf_pos += scount;
local_pos += atc->rcount[i];
}
}
#ifdef GMX_MPI
-static void
-gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
+static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
{
pme_overlap_t *overlap;
int send_index0, send_nindex;
/* Since we have already (un)wrapped the overlap in the z-dimension,
* we only have to communicate 0 to nkz (not pmegrid_nz).
*/
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
send_id = overlap->send_id[ipulse];
recv_id = overlap->recv_id[ipulse];
/* Copy data to contiguous send buffer */
if (debug)
{
- fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, send_id,
pme->pmegrid_start_iy,
send_index0-pme->pmegrid_start_iy,
/* Get data from contiguous recv buffer */
if (debug)
{
- fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, recv_id,
pme->pmegrid_start_iy,
recv_index0-pme->pmegrid_start_iy,
for (k = 0; k < pme->nkz; k++)
{
iz = k;
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
}
for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
{
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
send_id = overlap->send_id[ipulse];
recv_id = overlap->recv_id[ipulse];
if (debug)
{
- fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, send_id,
pme->pmegrid_start_ix,
send_index0-pme->pmegrid_start_ix,
send_index0-pme->pmegrid_start_ix+send_nindex);
- fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+ fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
pme->nodeid, overlap->nodeid, recv_id,
pme->pmegrid_start_ix,
recv_index0-pme->pmegrid_start_ix,
overlap->mpi_comm, &stat);
/* ADD data from contiguous recv buffer */
- if (direction == GMX_SUM_QGRID_FORWARD)
+ if (direction == GMX_SUM_GRID_FORWARD)
{
p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
for (i = 0; i < recv_nindex*datasize; i++)
#endif
-static int
-copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
+static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid, int grid_index)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
ivec local_pme_size;
int pmeidx, fftidx;
/* Dimensions should be identical for A/B grid, so we just use A here */
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
{
#ifdef DEBUG_PME
FILE *fp, *fp2;
- char fn[STRLEN], format[STRLEN];
+ char fn[STRLEN];
real val;
sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
- fp = ffopen(fn, "w");
+ fp = gmx_ffopen(fn, "w");
sprintf(fn, "pmegrid%d.txt", pme->nodeid);
- fp2 = ffopen(fn, "w");
- sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
+ fp2 = gmx_ffopen(fn, "w");
#endif
for (ix = 0; ix < local_fft_ndata[XX]; ix++)
val = 100*pmegrid[pmeidx];
if (pmegrid[pmeidx] != 0)
{
- fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
- 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
+ gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
+ 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
}
if (pmegrid[pmeidx] != 0)
{
}
}
#ifdef DEBUG_PME
- ffclose(fp);
- ffclose(fp2);
+ gmx_ffclose(fp);
+ gmx_ffclose(fp2);
#endif
}
return 0;
}
-static int
-copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
- int nthread, int thread)
+static int copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, int grid_index,
+ int nthread, int thread)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
ivec local_pme_size;
c1 = omp_cyc_start();
#endif
/* Dimensions should be identical for A/B grid, so we just use A here */
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
}
-static void
-wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+static void wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
{
int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
}
-static void
-unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
{
int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
for (ithx = 0; (ithx < order); ithx++) \
{ \
index_x = (i0+ithx)*pny*pnz; \
- valx = qn*thx[ithx]; \
+ valx = coefficient*thx[ithx]; \
\
for (ithy = 0; (ithy < order); ithy++) \
{ \
}
-static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
- pme_atomcomm_t *atc,
- splinedata_t *spline,
- pme_spline_work_t gmx_unused *work)
+static void spread_coefficients_bsplines_thread(pmegrid_t *pmegrid,
+ pme_atomcomm_t *atc,
+ splinedata_t *spline,
+ pme_spline_work_t gmx_unused *work)
{
- /* spread charges from home atoms to local grid */
+ /* spread coefficients from home atoms to local grid */
real *grid;
pme_overlap_t *ol;
int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
int * idxptr;
int order, norder, index_x, index_xy, index_xyz;
- real valx, valxy, qn;
+ real valx, valxy, coefficient;
real *thx, *thy, *thz;
int localsize, bndsize;
int pnx, pny, pnz, ndatatot;
int offx, offy, offz;
#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
- real thz_buffer[12], *thz_aligned;
+ real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
- thz_aligned = gmx_simd4_align_real(thz_buffer);
+ thz_aligned = gmx_simd4_align_r(thz_buffer);
#endif
pnx = pmegrid->s[XX];
for (nn = 0; nn < spline->n; nn++)
{
- n = spline->ind[nn];
- qn = atc->q[n];
+ n = spline->ind[nn];
+ coefficient = atc->coefficient[n];
- if (qn != 0)
+ if (coefficient != 0)
{
idxptr = atc->idx[n];
norder = nn*order;
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_BSPLINE(4);
#endif
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_BSPLINE(5);
#endif
/* Allocate an aligned pointer for SIMD operations, including extra
* elements at the end for padding.
*/
-#ifdef PME_SIMD
- simd_width = GMX_SIMD_WIDTH_HERE;
+#ifdef PME_SIMD_SOLVE
+ simd_width = GMX_SIMD_REAL_WIDTH;
#else
/* We can use any alignment, apart from 0, so we use 4 */
simd_width = 4;
#endif
sfree_aligned(work->denom);
sfree_aligned(work->tmp1);
+ sfree_aligned(work->tmp2);
sfree_aligned(work->eterm);
snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
snew_aligned(work->tmp1, work->nalloc+simd_width, simd_width*sizeof(real));
+ snew_aligned(work->tmp2, work->nalloc+simd_width, simd_width*sizeof(real));
snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
srenew(work->m2inv, work->nalloc);
}
sfree(work->m2);
sfree_aligned(work->denom);
sfree_aligned(work->tmp1);
+ sfree_aligned(work->tmp2);
sfree_aligned(work->eterm);
sfree(work->m2inv);
}
-#ifdef PME_SIMD
+#if defined PME_SIMD_SOLVE
/* Calculate exponentials through SIMD */
-inline static void calc_exponentials(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
+gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
{
{
- const gmx_mm_pr two = gmx_set1_pr(2.0);
- gmx_mm_pr f_simd;
- gmx_mm_pr lu;
- gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
+ const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
+ gmx_simd_real_t f_simd;
+ gmx_simd_real_t lu;
+ gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
int kx;
- f_simd = gmx_set1_pr(f);
+ f_simd = gmx_simd_set1_r(f);
/* We only need to calculate from start. But since start is 0 or 1
* and we want to use aligned loads/stores, we always start from 0.
*/
- for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
+ for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
{
- tmp_d1 = gmx_load_pr(d_aligned+kx);
- d_inv = gmx_inv_pr(tmp_d1);
- tmp_r = gmx_load_pr(r_aligned+kx);
- tmp_r = gmx_exp_pr(tmp_r);
- tmp_e = gmx_mul_pr(f_simd, d_inv);
- tmp_e = gmx_mul_pr(tmp_e, tmp_r);
- gmx_store_pr(e_aligned+kx, tmp_e);
+ tmp_d1 = gmx_simd_load_r(d_aligned+kx);
+ d_inv = gmx_simd_inv_r(tmp_d1);
+ tmp_r = gmx_simd_load_r(r_aligned+kx);
+ tmp_r = gmx_simd_exp_r(tmp_r);
+ tmp_e = gmx_simd_mul_r(f_simd, d_inv);
+ tmp_e = gmx_simd_mul_r(tmp_e, tmp_r);
+ gmx_simd_store_r(e_aligned+kx, tmp_e);
}
}
}
#else
-inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
+gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
{
int kx;
for (kx = start; kx < end; kx++)
}
#endif
+#if defined PME_SIMD_SOLVE
+/* Calculate exponentials through SIMD */
+gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
+{
+ gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
+ const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
+ int kx;
+ for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
+ {
+ /* We only need to calculate from start. But since start is 0 or 1
+ * and we want to use aligned loads/stores, we always start from 0.
+ */
+ tmp_d = gmx_simd_load_r(d_aligned+kx);
+ d_inv = gmx_simd_inv_r(tmp_d);
+ gmx_simd_store_r(d_aligned+kx, d_inv);
+ tmp_r = gmx_simd_load_r(r_aligned+kx);
+ tmp_r = gmx_simd_exp_r(tmp_r);
+ gmx_simd_store_r(r_aligned+kx, tmp_r);
+ tmp_mk = gmx_simd_load_r(factor_aligned+kx);
+ tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
+ gmx_simd_store_r(factor_aligned+kx, tmp_fac);
+ }
+}
+#else
+gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
+{
+ int kx;
+ real mk;
+ for (kx = start; kx < end; kx++)
+ {
+ d[kx] = 1.0/d[kx];
+ }
+
+ for (kx = start; kx < end; kx++)
+ {
+ r[kx] = exp(r[kx]);
+ }
+
+ for (kx = start; kx < end; kx++)
+ {
+ mk = tmp2[kx];
+ tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk);
+ }
+}
+#endif
static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
real ewaldcoeff, real vol,
nz = pme->nkz;
/* Dimensions should be identical for A/B grid, so we just use A here */
- gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA],
complex_order,
local_ndata,
local_offset,
m2inv[kx] = 1.0/m2[kx];
}
- calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+ calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
for (kx = kxstart; kx < kxend; kx++, p0++)
{
tmp1[kx] = -factor*m2k;
}
- calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+ calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
for (kx = kxstart; kx < kxend; kx++, p0++)
{
* experiencing problems on semiisotropic membranes.
* IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
*/
- work->vir[XX][XX] = 0.25*virxx;
- work->vir[YY][YY] = 0.25*viryy;
- work->vir[ZZ][ZZ] = 0.25*virzz;
- work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
- work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
- work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
+ work->vir_q[XX][XX] = 0.25*virxx;
+ work->vir_q[YY][YY] = 0.25*viryy;
+ work->vir_q[ZZ][ZZ] = 0.25*virzz;
+ work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25*virxy;
+ work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25*virxz;
+ work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25*viryz;
+
+ /* This energy should be corrected for a charged system */
+ work->energy_q = 0.5*energy;
+ }
+
+ /* Return the loop count */
+ return local_ndata[YY]*local_ndata[XX];
+}
+
+/* Reciprocal-space solve for LJ-PME (dispersion) on the local grid slab;
+ * grid layout is y major, z middle, x minor. With bLB set, seven grids
+ * are combined (Lorentz-Berthelot combination rule); otherwise grid[0]
+ * alone is used. Energy and virial are accumulated into
+ * pme->work[thread] when bEnerVir is set. Returns the local loop count
+ * (presumably consumed for flop accounting by the caller -- confirm).
+ */
+static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB,
+ real ewaldcoeff, real vol,
+ gmx_bool bEnerVir, int nthread, int thread)
+{
+ /* do recip sum over local cells in grid */
+ /* y major, z middle, x minor or continuous */
+ int ig, gcount;
+ int kx, ky, kz, maxkx, maxky, maxkz;
+ int nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
+ real mx, my, mz;
+ real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
+ real ets2, ets2vf;
+ real eterm, vterm, d1, d2, energy = 0;
+ real by, bz;
+ real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
+ real rxx, ryx, ryy, rzx, rzy, rzz;
+ real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
+ real mhxk, mhyk, mhzk, m2k;
+ real mk;
+ pme_work_t *work;
+ real corner_fac;
+ ivec complex_order;
+ ivec local_ndata, local_offset, local_size;
+ nx = pme->nkx;
+ ny = pme->nky;
+ nz = pme->nkz;
+
+ /* Dimensions should be identical for A/B grid, so we just use A here */
+ gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A],
+ complex_order,
+ local_ndata,
+ local_offset,
+ local_size);
+ rxx = pme->recipbox[XX][XX];
+ ryx = pme->recipbox[YY][XX];
+ ryy = pme->recipbox[YY][YY];
+ rzx = pme->recipbox[ZZ][XX];
+ rzy = pme->recipbox[ZZ][YY];
+ rzz = pme->recipbox[ZZ][ZZ];
+
+ maxkx = (nx+1)/2;
+ maxky = (ny+1)/2;
+ maxkz = nz/2+1;
+
+ /* Per-thread scratch arrays; NOTE(review): assumed sized for the full
+ * kx range -- confirm against pme_work_t allocation.
+ */
+ work = &pme->work[thread];
+ mhx = work->mhx;
+ mhy = work->mhy;
+ mhz = work->mhz;
+ m2 = work->m2;
+ denom = work->denom;
+ tmp1 = work->tmp1;
+ tmp2 = work->tmp2;
+
+ /* Static split of the local y*z range over the nthread threads */
+ iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
+ iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
+
+ for (iyz = iyz0; iyz < iyz1; iyz++)
+ {
+ iy = iyz/local_ndata[ZZ];
+ iz = iyz - iy*local_ndata[ZZ];
+
+ ky = iy + local_offset[YY];
+
+ if (ky < maxky)
+ {
+ my = ky;
+ }
+ else
+ {
+ my = (ky - ny);
+ }
+
+ by = 3.0*vol*pme->bsp_mod[YY][ky]
+ / (M_PI*sqrt(M_PI)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
+
+ kz = iz + local_offset[ZZ];
+
+ mz = kz;
+
+ bz = pme->bsp_mod[ZZ][kz];
+
+ /* 0.5 correction for corner points */
+ corner_fac = 1;
+ if (kz == 0 || kz == (nz+1)/2)
+ {
+ corner_fac = 0.5;
+ }
+
+ kxstart = local_offset[XX];
+ kxend = local_offset[XX] + local_ndata[XX];
+ if (bEnerVir)
+ {
+ /* More expensive inner loop, especially because of the
+ * storage of the mh elements in array's. Because x is the
+ * minor grid index, all mh elements depend on kx for
+ * triclinic unit cells.
+ */
+
+ /* Two explicit loops to avoid a conditional inside the loop */
+ for (kx = kxstart; kx < maxkx; kx++)
+ {
+ mx = kx;
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ mhx[kx] = mhxk;
+ mhy[kx] = mhyk;
+ mhz[kx] = mhzk;
+ m2[kx] = m2k;
+ denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ tmp2[kx] = sqrt(factor*m2k);
+ }
+
+ for (kx = maxkx; kx < kxend; kx++)
+ {
+ mx = (kx - nx);
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ mhx[kx] = mhxk;
+ mhy[kx] = mhyk;
+ mhz[kx] = mhzk;
+ m2[kx] = m2k;
+ denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ tmp2[kx] = sqrt(factor*m2k);
+ }
+
+ /* In-place: tmp1 -> exp(tmp1), tmp2 -> sqrt(pi)*tmp2*erfc(tmp2),
+ * denom -> 1/denom (see calc_exponentials_lj)
+ */
+ calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
+
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ m2k = factor*m2[kx];
+ eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
+ + 2.0*m2k*tmp2[kx]);
+ vterm = 3.0*(-tmp1[kx] + tmp2[kx]);
+ tmp1[kx] = eterm*denom[kx];
+ tmp2[kx] = vterm*denom[kx];
+ }
+
+ /* Single grid for a single C6 coefficient set; bLB combines the
+ * seven grids of the Lorentz-Berthelot expansion below.
+ */
+ if (!bLB)
+ {
+ t_complex *p0;
+ real struct2;
+
+ p0 = grid[0] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+ for (kx = kxstart; kx < kxend; kx++, p0++)
+ {
+ d1 = p0->re;
+ d2 = p0->im;
+
+ eterm = tmp1[kx];
+ vterm = tmp2[kx];
+ p0->re = d1*eterm;
+ p0->im = d2*eterm;
+
+ struct2 = 2.0*(d1*d1+d2*d2);
+
+ tmp1[kx] = eterm*struct2;
+ tmp2[kx] = vterm*struct2;
+ }
+ }
+ else
+ {
+ /* denom is no longer needed here, so reuse it for struct2 */
+ real *struct2 = denom;
+ real str2;
+
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ struct2[kx] = 0.0;
+ }
+ /* Due to symmetry we only need to calculate 4 of the 7 terms */
+ for (ig = 0; ig <= 3; ++ig)
+ {
+ t_complex *p0, *p1;
+ real scale;
+
+ p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+ p1 = grid[6-ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+ scale = 2.0*lb_scale_factor_symm[ig];
+ for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
+ {
+ struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
+ }
+
+ }
+ for (ig = 0; ig <= 6; ++ig)
+ {
+ t_complex *p0;
+
+ p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+ for (kx = kxstart; kx < kxend; kx++, p0++)
+ {
+ d1 = p0->re;
+ d2 = p0->im;
+
+ eterm = tmp1[kx];
+ p0->re = d1*eterm;
+ p0->im = d2*eterm;
+ }
+ }
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ eterm = tmp1[kx];
+ vterm = tmp2[kx];
+ str2 = struct2[kx];
+ tmp1[kx] = eterm*str2;
+ tmp2[kx] = vterm*str2;
+ }
+ }
+
+ /* Accumulate energy and the six virial components */
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ ets2 = corner_fac*tmp1[kx];
+ vterm = 2.0*factor*tmp2[kx];
+ energy += ets2;
+ ets2vf = corner_fac*vterm;
+ virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
+ virxy += ets2vf*mhx[kx]*mhy[kx];
+ virxz += ets2vf*mhx[kx]*mhz[kx];
+ viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
+ viryz += ets2vf*mhy[kx]*mhz[kx];
+ virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
+ }
+ }
+ else
+ {
+ /* We don't need to calculate the energy and the virial.
+ * In this case the triclinic overhead is small.
+ */
+
+ /* Two explicit loops to avoid a conditional inside the loop */
+
+ for (kx = kxstart; kx < maxkx; kx++)
+ {
+ mx = kx;
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ m2[kx] = m2k;
+ denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ tmp2[kx] = sqrt(factor*m2k);
+ }
+
+ for (kx = maxkx; kx < kxend; kx++)
+ {
+ mx = (kx - nx);
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ m2[kx] = m2k;
+ denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ tmp2[kx] = sqrt(factor*m2k);
+ }
+
+ calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
+
+ for (kx = kxstart; kx < kxend; kx++)
+ {
+ m2k = factor*m2[kx];
+ eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
+ + 2.0*m2k*tmp2[kx]);
+ tmp1[kx] = eterm*denom[kx];
+ }
+ gcount = (bLB ? 7 : 1);
+ for (ig = 0; ig < gcount; ++ig)
+ {
+ t_complex *p0;
+
+ p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+ for (kx = kxstart; kx < kxend; kx++, p0++)
+ {
+ d1 = p0->re;
+ d2 = p0->im;
+
+ eterm = tmp1[kx];
+
+ p0->re = d1*eterm;
+ p0->im = d2*eterm;
+ }
+ }
+ }
+ }
+ if (bEnerVir)
+ {
+ work->vir_lj[XX][XX] = 0.25*virxx;
+ work->vir_lj[YY][YY] = 0.25*viryy;
+ work->vir_lj[ZZ][ZZ] = 0.25*virzz;
+ work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
+ work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
+ work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;
/* This energy should be corrected for a charged system */
- work->energy = 0.5*energy;
+ work->energy_lj = 0.5*energy;
+ /* NOTE(review): the comment above was inherited from the Coulomb
+ * solver; its applicability to LJ-PME should be confirmed.
+ */
}
+ /* Return the loop count */
+ return local_ndata[YY]*local_ndata[XX];
+}
+
+/* Reduce the per-thread Coulomb PME energy and virial (work[].energy_q,
+ * work[].vir_q) into *mesh_energy and vir.
+ */
+static void get_pme_ener_vir_q(const gmx_pme_t pme, int nthread,
+ real *mesh_energy, matrix vir)
+{
+ /* This function sums output over threads and should therefore
+ * only be called after thread synchronization.
+ */
+ int thread;
+
+ *mesh_energy = pme->work[0].energy_q;
+ copy_mat(pme->work[0].vir_q, vir);
- /* Return the loop count */
- return local_ndata[YY]*local_ndata[XX];
+ /* Accumulate the partial results of the remaining threads */
+ for (thread = 1; thread < nthread; thread++)
+ {
+ *mesh_energy += pme->work[thread].energy_q;
+ m_add(vir, pme->work[thread].vir_q, vir);
+ }
}
+/* LJ-PME counterpart of get_pme_ener_vir_q: reduce the per-thread
+ * dispersion energy and virial (work[].energy_lj, work[].vir_lj).
+ */
-static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
- real *mesh_energy, matrix vir)
+static void get_pme_ener_vir_lj(const gmx_pme_t pme, int nthread,
+ real *mesh_energy, matrix vir)
{
- /* This function sums output over threads
- * and should therefore only be called after thread synchronization.
+ /* This function sums output over threads and should therefore
+ * only be called after thread synchronization.
*/
int thread;
- *mesh_energy = pme->work[0].energy;
- copy_mat(pme->work[0].vir, vir);
+ *mesh_energy = pme->work[0].energy_lj;
+ copy_mat(pme->work[0].vir_lj, vir);
+ /* Accumulate the partial results of the remaining threads */
for (thread = 1; thread < nthread; thread++)
{
- *mesh_energy += pme->work[thread].energy;
- m_add(vir, pme->work[thread].vir, vir);
+ *mesh_energy += pme->work[thread].energy_lj;
+ m_add(vir, pme->work[thread].vir_lj, vir);
}
}
+
#define DO_FSPLINE(order) \
for (ithx = 0; (ithx < order); ithx++) \
{ \
int index_x, index_xy;
int nx, ny, nz, pnx, pny, pnz;
int * idxptr;
- real tx, ty, dx, dy, qn;
+ real tx, ty, dx, dy, coefficient;
real fx, fy, fz, gval;
real fxy1, fz1;
real *thx, *thy, *thz, *dthx, *dthy, *dthz;
pme_spline_work_t *work;
#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
- real thz_buffer[12], *thz_aligned;
- real dthz_buffer[12], *dthz_aligned;
+ real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
+ real dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
- thz_aligned = gmx_simd4_align_real(thz_buffer);
- dthz_aligned = gmx_simd4_align_real(dthz_buffer);
+ thz_aligned = gmx_simd4_align_r(thz_buffer);
+ dthz_aligned = gmx_simd4_align_r(dthz_buffer);
#endif
work = pme->spline_work;
for (nn = 0; nn < spline->n; nn++)
{
- n = spline->ind[nn];
- qn = scale*atc->q[n];
+ n = spline->ind[nn];
+ coefficient = scale*atc->coefficient[n];
if (bClearF)
{
atc->f[n][YY] = 0;
atc->f[n][ZZ] = 0;
}
- if (qn != 0)
+ if (coefficient != 0)
{
fx = 0;
fy = 0;
#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_FSPLINE(4);
#endif
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
#else
DO_FSPLINE(5);
#endif
break;
}
- atc->f[n][XX] += -qn*( fx*nx*rxx );
- atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
- atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
+ atc->f[n][XX] += -coefficient*( fx*nx*rxx );
+ atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
+ atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
}
}
/* Since the energy and not forces are interpolated
int n, ithx, ithy, ithz, i0, j0, k0;
int index_x, index_xy;
int * idxptr;
- real energy, pot, tx, ty, qn, gval;
+ real energy, pot, tx, ty, coefficient, gval;
real *thx, *thy, *thz;
int norder;
int order;
energy = 0;
for (n = 0; (n < atc->n); n++)
{
- qn = atc->q[n];
+ coefficient = atc->coefficient[n];
- if (qn != 0)
+ if (coefficient != 0)
{
idxptr = atc->idx[n];
norder = n*order;
}
}
- energy += pot*qn;
+ energy += pot*coefficient;
}
}
}
void make_bsplines(splinevec theta, splinevec dtheta, int order,
- rvec fractx[], int nr, int ind[], real charge[],
- gmx_bool bFreeEnergy)
+ rvec fractx[], int nr, int ind[], real coefficient[],
+ gmx_bool bDoSplines)
{
/* construct splines for local atoms */
int i, ii;
for (i = 0; i < nr; i++)
{
- /* With free energy we do not use the charge check.
+ /* With free energy we do not use the coefficient check.
* In most cases this will be more efficient than calling make_bsplines
- * twice, since usually more than half the particles have charges.
+ * twice, since usually more than half the particles have non-zero coefficients.
*/
ii = ind[i];
- if (bFreeEnergy || charge[ii] != 0.0)
+ if (bDoSplines || coefficient[ii] != 0.0)
{
xptr = fractx[ii];
switch (order)
int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
{
- int thread;
+ int thread, i;
if (NULL != log)
{
sfree((*pmedata)->nny);
sfree((*pmedata)->nnz);
- pmegrids_destroy(&(*pmedata)->pmegridA);
-
- sfree((*pmedata)->fftgridA);
- sfree((*pmedata)->cfftgridA);
- gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
-
- if ((*pmedata)->pmegridB.grid.grid != NULL)
+ for (i = 0; i < (*pmedata)->ngrids; ++i)
{
- pmegrids_destroy(&(*pmedata)->pmegridB);
- sfree((*pmedata)->fftgridB);
- sfree((*pmedata)->cfftgridB);
- gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
+ pmegrids_destroy(&(*pmedata)->pmegrid[i]);
+ sfree((*pmedata)->fftgrid[i]);
+ sfree((*pmedata)->cfftgrid[i]);
+ gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
}
+
+ sfree((*pmedata)->lb_buf1);
+ sfree((*pmedata)->lb_buf2);
+
for (thread = 0; thread < (*pmedata)->nthread; thread++)
{
free_work(&(*pmedata)->work[thread]);
if (atc->nslab > 1)
{
- /* These three allocations are not required for particle decomp. */
snew(atc->node_dest, atc->nslab);
snew(atc->node_src, atc->nslab);
setup_coordinate_communication(atc);
{
/* s2g0 the local interpolation grid start.
* s2g1 the local interpolation grid end.
- * Because grid overlap communication only goes forward,
- * the grid the slabs for fft's should be rounded down.
+ * Since in calc_pidx we divide particles, and not grid lines,
+ * spatially uniform along dimension x or y, we need to round
+ * s2g0 down and s2g1 up.
*/
ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
pme_spline_work_t *work;
#ifdef PME_SIMD4_SPREAD_GATHER
- real tmp[12], *tmp_aligned;
- gmx_simd4_pr zero_S;
- gmx_simd4_pr real_mask_S0, real_mask_S1;
- int of, i;
+ real tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
+ gmx_simd4_real_t zero_S;
+ gmx_simd4_real_t real_mask_S0, real_mask_S1;
+ int of, i;
snew_aligned(work, 1, SIMD4_ALIGNMENT);
- tmp_aligned = gmx_simd4_align_real(tmp);
+ tmp_aligned = gmx_simd4_align_r(tmp);
- zero_S = gmx_simd4_setzero_pr();
+ zero_S = gmx_simd4_setzero_r();
/* Generate bit masks to mask out the unused grid entries,
* as we only operate on order of the 8 grid entries that are
* load into 2 SIMD registers.
*/
- for (of = 0; of < 8-(order-1); of++)
+ for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
{
- for (i = 0; i < 8; i++)
+ for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
{
tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
}
- real_mask_S0 = gmx_simd4_load_pr(tmp_aligned);
- real_mask_S1 = gmx_simd4_load_pr(tmp_aligned+4);
- work->mask_S0[of] = gmx_simd4_cmplt_pr(real_mask_S0, zero_S);
- work->mask_S1[of] = gmx_simd4_cmplt_pr(real_mask_S1, zero_S);
+ real_mask_S0 = gmx_simd4_load_r(tmp_aligned);
+ real_mask_S1 = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
+ work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
+ work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
}
#else
work = NULL;
*bValidSettings = FALSE;
return;
}
- gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
+ gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
nkx/(double)nnodes_major, pme_order);
}
int nnodes_minor,
t_inputrec * ir,
int homenr,
- gmx_bool bFreeEnergy,
+ gmx_bool bFreeEnergy_q,
+ gmx_bool bFreeEnergy_lj,
gmx_bool bReproducible,
int nthread)
{
gmx_pme_t pme = NULL;
- int use_threads, sum_use_threads;
+ int use_threads, sum_use_threads, i;
ivec ndata;
if (debug)
}
snew(pme, 1);
- pme->redist_init = FALSE;
pme->sum_qgrid_tmp = NULL;
pme->sum_qgrid_dd_tmp = NULL;
pme->buf_nalloc = 0;
- pme->redist_buf_nalloc = 0;
pme->nnodes = 1;
pme->bPPnode = TRUE;
MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
if (pme->nnodes != nnodes_major*nnodes_minor)
{
- gmx_incons("PME node count mismatch");
+ gmx_incons("PME rank count mismatch");
}
}
else
{
if (pme->nnodes % nnodes_major != 0)
{
- gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
+ gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
}
pme->ndecompdim = 2;
gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
}
- pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
+ pme->bFEP_q = ((ir->efep != efepNO) && bFreeEnergy_q);
+ pme->bFEP_lj = ((ir->efep != efepNO) && bFreeEnergy_lj);
+ pme->bFEP = (pme->bFEP_q || pme->bFEP_lj);
pme->nkx = ir->nkx;
pme->nky = ir->nky;
pme->nkz = ir->nkz;
pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
pme->pme_order = ir->pme_order;
+
+ /* Always constant electrostatics coefficients */
pme->epsilon_r = ir->epsilon_r;
+ /* Always constant LJ coefficients */
+ pme->ljpme_combination_rule = ir->ljpme_combination_rule;
+
/* If we violate restrictions, generate a fatal error here */
gmx_pme_check_restrictions(pme->pme_order,
pme->nkx, pme->nky, pme->nkz,
MPI_Type_commit(&(pme->rvec_mpi));
#endif
- /* Note that the charge spreading and force gathering, which usually
+ /* Note that the coefficient spreading and force gathering, which usually
* takes about the same amount of time as FFT+solve_pme,
* is always fully load balanced
- * (unless the charge distribution is inhomogeneous).
+ * (unless the coefficient distribution is inhomogeneous).
*/
imbal = pme_load_imbalance(pme);
"\n"
"NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
" For optimal PME load balancing\n"
- " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
- " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
+ " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
+ " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
"\n",
(int)((imbal-1)*100 + 0.5),
pme->nkx, pme->nky, pme->nnodes_major,
pme->pmegrid_nz_base,
&pme->nnz, &pme->fshz);
- pmegrids_init(&pme->pmegridA,
- pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
- pme->pmegrid_nz_base,
- pme->pme_order,
- pme->bUseThreads,
- pme->nthread,
- pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
- pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
-
pme->spline_work = make_pme_spline_work(pme->pme_order);
- ndata[0] = pme->nkx;
- ndata[1] = pme->nky;
- ndata[2] = pme->nkz;
-
- /* This routine will allocate the grid data to fit the FFTs */
- gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
- &pme->fftgridA, &pme->cfftgridA,
- pme->mpi_comm_d,
- bReproducible, pme->nthread);
-
- if (bFreeEnergy)
+ ndata[0] = pme->nkx;
+ ndata[1] = pme->nky;
+ ndata[2] = pme->nkz;
+ /* It doesn't matter if we allocate too many grids here,
+ * we only allocate and use the ones we need.
+ */
+ if (EVDW_PME(ir->vdwtype))
{
- pmegrids_init(&pme->pmegridB,
- pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
- pme->pmegrid_nz_base,
- pme->pme_order,
- pme->bUseThreads,
- pme->nthread,
- pme->nkx % pme->nnodes_major != 0,
- pme->nky % pme->nnodes_minor != 0);
-
- gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
- &pme->fftgridB, &pme->cfftgridB,
- pme->mpi_comm_d,
- bReproducible, pme->nthread);
+ pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
}
else
{
- pme->pmegridB.grid.grid = NULL;
- pme->fftgridB = NULL;
- pme->cfftgridB = NULL;
+ pme->ngrids = DO_Q;
+ }
+ snew(pme->fftgrid, pme->ngrids);
+ snew(pme->cfftgrid, pme->ngrids);
+ snew(pme->pfft_setup, pme->ngrids);
+
+ for (i = 0; i < pme->ngrids; ++i)
+ {
+ if ((i < DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
+ bFreeEnergy_q)) ||
+ (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
+ bFreeEnergy_lj ||
+ ir->ljpme_combination_rule == eljpmeLB)))
+ {
+ pmegrids_init(&pme->pmegrid[i],
+ pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
+ pme->pmegrid_nz_base,
+ pme->pme_order,
+ pme->bUseThreads,
+ pme->nthread,
+ pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
+ pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
+ /* This routine will allocate the grid data to fit the FFTs */
+ gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
+ &pme->fftgrid[i], &pme->cfftgrid[i],
+ pme->mpi_comm_d,
+ bReproducible, pme->nthread);
+
+ }
}
if (!pme->bP3M)
pme_realloc_atomcomm_things(&pme->atc[0]);
}
+ pme->lb_buf1 = NULL;
+ pme->lb_buf2 = NULL;
+ pme->lb_buf_nalloc = 0;
+
{
int thread;
}
ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
- &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
+ &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, pme_src->nthread);
if (ret == 0)
{
/* We can easily reuse the allocated pme grids in pme_src */
- reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
+ reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
/* We would like to reuse the fft grids, but that's harder */
}
}
-static void copy_local_grid(gmx_pme_t pme,
- pmegrids_t *pmegrids, int thread, real *fftgrid)
+static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
+ int grid_index, int thread, real *fftgrid)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
int fft_my, fft_mz;
pmegrid_t *pmegrid;
real *grid_th;
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
static void
reduce_threadgrid_overlap(gmx_pme_t pme,
const pmegrids_t *pmegrids, int thread,
- real *fftgrid, real *commbuf_x, real *commbuf_y)
+ real *fftgrid, real *commbuf_x, real *commbuf_y,
+ int grid_index)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
int fft_nx, fft_ny, fft_nz;
int fft_my, fft_mz;
int buf_my = -1;
int nsx, nsy, nsz;
- ivec ne;
+ ivec localcopy_end;
int offx, offy, offz, x, y, z, i0, i0t;
int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
const real *grid_th;
real *commbuf = NULL;
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
for (d = 0; d < DIM; d++)
{
- ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
- local_fft_ndata[d]);
+ /* Determine up to where our thread needs to copy from the
+ * thread-local charge spreading grid to the rank-local FFT grid.
+ * This is up to our spreading grid end minus order-1 and
+ * not beyond the local FFT grid.
+ */
+ localcopy_end[d] =
+ min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+ local_fft_ndata[d]);
}
offx = pmegrid->offset[XX];
/* Now loop over all the thread data blocks that contribute
* to the grid region we (our thread) are operating on.
*/
- /* Note that ffy_nx/y is equal to the number of grid points
+ /* Note that fft_nx/y is equal to the number of grid points
* between the first point of our node grid and the one of the next node.
*/
for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
}
pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
ox += pmegrid_g->offset[XX];
+ /* Determine the end of our part of the source grid */
if (!bCommX)
{
- tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
+ /* Use our thread local source grid and target grid part */
+ tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
}
else
{
+ /* Use our thread local source grid and the spreading range */
tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
}
}
pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
oy += pmegrid_g->offset[YY];
+ /* Determine the end of our part of the source grid */
if (!bCommY)
{
- ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
+ /* Use our thread local source grid and target grid part */
+ ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
}
else
{
+ /* Use our thread local source grid and the spreading range */
ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
}
}
pmegrid_g = &pmegrids->grid_th[fz];
oz += pmegrid_g->offset[ZZ];
- tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
+ tz1 = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
if (sx == 0 && sy == 0 && sz == 0)
{
if (bCommY)
{
commbuf = commbuf_y;
- buf_my = ty1 - offy;
+ /* The y-size of the communication buffer is set by
+ * the overlap of the grid part of our local slab
+ * with the part starting at the next slab.
+ */
+ buf_my =
+ pme->overlap[1].s2g1[pme->nodeid_minor] -
+ pme->overlap[1].s2g0[pme->nodeid_minor+1];
if (bCommX)
{
/* We index commbuf modulo the local grid size */
}
-static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
+static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
pme_overlap_t *overlap;
int x, y, z, indg, indb;
/* Note that this routine is only used for forward communication.
- * Since the force gathering, unlike the charge spreading,
+ * Since the force gathering, unlike the coefficient spreading,
* can be trivially parallelized over the particles,
* the backwards process is much simpler and can use the "old"
* communication setup.
*/
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
local_fft_ndata,
local_fft_offset,
local_fft_size);
sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
recvptr = overlap->recvbuf;
+ if (debug != NULL)
+ {
+ fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
+ local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
+ }
+
#ifdef GMX_MPI
MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
send_id, ipulse,
if (debug != NULL)
{
- fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+ fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
}
static void spread_on_grid(gmx_pme_t pme,
pme_atomcomm_t *atc, pmegrids_t *grids,
gmx_bool bCalcSplines, gmx_bool bSpread,
- real *fftgrid)
+ real *fftgrid, gmx_bool bDoSplines, int grid_index)
{
int nthread, thread;
#ifdef PME_TIME_THREADS
/* Compute fftgrid index for all atoms,
* with help of some extra variables.
*/
- calc_interpolation_idx(pme, atc, start, end, thread);
+ calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
}
}
#ifdef PME_TIME_THREADS
if (grids->nthread == 1)
{
- /* One thread, we operate on all charges */
+ /* One thread, we operate on all coefficients */
spline->n = atc->n;
}
else
if (bCalcSplines)
{
make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
- atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
+ atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
}
if (bSpread)
#ifdef PME_TIME_SPREAD
ct1a = omp_cyc_start();
#endif
- spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
+ spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
if (pme->bUseThreads)
{
- copy_local_grid(pme, grids, thread, fftgrid);
+ copy_local_grid(pme, grids, grid_index, thread, fftgrid);
}
#ifdef PME_TIME_SPREAD
ct1a = omp_cyc_end(ct1a);
reduce_threadgrid_overlap(pme, grids, thread,
fftgrid,
pme->overlap[0].sendbuf,
- pme->overlap[1].sendbuf);
+ pme->overlap[1].sendbuf,
+ grid_index);
}
#ifdef PME_TIME_THREADS
c3 = omp_cyc_end(c3);
* For this communication call we need to check pme->bUseThreads
* to have all ranks communicate here, regardless of pme->nthread.
*/
- sum_fftgrid_dd(pme, fftgrid);
+ sum_fftgrid_dd(pme, fftgrid, grid_index);
}
}
{
ivec local_fft_ndata, local_fft_offset, local_fft_size;
- gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
local_fft_ndata,
local_fft_offset,
local_fft_size);
{
gmx_incons("gmx_pme_calc_energy called in parallel");
}
- if (pme->bFEP > 1)
+ if (pme->bFEP_q > 1)
{
gmx_incons("gmx_pme_calc_energy with free energy");
}
atc->pme_order = pme->pme_order;
atc->n = n;
pme_realloc_atomcomm_things(atc);
- atc->x = x;
- atc->q = q;
+ atc->x = x;
+ atc->coefficient = q;
/* We only use the A-charges grid */
- grid = &pme->pmegridA;
+ grid = &pme->pmegrid[PME_GRID_QA];
/* Only calculate the spline coefficients, don't actually spread */
- spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
+ spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
*V = gather_energy_bsplines(pme, grid->grid.grid, atc);
}
static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
gmx_walltime_accounting_t walltime_accounting,
t_nrnb *nrnb, t_inputrec *ir,
- gmx_large_int_t step)
+ gmx_int64_t step)
{
/* Reset all the counters related to performance over the run */
wallcycle_stop(wcycle, ewcRUN);
*pme_ret = (*pmedata)[ind];
}
-
int gmx_pmeonly(gmx_pme_t pme,
- t_commrec *cr, t_nrnb *nrnb,
+ t_commrec *cr, t_nrnb *mynrnb,
gmx_wallcycle_t wcycle,
gmx_walltime_accounting_t walltime_accounting,
- real ewaldcoeff,
+ real ewaldcoeff_q, real ewaldcoeff_lj,
t_inputrec *ir)
{
int npmedata;
matrix box;
rvec *x_pp = NULL, *f_pp = NULL;
real *chargeA = NULL, *chargeB = NULL;
- real lambda = 0;
+ real *c6A = NULL, *c6B = NULL;
+ real *sigmaA = NULL, *sigmaB = NULL;
+ real lambda_q = 0;
+ real lambda_lj = 0;
int maxshift_x = 0, maxshift_y = 0;
- real energy, dvdlambda;
- matrix vir;
+ real energy_q, energy_lj, dvdlambda_q, dvdlambda_lj;
+ matrix vir_q, vir_lj;
float cycles;
int count;
gmx_bool bEnerVir;
- gmx_large_int_t step, step_rel;
+ int pme_flags;
+ gmx_int64_t step, step_rel;
ivec grid_switch;
/* This data will only use with PME tuning, i.e. switching PME grids */
pme_pp = gmx_pme_pp_init(cr);
- init_nrnb(nrnb);
+ init_nrnb(mynrnb);
count = 0;
do /****** this is a quasi-loop over time steps! */
do
{
/* Domain decomposition */
- ret = gmx_pme_recv_q_x(pme_pp,
- &natoms,
- &chargeA, &chargeB, box, &x_pp, &f_pp,
- &maxshift_x, &maxshift_y,
- &pme->bFEP, &lambda,
- &bEnerVir,
- &step,
- grid_switch, &ewaldcoeff);
+ ret = gmx_pme_recv_coeffs_coords(pme_pp,
+ &natoms,
+ &chargeA, &chargeB,
+ &c6A, &c6B,
+ &sigmaA, &sigmaB,
+ box, &x_pp, &f_pp,
+ &maxshift_x, &maxshift_y,
+ &pme->bFEP_q, &pme->bFEP_lj,
+ &lambda_q, &lambda_lj,
+ &bEnerVir,
+ &pme_flags,
+ &step,
+ grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj);
if (ret == pmerecvqxSWITCHGRID)
{
if (ret == pmerecvqxRESETCOUNTERS)
{
/* Reset the cycle and flop counters */
- reset_pmeonly_counters(wcycle, walltime_accounting, nrnb, ir, step);
+ reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step);
}
}
while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
wallcycle_start(wcycle, ewcPMEMESH);
- dvdlambda = 0;
- clear_mat(vir);
- gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
- cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
- &energy, lambda, &dvdlambda,
- GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
+ dvdlambda_q = 0;
+ dvdlambda_lj = 0;
+ clear_mat(vir_q);
+ clear_mat(vir_lj);
+
+ gmx_pme_do(pme, 0, natoms, x_pp, f_pp,
+ chargeA, chargeB, c6A, c6B, sigmaA, sigmaB, box,
+ cr, maxshift_x, maxshift_y, mynrnb, wcycle,
+ vir_q, ewaldcoeff_q, vir_lj, ewaldcoeff_lj,
+ &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
+ pme_flags | GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
cycles = wallcycle_stop(wcycle, ewcPMEMESH);
gmx_pme_send_force_vir_ener(pme_pp,
- f_pp, vir, energy, dvdlambda,
- cycles);
+ f_pp, vir_q, energy_q, vir_lj, energy_lj,
+ dvdlambda_q, dvdlambda_lj, cycles);
count++;
} /***** end of quasi-loop, we stop with the break above */
return 0;
}
+/* Compute the initial per-atom spreading coefficient for LJ-PME with
+ * Lorentz-Berthelot combination rules: coefficient[i] = C6[i] / sigma[i]^4.
+ * The later terms of the LB series are obtained by repeatedly multiplying
+ * this value by sigma (see calc_next_lb_coeffs), yielding the successive
+ * powers of sigma used in the seven-term expansion.
+ */
+static void
+calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma)
+{
+ int i;
+
+ for (i = 0; i < pme->atc[0].n; ++i)
+ {
+ real sigma4;
+
+ /* sigma4 = sigma^4, built by two squarings */
+ sigma4 = local_sigma[i];
+ sigma4 = sigma4*sigma4;
+ sigma4 = sigma4*sigma4;
+ pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
+ }
+}
+
+/* Advance the Lorentz-Berthelot LJ-PME series by one term: multiply each
+ * atom's current spreading coefficient by its sigma, producing the next
+ * power of sigma. The series is started by calc_initial_lb_coeffs.
+ */
+static void
+calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma)
+{
+ int i;
+
+ for (i = 0; i < pme->atc[0].n; ++i)
+ {
+ pme->atc[0].coefficient[i] *= local_sigma[i];
+ }
+}
+
+/* Redistribute particle positions and spreading coefficients (charges,
+ * C6 or sigma parameters, all passed via 'data') over the PME ranks.
+ *
+ * Walks the decomposition dimensions from last to first. On the outermost
+ * pass the caller's arrays are used (x + start, data); on inner passes the
+ * input is taken from the previous dimension's atom-communication struct.
+ * Positions are only redistributed on the first call of a step (bFirst);
+ * coefficients are redistributed on every call.
+ * NOTE(review): redistribution only happens under domain decomposition;
+ * with pme->nnodes == 1 the caller is expected not to reach this path.
+ */
+static void
+do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
+ gmx_bool bFirst, rvec x[], real *data)
+{
+ int d;
+ pme_atomcomm_t *atc;
+ atc = &pme->atc[0];
+
+ for (d = pme->ndecompdim - 1; d >= 0; d--)
+ {
+ int n_d;
+ rvec *x_d;
+ real *param_d;
+
+ if (d == pme->ndecompdim - 1)
+ {
+ /* Outermost dimension: take the caller-supplied arrays */
+ n_d = homenr;
+ x_d = x + start;
+ param_d = data;
+ }
+ else
+ {
+ /* Inner dimension: forward the previous dimension's output */
+ n_d = pme->atc[d + 1].n;
+ x_d = atc->x;
+ param_d = atc->coefficient;
+ }
+ atc = &pme->atc[d];
+ atc->npd = n_d;
+ if (atc->npd > atc->pd_nalloc)
+ {
+ /* Grow the per-atom destination-rank index array */
+ atc->pd_nalloc = over_alloc_dd(atc->npd);
+ srenew(atc->pd, atc->pd_nalloc);
+ }
+ /* Assign each atom to a PME rank along this dimension */
+ pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
+ where();
+ /* Redistribute x (only once) and qA/c6A or qB/c6B */
+ if (DOMAINDECOMP(cr))
+ {
+ dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
+ }
+ }
+}
+
int gmx_pme_do(gmx_pme_t pme,
int start, int homenr,
rvec x[], rvec f[],
real *chargeA, real *chargeB,
+ real *c6A, real *c6B,
+ real *sigmaA, real *sigmaB,
matrix box, t_commrec *cr,
int maxshift_x, int maxshift_y,
t_nrnb *nrnb, gmx_wallcycle_t wcycle,
- matrix vir, real ewaldcoeff,
- real *energy, real lambda,
- real *dvdlambda, int flags)
+ matrix vir_q, real ewaldcoeff_q,
+ matrix vir_lj, real ewaldcoeff_lj,
+ real *energy_q, real *energy_lj,
+ real lambda_q, real lambda_lj,
+ real *dvdlambda_q, real *dvdlambda_lj,
+ int flags)
{
- int q, d, i, j, ntot, npme;
+ int d, i, j, k, ntot, npme, grid_index, max_grid_index;
int nx, ny, nz;
int n_d, local_ny;
pme_atomcomm_t *atc = NULL;
real *grid = NULL;
real *ptr;
rvec *x_d, *f_d;
- real *charge = NULL, *q_d;
- real energy_AB[2];
- matrix vir_AB[2];
+ real *coefficient = NULL;
+ real energy_AB[4];
+ matrix vir_AB[4];
+ real scale, lambda;
gmx_bool bClearF;
gmx_parallel_3dfft_t pfft_setup;
real * fftgrid;
t_complex * cfftgrid;
int thread;
+ gmx_bool bFirst, bDoSplines;
+ int fep_state;
+ int fep_states_lj = pme->bFEP_lj ? 2 : 1;
const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
atc->pd_nalloc = over_alloc_dd(atc->npd);
srenew(atc->pd, atc->pd_nalloc);
}
- atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+ for (d = pme->ndecompdim-1; d >= 0; d--)
+ {
+ atc = &pme->atc[d];
+ atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+ }
}
else
{
+ atc = &pme->atc[0];
/* This could be necessary for TPI */
pme->atc[0].n = homenr;
+ if (DOMAINDECOMP(cr))
+ {
+ pme_realloc_atomcomm_things(atc);
+ }
+ atc->x = x;
+ atc->f = f;
}
- for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
+ m_inv_ur0(box, pme->recipbox);
+ bFirst = TRUE;
+
+ /* For simplicity, we construct the splines for all particles if
+ * more than one PME calculations is needed. Some optimization
+ * could be done by keeping track of which atoms have splines
+ * constructed, and construct new splines on each pass for atoms
+ * that don't yet have them.
+ */
+
+ bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
+
+ /* We need a maximum of four separate PME calculations:
+ * grid_index=0: Coulomb PME with charges from state A
+ * grid_index=1: Coulomb PME with charges from state B
+ * grid_index=2: LJ PME with C6 from state A
+ * grid_index=3: LJ PME with C6 from state B
+ * For Lorentz-Berthelot combination rules, a separate loop is used to
+ * calculate all the terms
+ */
+
+ /* If we are doing LJ-PME with LB, we only do Q here */
+ max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
+
+ for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
{
- if (q == 0)
+ /* Check if we should do calculations at this grid_index
+ * If grid_index is odd we should be doing FEP
+ * If grid_index < 2 we should be doing electrostatic PME
+ * If grid_index >= 2 we should be doing LJ-PME
+ */
+ if ((grid_index < DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
+ (grid_index == 1 && !pme->bFEP_q))) ||
+ (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
+ (grid_index == 3 && !pme->bFEP_lj))))
{
- pmegrid = &pme->pmegridA;
- fftgrid = pme->fftgridA;
- cfftgrid = pme->cfftgridA;
- pfft_setup = pme->pfft_setupA;
- charge = chargeA+start;
+ continue;
}
- else
+ /* Unpack structure */
+ pmegrid = &pme->pmegrid[grid_index];
+ fftgrid = pme->fftgrid[grid_index];
+ cfftgrid = pme->cfftgrid[grid_index];
+ pfft_setup = pme->pfft_setup[grid_index];
+ switch (grid_index)
{
- pmegrid = &pme->pmegridB;
- fftgrid = pme->fftgridB;
- cfftgrid = pme->cfftgridB;
- pfft_setup = pme->pfft_setupB;
- charge = chargeB+start;
+ case 0: coefficient = chargeA + start; break;
+ case 1: coefficient = chargeB + start; break;
+ case 2: coefficient = c6A + start; break;
+ case 3: coefficient = c6B + start; break;
}
+
grid = pmegrid->grid.grid;
- /* Unpack structure */
+
if (debug)
{
- fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
+ fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
cr->nnodes, cr->nodeid);
fprintf(debug, "Grid = %p\n", (void*)grid);
if (grid == NULL)
}
where();
- m_inv_ur0(box, pme->recipbox);
-
if (pme->nnodes == 1)
{
- atc = &pme->atc[0];
- if (DOMAINDECOMP(cr))
- {
- atc->n = homenr;
- pme_realloc_atomcomm_things(atc);
- }
- atc->x = x;
- atc->q = charge;
- atc->f = f;
+ atc->coefficient = coefficient;
}
else
{
wallcycle_start(wcycle, ewcPME_REDISTXF);
- for (d = pme->ndecompdim-1; d >= 0; d--)
- {
- if (d == pme->ndecompdim-1)
- {
- n_d = homenr;
- x_d = x + start;
- q_d = charge;
- }
- else
- {
- n_d = pme->atc[d+1].n;
- x_d = atc->x;
- q_d = atc->q;
- }
- atc = &pme->atc[d];
- atc->npd = n_d;
- if (atc->npd > atc->pd_nalloc)
- {
- atc->pd_nalloc = over_alloc_dd(atc->npd);
- srenew(atc->pd, atc->pd_nalloc);
- }
- atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
- pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
- where();
-
- /* Redistribute x (only once) and qA or qB */
- if (DOMAINDECOMP(cr))
- {
- dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
- }
- else
- {
- pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
- }
- }
+ do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
where();
wallcycle_stop(wcycle, ewcPME_REDISTXF);
if (debug)
{
- fprintf(debug, "Node= %6d, pme local particles=%6d\n",
+ fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
cr->nodeid, atc->n);
}
- if (flags & GMX_PME_SPREAD_Q)
+ if (flags & GMX_PME_SPREAD)
{
wallcycle_start(wcycle, ewcPME_SPREADGATHER);
- /* Spread the charges on a grid */
- spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
+ /* Spread the coefficients on a grid */
+ spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
- if (q == 0)
+ if (bFirst)
{
inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
}
- inc_nrnb(nrnb, eNR_SPREADQBSP,
+ inc_nrnb(nrnb, eNR_SPREADBSP,
pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
if (!pme->bUseThreads)
#ifdef GMX_MPI
if (pme->nnodes > 1)
{
- gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
where();
}
#endif
- copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
+ copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
}
wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
/* solve in k-space for our local cells */
if (thread == 0)
{
- wallcycle_start(wcycle, ewcPME_SOLVE);
+ wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+ }
+ if (grid_index < DO_Q)
+ {
+ loop_count =
+ solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
+ box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+ bCalcEnerVir,
+ pme->nthread, thread);
}
- loop_count =
- solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
- box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
- bCalcEnerVir,
- pme->nthread, thread);
+ else
+ {
+ loop_count =
+ solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
+ box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+ bCalcEnerVir,
+ pme->nthread, thread);
+ }
+
if (thread == 0)
{
- wallcycle_stop(wcycle, ewcPME_SOLVE);
+ wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
where();
inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
}
inc_nrnb(nrnb, eNR_FFT, 2*npme);
}
+ /* Note: this wallcycle region is closed below
+ outside an OpenMP region, so take care if
+ refactoring code here. */
wallcycle_start(wcycle, ewcPME_SPREADGATHER);
}
- copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
+ copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
}
}
/* End of thread parallel section.
#ifdef GMX_MPI
if (pme->nnodes > 1)
{
- gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
}
#endif
where();
* atc->f is the actual force array, not a buffer,
* therefore we should not clear it.
*/
- bClearF = (q == 0 && PAR(cr));
+ lambda = grid_index < DO_Q ? lambda_q : lambda_lj;
+ bClearF = (bFirst && PAR(cr));
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
for (thread = 0; thread < pme->nthread; thread++)
{
gather_f_bsplines(pme, grid, bClearF, atc,
&atc->spline[thread],
- pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
+ pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
}
where();
inc_nrnb(nrnb, eNR_GATHERFBSP,
pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+ /* Note: this wallcycle region is opened above inside an OpenMP
+ region, so take care if refactoring code here. */
wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
}
/* This should only be called on the master thread
* and after the threads have synchronized.
*/
- get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
+ if (grid_index < 2)
+ {
+ get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
+ }
+ else
+ {
+ get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
+ }
}
- } /* of q-loop */
+ bFirst = FALSE;
+ } /* of grid_index-loop */
+
+ /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
+ * seven terms. */
+
+ if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
+ {
+ /* Loop over A- and B-state if we are doing FEP */
+ for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
+ {
+ real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
+ if (pme->nnodes == 1)
+ {
+ if (pme->lb_buf1 == NULL)
+ {
+ pme->lb_buf_nalloc = pme->atc[0].n;
+ snew(pme->lb_buf1, pme->lb_buf_nalloc);
+ }
+ pme->atc[0].coefficient = pme->lb_buf1;
+ switch (fep_state)
+ {
+ case 0:
+ local_c6 = c6A;
+ local_sigma = sigmaA;
+ break;
+ case 1:
+ local_c6 = c6B;
+ local_sigma = sigmaB;
+ break;
+ default:
+ gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+ }
+ }
+ else
+ {
+ atc = &pme->atc[0];
+ switch (fep_state)
+ {
+ case 0:
+ RedistC6 = c6A;
+ RedistSigma = sigmaA;
+ break;
+ case 1:
+ RedistC6 = c6B;
+ RedistSigma = sigmaB;
+ break;
+ default:
+ gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+ }
+ wallcycle_start(wcycle, ewcPME_REDISTXF);
+
+ do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
+ if (pme->lb_buf_nalloc < atc->n)
+ {
+ pme->lb_buf_nalloc = atc->nalloc;
+ srenew(pme->lb_buf1, pme->lb_buf_nalloc);
+ srenew(pme->lb_buf2, pme->lb_buf_nalloc);
+ }
+ local_c6 = pme->lb_buf1;
+ for (i = 0; i < atc->n; ++i)
+ {
+ local_c6[i] = atc->coefficient[i];
+ }
+ where();
+
+ do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
+ local_sigma = pme->lb_buf2;
+ for (i = 0; i < atc->n; ++i)
+ {
+ local_sigma[i] = atc->coefficient[i];
+ }
+ where();
+
+ wallcycle_stop(wcycle, ewcPME_REDISTXF);
+ }
+ calc_initial_lb_coeffs(pme, local_c6, local_sigma);
+
+ /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
+ for (grid_index = 2; grid_index < 9; ++grid_index)
+ {
+ /* Unpack structure */
+ pmegrid = &pme->pmegrid[grid_index];
+ fftgrid = pme->fftgrid[grid_index];
+ cfftgrid = pme->cfftgrid[grid_index];
+ pfft_setup = pme->pfft_setup[grid_index];
+ calc_next_lb_coeffs(pme, local_sigma);
+ grid = pmegrid->grid.grid;
+ where();
+
+ if (flags & GMX_PME_SPREAD)
+ {
+ wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+ /* Spread the c6 on a grid */
+ spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
+
+ if (bFirst)
+ {
+ inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
+ }
+
+ inc_nrnb(nrnb, eNR_SPREADBSP,
+ pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
+ if (pme->nthread == 1)
+ {
+ wrap_periodic_pmegrid(pme, grid);
+ /* sum contributions to local grid from other nodes */
+#ifdef GMX_MPI
+ if (pme->nnodes > 1)
+ {
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
+ where();
+ }
+#endif
+ copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
+ }
+ wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+ }
+ /*Here we start a large thread parallel region*/
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+ {
+ thread = gmx_omp_get_thread_num();
+ if (flags & GMX_PME_SOLVE)
+ {
+ /* do 3d-fft */
+ if (thread == 0)
+ {
+ wallcycle_start(wcycle, ewcPME_FFT);
+ }
+
+ gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
+ thread, wcycle);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle, ewcPME_FFT);
+ }
+ where();
+ }
+ }
+ bFirst = FALSE;
+ }
+ if (flags & GMX_PME_SOLVE)
+ {
+ /* solve in k-space for our local cells */
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+ {
+ int loop_count;
+ thread = gmx_omp_get_thread_num();
+ if (thread == 0)
+ {
+ wallcycle_start(wcycle, ewcLJPME);
+ }
+
+ loop_count =
+ solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
+ box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+ bCalcEnerVir,
+ pme->nthread, thread);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle, ewcLJPME);
+ where();
+ inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+ }
+ }
+ }
+
+ if (bCalcEnerVir)
+ {
+ /* This should only be called on the master thread and
+ * after the threads have synchronized.
+ */
+ get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
+ }
+
+ if (bCalcF)
+ {
+ bFirst = !(flags & GMX_PME_DO_COULOMB);
+ calc_initial_lb_coeffs(pme, local_c6, local_sigma);
+ for (grid_index = 8; grid_index >= 2; --grid_index)
+ {
+ /* Unpack structure */
+ pmegrid = &pme->pmegrid[grid_index];
+ fftgrid = pme->fftgrid[grid_index];
+ cfftgrid = pme->cfftgrid[grid_index];
+ pfft_setup = pme->pfft_setup[grid_index];
+ grid = pmegrid->grid.grid;
+ calc_next_lb_coeffs(pme, local_sigma);
+ where();
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+ {
+ thread = gmx_omp_get_thread_num();
+ /* do 3d-invfft */
+ if (thread == 0)
+ {
+ where();
+ wallcycle_start(wcycle, ewcPME_FFT);
+ }
+
+ gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
+ thread, wcycle);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle, ewcPME_FFT);
+
+ where();
+
+ if (pme->nodeid == 0)
+ {
+ ntot = pme->nkx*pme->nky*pme->nkz;
+ npme = ntot*log((real)ntot)/log(2.0);
+ inc_nrnb(nrnb, eNR_FFT, 2*npme);
+ }
+ wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+ }
+
+ copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
+
+ } /*#pragma omp parallel*/
+
+ /* distribute local grid to all nodes */
+#ifdef GMX_MPI
+ if (pme->nnodes > 1)
+ {
+ gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
+ }
+#endif
+ where();
+
+ unwrap_periodic_pmegrid(pme, grid);
+
+ /* interpolate forces for our local atoms */
+ where();
+ bClearF = (bFirst && PAR(cr));
+ scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
+ scale *= lb_scale_factor[grid_index-2];
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ for (thread = 0; thread < pme->nthread; thread++)
+ {
+ gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
+ &pme->atc[0].spline[thread],
+ scale);
+ }
+ where();
+
+ inc_nrnb(nrnb, eNR_GATHERFBSP,
+ pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+ wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+
+ bFirst = FALSE;
+ } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
+ } /* if (bCalcF) */
+ } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
+ } /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
if (bCalcF && pme->nnodes > 1)
{
dd_pmeredist_f(pme, atc, n_d, f_d,
d == pme->ndecompdim-1 && pme->bPPnode);
}
- else
- {
- pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
- }
}
wallcycle_stop(wcycle, ewcPME_REDISTXF);
if (bCalcEnerVir)
{
- if (!pme->bFEP)
+ if (flags & GMX_PME_DO_COULOMB)
{
- *energy = energy_AB[0];
- m_add(vir, vir_AB[0], vir);
+ if (!pme->bFEP_q)
+ {
+ *energy_q = energy_AB[0];
+ m_add(vir_q, vir_AB[0], vir_q);
+ }
+ else
+ {
+ *energy_q = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
+ *dvdlambda_q += energy_AB[1] - energy_AB[0];
+ for (i = 0; i < DIM; i++)
+ {
+ for (j = 0; j < DIM; j++)
+ {
+ vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
+ lambda_q*vir_AB[1][i][j];
+ }
+ }
+ }
+ if (debug)
+ {
+ fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
+ }
}
else
{
- *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
- *dvdlambda += energy_AB[1] - energy_AB[0];
- for (i = 0; i < DIM; i++)
+ *energy_q = 0;
+ }
+
+ if (flags & GMX_PME_DO_LJ)
+ {
+ if (!pme->bFEP_lj)
+ {
+ *energy_lj = energy_AB[2];
+ m_add(vir_lj, vir_AB[2], vir_lj);
+ }
+ else
{
- for (j = 0; j < DIM; j++)
+ *energy_lj = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
+ *dvdlambda_lj += energy_AB[3] - energy_AB[2];
+ for (i = 0; i < DIM; i++)
{
- vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
- lambda*vir_AB[1][i][j];
+ for (j = 0; j < DIM; j++)
+ {
+ vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
+ }
}
}
+ if (debug)
+ {
+ fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
+ }
+ }
+ else
+ {
+ *energy_lj = 0;
}
}
- else
- {
- *energy = 0;
- }
-
- if (debug)
- {
- fprintf(debug, "PME mesh energy: %g\n", *energy);
- }
-
return 0;
}