-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
*
- *
- * This source code is part of
- *
- * G R O M A C S
- *
- * GROningen MAchine for Chemical Simulations
- *
- * VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
*
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
*
- * And Hey:
- * GROwing Monsters And Cloning Shrimps
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
+
+#include "config.h"
+#include <assert.h>
#include <math.h>
#include <string.h>
-#include <assert.h>
-#include "sysstuff.h"
-#include "typedefs.h"
-#include "vec.h"
-#include "maths.h"
-#include "macros.h"
-#include "smalloc.h"
-#include "macros.h"
-#include "gmx_fatal.h"
-#include "gmx_fatal_collective.h"
-#include "physics.h"
-#include "force.h"
-#include "tables.h"
-#include "nonbonded.h"
-#include "invblock.h"
-#include "names.h"
-#include "network.h"
-#include "pbc.h"
-#include "ns.h"
-#include "mshift.h"
-#include "txtdump.h"
-#include "coulomb.h"
-#include "md_support.h"
-#include "md_logging.h"
-#include "domdec.h"
-#include "partdec.h"
-#include "qmmm.h"
-#include "copyrite.h"
-#include "mtop_util.h"
-#include "nbnxn_search.h"
-#include "nbnxn_atomdata.h"
-#include "nbnxn_consts.h"
-#include "statutil.h"
-#include "gmx_omp_nthreads.h"
-#include "gmx_detect_hardware.h"
-
-#ifdef _MSC_VER
-/* MSVC definition for __cpuid() */
-#include <intrin.h>
-#endif
-#include "types/nbnxn_cuda_types_ext.h"
-#include "gpu_utils.h"
-#include "nbnxn_cuda_data_mgmt.h"
-#include "pmalloc_cuda.h"
+#include "gromacs/legacyheaders/copyrite.h"
+#include "gromacs/legacyheaders/coulomb.h"
+#include "gromacs/legacyheaders/domdec.h"
+#include "gromacs/legacyheaders/force.h"
+#include "gromacs/legacyheaders/gmx_detect_hardware.h"
+#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+#include "gromacs/legacyheaders/gpu_utils.h"
+#include "gromacs/legacyheaders/inputrec.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/md_logging.h"
+#include "gromacs/legacyheaders/md_support.h"
+#include "gromacs/legacyheaders/names.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nonbonded.h"
+#include "gromacs/legacyheaders/ns.h"
+#include "gromacs/legacyheaders/pmalloc_cuda.h"
+#include "gromacs/legacyheaders/qmmm.h"
+#include "gromacs/legacyheaders/tables.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_atomdata.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/nbnxn_simd.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
t_forcerec *mk_forcerec(void)
{
return nbfp;
}
+/* Build the table of C6 coefficients that the non-bonded kernels use to
+ * correct the LJ-PME reciprocal-space contribution inside the cut-off.
+ * The values follow the combination rule used on the reciprocal grid:
+ * geometric by default, or Lorentz-Berthelot when fr->ljpme_combination_rule
+ * is eljpmeLB and none of c6, c12i, c12j is numerically zero.
+ * Entries are stored at the C6 positions of an nbfp-style 2*atnr*atnr
+ * array and pre-scaled by 6; the C12 slots are never written here.
+ * The caller owns the returned array.
+ */
+static real *make_ljpme_c6grid(const gmx_ffparams_t *idef, t_forcerec *fr)
+{
+    int i, j, k, atnr;
+    real c6, c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
+    real *grid;
+
+    /* For LJ-PME simulations, we correct the energies with the reciprocal space
+     * inside of the cut-off. To do this the non-bonded kernels need to have
+     * access to the C6-values used on the reciprocal grid in pme.c
+     */
+
+    atnr = idef->atnr;
+    snew(grid, 2*atnr*atnr);
+    for (i = k = 0; (i < atnr); i++)
+    {
+        for (j = 0; (j < atnr); j++, k++)
+        {
+            /* Diagonal (same-type) LJ parameters of types i and j */
+            c6i = idef->iparams[i*(atnr+1)].lj.c6;
+            c12i = idef->iparams[i*(atnr+1)].lj.c12;
+            c6j = idef->iparams[j*(atnr+1)].lj.c6;
+            c12j = idef->iparams[j*(atnr+1)].lj.c12;
+            c6 = sqrt(c6i * c6j);
+            if (fr->ljpme_combination_rule == eljpmeLB
+                && !gmx_numzero(c6) && !gmx_numzero(c12i) && !gmx_numzero(c12j))
+            {
+                /* Lorentz-Berthelot: redo the combination in
+                 * sigma/epsilon form (arithmetic sigma, geometric epsilon).
+                 */
+                sigmai = pow(c12i / c6i, 1.0/6.0);
+                sigmaj = pow(c12j / c6j, 1.0/6.0);
+                epsi = c6i * c6i / c12i;
+                epsj = c6j * c6j / c12j;
+                c6 = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 6);
+            }
+            /* Store the elements at the same relative positions as C6 in nbfp in order
+             * to simplify access in the kernels
+             */
+            grid[2*(atnr*i+j)] = c6*6.0;
+        }
+    }
+    return grid;
+}
+
+/* Build a full nbfp-style 2*atnr*atnr matrix of pair LJ parameters
+ * (C6 pre-scaled by 6, C12 by 12) from the diagonal atom-type
+ * parameters in idef, using the requested combination rule:
+ * geometric averaging by default, or Lorentz-Berthelot (arithmetic
+ * sigma, geometric epsilon) when comb_rule == eCOMB_ARITHMETIC and
+ * the geometric c6 and c12 are both non-zero.
+ * The caller owns the returned array (free with sfree).
+ */
+static real *mk_nbfp_combination_rule(const gmx_ffparams_t *idef, int comb_rule)
+{
+    real *nbfp;
+    int i, j, k, atnr; /* NOTE(review): k is declared but never used here */
+    real c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
+    real c6, c12;
+
+    atnr = idef->atnr;
+    snew(nbfp, 2*atnr*atnr);
+    for (i = 0; i < atnr; ++i)
+    {
+        for (j = 0; j < atnr; ++j)
+        {
+            /* Diagonal (same-type) LJ parameters of types i and j */
+            c6i = idef->iparams[i*(atnr+1)].lj.c6;
+            c12i = idef->iparams[i*(atnr+1)].lj.c12;
+            c6j = idef->iparams[j*(atnr+1)].lj.c6;
+            c12j = idef->iparams[j*(atnr+1)].lj.c12;
+            c6 = sqrt(c6i * c6j);
+            c12 = sqrt(c12i * c12j);
+            if (comb_rule == eCOMB_ARITHMETIC
+                && !gmx_numzero(c6) && !gmx_numzero(c12))
+            {
+                /* Lorentz-Berthelot: redo the combination in
+                 * sigma/epsilon form (arithmetic sigma, geometric epsilon).
+                 */
+                sigmai = pow(c12i / c6i, 1.0/6.0);
+                sigmaj = pow(c12j / c6j, 1.0/6.0);
+                epsi = c6i * c6i / c12i;
+                epsj = c6j * c6j / c12j;
+                c6 = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 6);
+                c12 = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 12);
+            }
+            /* Scale by 6 and 12 to match the nbfp kernel convention */
+            C6(nbfp, atnr, i, j) = c6*6.0;
+            C12(nbfp, atnr, i, j) = c12*12.0;
+        }
+    }
+    return nbfp;
+}
+
/* This routine sets fr->solvent_opt to the most common solvent in the
* system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in
* the fr->solvent_type array with the correct type (or esolNO).
int cginfo,
int *cg_sp)
{
- const t_blocka * excl;
+ const t_blocka *excl;
t_atom *atom;
int j, k;
int j0, j1, nj;
gmx_bool perturbed;
gmx_bool has_vdw[4];
gmx_bool match;
- real tmp_charge[4];
- int tmp_vdwtype[4];
+ real tmp_charge[4] = { 0.0 }; /* init to zero to make gcc4.8 happy */
+ int tmp_vdwtype[4] = { 0 }; /* init to zero to make gcc4.8 happy */
int tjA;
gmx_bool qm;
solvent_parameters_t *solvent_parameters;
bestsol = esolNO;
}
-#ifdef DISABLE_WATER_NLIST
- bestsol = esolNO;
-#endif
-
fr->nWatMol = 0;
for (mb = 0; mb < mtop->nmolblock; mb++)
{
static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop,
t_forcerec *fr, gmx_bool bNoSolvOpt,
+ gmx_bool *bFEP_NonBonded,
gmx_bool *bExcl_IntraCGAll_InterCGNone)
{
const t_block *cgs;
int *a_con;
int ftype;
int ia;
- gmx_bool bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ;
+ gmx_bool bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ, bHavePerturbedAtoms;
ncg_tot = ncg_mtop(mtop);
snew(cginfo_mb, mtop->nmolblock);
}
}
+ *bFEP_NonBonded = FALSE;
*bExcl_IntraCGAll_InterCGNone = TRUE;
excl_nalloc = 10;
/* bExclIntraAll: all intra cg interactions excluded
* bExclInter: any inter cg interactions excluded
*/
- bExclIntraAll = TRUE;
- bExclInter = FALSE;
- bHaveVDW = FALSE;
- bHaveQ = FALSE;
+ bExclIntraAll = TRUE;
+ bExclInter = FALSE;
+ bHaveVDW = FALSE;
+ bHaveQ = FALSE;
+ bHavePerturbedAtoms = FALSE;
for (ai = a0; ai < a1; ai++)
{
/* Check VDW and electrostatic interactions */
bHaveQ = bHaveQ || (molt->atoms.atom[ai].q != 0 ||
molt->atoms.atom[ai].qB != 0);
+ bHavePerturbedAtoms = bHavePerturbedAtoms || (PERTURBED(molt->atoms.atom[ai]) != 0);
+
/* Clear the exclusion list for atom ai */
for (aj = a0; aj < a1; aj++)
{
{
SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
}
+ if (bHavePerturbedAtoms && fr->efep != efepNO)
+ {
+ SET_CGINFO_FEP(cginfo[cgm+cg]);
+ *bFEP_NonBonded = TRUE;
+ }
/* Store the charge group size */
SET_CGINFO_NATOMS(cginfo[cgm+cg], a1-a0);
static void set_chargesum(FILE *log, t_forcerec *fr, const gmx_mtop_t *mtop)
{
- double qsum, q2sum, q;
+ /*This now calculates sum for q and c6*/
+ double qsum, q2sum, q, c6sum, c6;
int mb, nmol, i;
const t_atoms *atoms;
- qsum = 0;
- q2sum = 0;
+ qsum = 0;
+ q2sum = 0;
+ c6sum = 0;
for (mb = 0; mb < mtop->nmolblock; mb++)
{
nmol = mtop->molblock[mb].nmol;
atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
for (i = 0; i < atoms->nr; i++)
{
- q = atoms->atom[i].q;
- qsum += nmol*q;
- q2sum += nmol*q*q;
+ q = atoms->atom[i].q;
+ qsum += nmol*q;
+ q2sum += nmol*q*q;
+ c6 = mtop->ffparams.iparams[atoms->atom[i].type*(mtop->ffparams.atnr+1)].lj.c6;
+ c6sum += nmol*c6;
}
}
- fr->qsum[0] = qsum;
- fr->q2sum[0] = q2sum;
+ fr->qsum[0] = qsum;
+ fr->q2sum[0] = q2sum;
+ fr->c6sum[0] = c6sum;
+
if (fr->efep != efepNO)
{
- qsum = 0;
- q2sum = 0;
+ qsum = 0;
+ q2sum = 0;
+ c6sum = 0;
for (mb = 0; mb < mtop->nmolblock; mb++)
{
nmol = mtop->molblock[mb].nmol;
atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
for (i = 0; i < atoms->nr; i++)
{
- q = atoms->atom[i].qB;
- qsum += nmol*q;
- q2sum += nmol*q*q;
+ q = atoms->atom[i].qB;
+ qsum += nmol*q;
+ q2sum += nmol*q*q;
+ c6 = mtop->ffparams.iparams[atoms->atom[i].typeB*(mtop->ffparams.atnr+1)].lj.c6;
+ c6sum += nmol*c6;
}
- fr->qsum[1] = qsum;
- fr->q2sum[1] = q2sum;
+ fr->qsum[1] = qsum;
+ fr->q2sum[1] = q2sum;
+ fr->c6sum[1] = c6sum;
}
}
else
{
- fr->qsum[1] = fr->qsum[0];
- fr->q2sum[1] = fr->q2sum[0];
+ fr->qsum[1] = fr->qsum[0];
+ fr->q2sum[1] = fr->q2sum[0];
+ fr->c6sum[1] = fr->c6sum[0];
}
if (log)
{
const t_atoms *atoms, *atoms_tpi;
const t_blocka *excl;
int mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, n, nexcl, q;
-#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)
- long long int npair, npair_ij, tmpi, tmpj;
-#else
- double npair, npair_ij, tmpi, tmpj;
-#endif
+ gmx_int64_t npair, npair_ij, tmpi, tmpj;
double csix, ctwelve;
int ntp, *typecount;
gmx_bool bBHAM;
real *nbfp;
+ real *nbfp_comb = NULL;
ntp = fr->ntype;
bBHAM = fr->bBHAM;
nbfp = fr->nbfp;
+ /* For LJ-PME, we want to correct for the difference between the
+ * actual C6 values and the C6 values used by the LJ-PME based on
+ * combination rules. */
+
+ if (EVDW_PME(fr->vdwtype))
+ {
+ nbfp_comb = mk_nbfp_combination_rule(&mtop->ffparams,
+ (fr->ljpme_combination_rule == eljpmeLB) ? eCOMB_ARITHMETIC : eCOMB_GEOMETRIC);
+ for (tpi = 0; tpi < ntp; ++tpi)
+ {
+ for (tpj = 0; tpj < ntp; ++tpj)
+ {
+ C6(nbfp_comb, ntp, tpi, tpj) =
+ C6(nbfp, ntp, tpi, tpj) - C6(nbfp_comb, ntp, tpi, tpj);
+ C12(nbfp_comb, ntp, tpi, tpj) = C12(nbfp, ntp, tpi, tpj);
+ }
+ }
+ nbfp = nbfp_comb;
+ }
for (q = 0; q < (fr->efep == efepNO ? 1 : 2); q++)
{
csix = 0;
{
/* Count the types so we avoid natoms^2 operations */
snew(typecount, ntp);
- for (mb = 0; mb < mtop->nmolblock; mb++)
- {
- nmol = mtop->molblock[mb].nmol;
- atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
- for (i = 0; i < atoms->nr; i++)
- {
- if (q == 0)
- {
- tpi = atoms->atom[i].type;
- }
- else
- {
- tpi = atoms->atom[i].typeB;
- }
- typecount[tpi] += nmol;
- }
- }
+ gmx_mtop_count_atomtypes(mtop, q, typecount);
+
for (tpi = 0; tpi < ntp; tpi++)
{
for (tpj = tpi; tpj < ntp; tpj++)
fr->avcsix[q] = csix;
fr->avctwelve[q] = ctwelve;
}
+
+ if (EVDW_PME(fr->vdwtype))
+ {
+ sfree(nbfp_comb);
+ }
+
if (fplog != NULL)
{
if (fr->eDispCorr == edispcAllEner ||
if (bAllvsAll && fp && MASTER(cr))
{
- fprintf(fp, "\nUsing accelerated all-vs-all kernels.\n\n");
+ fprintf(fp, "\nUsing SIMD all-vs-all kernels.\n\n");
}
return bAllvsAll;
}
+/* Return TRUE when the non-bonded settings in ir are supported by the
+ * accelerated nbnxn kernels (GPU kernels when bGPU, SIMD kernels
+ * otherwise); otherwise warn via md_print_warn and return FALSE.
+ * The only combination rejected here is LJ-PME with the
+ * Lorentz-Berthelot combination rule on the non-GPU (SIMD) path.
+ * NOTE(review): inside the branch bGPU is necessarily FALSE (the
+ * condition starts with !bGPU), so the two ternaries below always
+ * select the "SIMD kernels"/"plain-C kernels" strings.
+ */
+gmx_bool nbnxn_acceleration_supported(FILE *fplog,
+                                      const t_commrec *cr,
+                                      const t_inputrec *ir,
+                                      gmx_bool bGPU)
+{
+    if (!bGPU && (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB))
+    {
+        md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with %s, falling back to %s\n",
+                      bGPU ? "GPUs" : "SIMD kernels",
+                      bGPU ? "CPU only" : "plain-C kernels");
+        return FALSE;
+    }
+
+    return TRUE;
+}
+
+
static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
int *kernel_type,
int *ewald_excl)
*kernel_type = nbnxnk4xN_SIMD_4xN;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
- /* We expect the 2xNN kernels to be faster in most cases */
*kernel_type = nbnxnk4xN_SIMD_2xNN;
#endif
-#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
- if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
+#if defined GMX_NBNXN_SIMD_2XNN && defined GMX_NBNXN_SIMD_4XN
+ /* We need to choose if we want 2x(N+N) or 4xN kernels.
+ * Currently this is based on the SIMD acceleration choice,
+ * but it might be better to decide this at runtime based on CPU.
+ *
+ * 4xN calculates more (zero) interactions, but has less pair-search
+ * work and much better kernel instruction scheduling.
+ *
+ * Up till now we have only seen that on Intel Sandy/Ivy Bridge,
+ * which doesn't have FMA, both the analytical and tabulated Ewald
+ * kernels have similar pair rates for 4x8 and 2x(4+4), so we choose
+ * 2x(4+4) because it results in significantly fewer pairs.
+ * For RF, the raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+ * 10% with HT, 50% without HT. As we currently don't detect the actual
+ * use of HT, use 4x8 to avoid a potential performance hit.
+ * On Intel Haswell 4x8 is always faster.
+ */
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
+
+#ifndef GMX_SIMD_HAVE_FMA
+ if (EEL_PME_EWALD(ir->coulombtype) ||
+ EVDW_PME(ir->vdwtype))
{
- /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
- * 10% with HT, 50% without HT, but extra zeros interactions
- * can compensate. As we currently don't detect the actual use
- * of HT, switch to 4x8 to avoid a potential performance hit.
+ /* We have Ewald kernels without FMA (Intel Sandy/Ivy Bridge).
+ * There are enough instructions to make 2x(4+4) efficient.
*/
- *kernel_type = nbnxnk4xN_SIMD_4xN;
+ *kernel_type = nbnxnk4xN_SIMD_2xNN;
}
#endif
+#endif /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */
+
+
if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
{
#ifdef GMX_NBNXN_SIMD_4XN
#endif
}
- /* Analytical Ewald exclusion correction is only an option in the
- * x86 SIMD kernel. This is faster in single precision
- * on Bulldozer and slightly faster on Sandy Bridge.
+ /* Analytical Ewald exclusion correction is only an option in
+ * the SIMD kernel.
+ * Since table lookup's don't parallelize with SIMD, analytical
+ * will probably always be faster for a SIMD width of 8 or more.
+     * With FMA analytical is sometimes faster for a width of 4 as well.
+ * On BlueGene/Q, this is faster regardless of precision.
+ * In single precision, this is faster on Bulldozer.
*/
-#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
+#if GMX_SIMD_REAL_WIDTH >= 8 || \
+ (GMX_SIMD_REAL_WIDTH >= 4 && defined GMX_SIMD_HAVE_FMA && !defined GMX_DOUBLE) || \
+ defined GMX_SIMD_IBM_QPX
*ewald_excl = ewaldexclAnalytical;
#endif
if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
}
}
-#endif /* GMX_X86_SSE2 */
+#endif /* GMX_NBNXN_SIMD */
}
case nbnxnk4xN_SIMD_4xN:
case nbnxnk4xN_SIMD_2xNN:
#ifdef GMX_NBNXN_SIMD
-#ifdef GMX_X86_SSE2
- /* We have x86 SSE2 compatible SIMD */
-#ifdef GMX_X86_AVX_128_FMA
- returnvalue = "AVX-128-FMA";
-#else
-#if defined GMX_X86_AVX_256 || defined __AVX__
- /* x86 SIMD intrinsics can be converted to SSE or AVX depending
- * on compiler flags. As we use nearly identical intrinsics,
- * compiling for AVX without an AVX macros effectively results
- * in AVX kernels.
- * For gcc we check for __AVX__
- * At least a check for icc should be added (if there is a macro)
- */
-#if defined GMX_X86_AVX_256 && !defined GMX_NBNXN_HALF_WIDTH_SIMD
- returnvalue = "AVX-256";
-#else
- returnvalue = "AVX-128";
-#endif
-#else
-#ifdef GMX_X86_SSE4_1
- returnvalue = "SSE4.1";
+#if defined GMX_SIMD_X86_SSE2
+ returnvalue = "SSE2";
+#elif defined GMX_SIMD_X86_SSE4_1
+ returnvalue = "SSE4.1";
+#elif defined GMX_SIMD_X86_AVX_128_FMA
+ returnvalue = "AVX_128_FMA";
+#elif defined GMX_SIMD_X86_AVX_256
+ returnvalue = "AVX_256";
+#elif defined GMX_SIMD_X86_AVX2_256
+ returnvalue = "AVX2_256";
#else
- returnvalue = "SSE2";
-#endif
-#endif
+ returnvalue = "SIMD";
#endif
-#else /* GMX_X86_SSE2 */
- /* not GMX_X86_SSE2, but other SIMD */
- returnvalue = "SIMD";
-#endif /* GMX_X86_SSE2 */
-#else /* GMX_NBNXN_SIMD */
+#else /* GMX_NBNXN_SIMD */
returnvalue = "not available";
#endif /* GMX_NBNXN_SIMD */
break;
static void pick_nbnxn_kernel(FILE *fp,
const t_commrec *cr,
- gmx_bool use_cpu_acceleration,
+ gmx_bool use_simd_kernels,
gmx_bool bUseGPU,
gmx_bool bEmulateGPU,
const t_inputrec *ir,
if (*kernel_type == nbnxnkNotSet)
{
- if (use_cpu_acceleration)
+ /* LJ PME with LB combination rule does 7 mesh operations.
+     * This is so slow that we don't compile SIMD non-bonded kernels for that.
+ */
+ if (use_simd_kernels &&
+ nbnxn_acceleration_supported(fp, cr, ir, FALSE))
{
pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl);
}
lookup_nbnxn_kernel_name(*kernel_type),
nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
nbnxn_kernel_to_cj_size(*kernel_type));
+
+ if (nbnxnk4x4_PlainC == *kernel_type ||
+ nbnxnk8x8x8_PlainC == *kernel_type)
+ {
+ md_print_warn(cr, fp,
+ "WARNING: Using the slow %s kernels. This should\n"
+ "not happen during routine usage on supported platforms.\n\n",
+ lookup_nbnxn_kernel_name(*kernel_type));
+ }
}
}
const gmx_hw_info_t *hwinfo,
gmx_bool bDoNonbonded,
gmx_bool *bUseGPU,
- gmx_bool *bEmulateGPU)
+ gmx_bool *bEmulateGPU,
+ const gmx_gpu_opt_t *gpu_opt)
{
gmx_bool bEmulateGPUEnvVarSet;
char gpu_err_str[STRLEN];
* Note that you should freezing the system as otherwise it will explode.
*/
*bEmulateGPU = (bEmulateGPUEnvVarSet ||
- (!bDoNonbonded && hwinfo->bCanUseGPU));
+ (!bDoNonbonded &&
+ gpu_opt->ncuda_dev_use > 0));
/* Enable GPU mode when GPUs are available or no GPU emulation is requested.
*/
- if (hwinfo->bCanUseGPU && !(*bEmulateGPU))
+ if (gpu_opt->ncuda_dev_use > 0 && !(*bEmulateGPU))
{
/* Each PP node will use the intra-node id-th device from the
* list of detected/selected GPUs. */
- if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
+ if (!init_gpu(cr->rank_pp_intranode, gpu_err_str,
+ &hwinfo->gpu_info, gpu_opt))
{
/* At this point the init should never fail as we made sure that
* we have all the GPUs we need. If it still does, we'll bail. */
- gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
+ gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
cr->nodeid,
- get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
+ get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
+ cr->rank_pp_intranode),
gpu_err_str);
}
if (bUsesSimpleTables)
{
- /* With a spacing of 0.0005 we are at the force summation accuracy
- * for the SSE kernels for "normal" atomistic simulations.
+ /* Get the Ewald table spacing based on Coulomb and/or LJ
+ * Ewald coefficients and rtol.
*/
- ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
- ic->rcoulomb);
+ ic->tabq_scale = ewald_spline3_table_scale(ic);
maxr = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb;
ic->tabq_size = (int)(maxr*ic->tabq_scale) + 2;
sfree_aligned(ic->tabq_coul_F);
sfree_aligned(ic->tabq_coul_V);
- /* Create the original table data in FDV0 */
- snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
- snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
- snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
- table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
- ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff);
+ sfree_aligned(ic->tabq_vdw_FDV0);
+ sfree_aligned(ic->tabq_vdw_F);
+ sfree_aligned(ic->tabq_vdw_V);
+
+ if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
+ {
+ /* Create the original table data in FDV0 */
+ snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
+ snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
+ snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
+ table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
+ ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff_q, v_q_ewald_lr);
+ }
+
+ if (EVDW_PME(ic->vdwtype))
+ {
+ snew_aligned(ic->tabq_vdw_FDV0, ic->tabq_size*4, 32);
+ snew_aligned(ic->tabq_vdw_F, ic->tabq_size, 32);
+ snew_aligned(ic->tabq_vdw_V, ic->tabq_size, 32);
+ table_spline3_fill_ewald_lr(ic->tabq_vdw_F, ic->tabq_vdw_V, ic->tabq_vdw_FDV0,
+ ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff_lj, v_lj_ewald_lr);
+ }
}
void init_interaction_const_tables(FILE *fp,
{
real spacing;
- if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
+ if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype) || EVDW_PME(ic->vdwtype))
{
init_ewald_f_table(ic, bUsesSimpleTables, rtab);
}
}
-void init_interaction_const(FILE *fp,
- interaction_const_t **interaction_const,
- const t_forcerec *fr,
- real rtab)
+/* Reset the force-switch shift constants to zero, i.e. no force
+ * switching and no potential shift contribution from these terms.
+ */
+static void clear_force_switch_constants(shift_consts_t *sc)
+{
+    sc->c2 = 0;
+    sc->c3 = 0;
+    sc->cpot = 0;
+}
+
+/* Compute the coefficients that switch a r^-p force smoothly to zero
+ * between r = rsw and the cut-off r = rc. The power p is absorbed
+ * into the coefficients (see the derivation below).
+ */
+static void force_switch_constants(real p,
+                                   real rsw, real rc,
+                                   shift_consts_t *sc)
+{
+    /* Here we determine the coefficient for shifting the force to zero
+     * between distance rsw and the cut-off rc.
+     * For a potential of r^-p, we have force p*r^-(p+1).
+     * But to save flops we absorb p in the coefficient.
+     * Thus we get (the polynomial acts in the shifted distance r - rsw,
+     * as the cpot expression below shows):
+     * force/p = r^-(p+1) + c2*(r - rsw)^2 + c3*(r - rsw)^3
+     * potential = r^-p + c2/3*(r - rsw)^3 + c3/4*(r - rsw)^4 + cpot
+     */
+    sc->c2 = ((p + 1)*rsw - (p + 4)*rc)/(pow(rc, p + 2)*pow(rc - rsw, 2));
+    sc->c3 = -((p + 1)*rsw - (p + 3)*rc)/(pow(rc, p + 2)*pow(rc - rsw, 3));
+    sc->cpot = -pow(rc, -p) + p*sc->c2/3*pow(rc - rsw, 3) + p*sc->c3/4*pow(rc - rsw, 4);
+}
+
+/* Compute the quintic-spline coefficients for the potential-switch
+ * modifier, which scales the potential smoothly from 1 at r = rsw
+ * down to 0 at the cut-off r = rc.
+ */
+static void potential_switch_constants(real rsw, real rc,
+                                       switch_consts_t *sc)
+{
+    /* The switch function sw is 1 at r = rsw and 0 at r = rc.
+     * Its first and second derivatives are zero at both ends
+     * (check: c3+c4+c5 scaled by (rc-rsw)^k sums to -1, derivative to 0).
+     * With x = max(r - rsw, 0):
+     * sw  = 1 + c3*x^3 + c4*x^4 + c5*x^5
+     * dsw = 3*c3*x^2 + 4*c4*x^3 + 5*c5*x^4
+     * force = force*sw - potential*dsw   (product rule on V*sw)
+     * potential *= sw
+     */
+    sc->c3 = -10*pow(rc - rsw, -3);
+    sc->c4 = 15*pow(rc - rsw, -4);
+    sc->c5 = -6*pow(rc - rsw, -5);
+}
+
+static void
+init_interaction_const(FILE *fp,
+ const t_commrec gmx_unused *cr,
+ interaction_const_t **interaction_const,
+ const t_forcerec *fr,
+ real rtab)
{
interaction_const_t *ic;
gmx_bool bUsesSimpleTables = TRUE;
snew_aligned(ic->tabq_coul_F, 16, 32);
snew_aligned(ic->tabq_coul_V, 16, 32);
- ic->rlist = fr->rlist;
- ic->rlistlong = fr->rlistlong;
+ ic->rlist = fr->rlist;
+ ic->rlistlong = fr->rlistlong;
/* Lennard-Jones */
- ic->rvdw = fr->rvdw;
- if (fr->vdw_modifier == eintmodPOTSHIFT)
- {
- ic->sh_invrc6 = pow(ic->rvdw, -6.0);
- }
- else
- {
- ic->sh_invrc6 = 0;
+ ic->vdwtype = fr->vdwtype;
+ ic->vdw_modifier = fr->vdw_modifier;
+ ic->rvdw = fr->rvdw;
+ ic->rvdw_switch = fr->rvdw_switch;
+ ic->ewaldcoeff_lj = fr->ewaldcoeff_lj;
+ ic->ljpme_comb_rule = fr->ljpme_combination_rule;
+ ic->sh_lj_ewald = 0;
+ clear_force_switch_constants(&ic->dispersion_shift);
+ clear_force_switch_constants(&ic->repulsion_shift);
+
+ switch (ic->vdw_modifier)
+ {
+ case eintmodPOTSHIFT:
+ /* Only shift the potential, don't touch the force */
+ ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0);
+ ic->repulsion_shift.cpot = -pow(ic->rvdw, -12.0);
+ if (EVDW_PME(ic->vdwtype))
+ {
+ real crc2;
+
+ crc2 = sqr(ic->ewaldcoeff_lj*ic->rvdw);
+ ic->sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);
+ }
+ break;
+ case eintmodFORCESWITCH:
+ /* Switch the force, switch and shift the potential */
+ force_switch_constants(6.0, ic->rvdw_switch, ic->rvdw,
+ &ic->dispersion_shift);
+ force_switch_constants(12.0, ic->rvdw_switch, ic->rvdw,
+ &ic->repulsion_shift);
+ break;
+ case eintmodPOTSWITCH:
+ /* Switch the potential and force */
+ potential_switch_constants(ic->rvdw_switch, ic->rvdw,
+ &ic->vdw_switch);
+ break;
+ case eintmodNONE:
+ case eintmodEXACTCUTOFF:
+ /* Nothing to do here */
+ break;
+ default:
+ gmx_incons("unimplemented potential modifier");
}
+ ic->sh_invrc6 = -ic->dispersion_shift.cpot;
+
/* Electrostatics */
- ic->eeltype = fr->eeltype;
- ic->rcoulomb = fr->rcoulomb;
- ic->epsilon_r = fr->epsilon_r;
- ic->epsfac = fr->epsfac;
+ ic->eeltype = fr->eeltype;
+ ic->coulomb_modifier = fr->coulomb_modifier;
+ ic->rcoulomb = fr->rcoulomb;
+ ic->epsilon_r = fr->epsilon_r;
+ ic->epsfac = fr->epsfac;
+ ic->ewaldcoeff_q = fr->ewaldcoeff_q;
- /* Ewald */
- ic->ewaldcoeff = fr->ewaldcoeff;
if (fr->coulomb_modifier == eintmodPOTSHIFT)
{
- ic->sh_ewald = gmx_erfc(ic->ewaldcoeff*ic->rcoulomb);
+ ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb);
}
else
{
if (fp != NULL)
{
- fprintf(fp, "Potential shift: LJ r^-12: %.3f r^-6 %.3f",
- sqr(ic->sh_invrc6), ic->sh_invrc6);
+ real dispersion_shift;
+
+ dispersion_shift = ic->dispersion_shift.cpot;
+ if (EVDW_PME(ic->vdwtype))
+ {
+ dispersion_shift -= ic->sh_lj_ewald;
+ }
+ fprintf(fp, "Potential shift: LJ r^-12: %.3e r^-6: %.3e",
+ ic->repulsion_shift.cpot, dispersion_shift);
+
if (ic->eeltype == eelCUT)
{
- fprintf(fp, ", Coulomb %.3f", ic->c_rf);
+ fprintf(fp, ", Coulomb %.e", -ic->c_rf);
}
else if (EEL_PME(ic->eeltype))
{
- fprintf(fp, ", Ewald %.3e", ic->sh_ewald);
+ fprintf(fp, ", Ewald %.3e", -ic->sh_ewald);
}
fprintf(fp, "\n");
}
if (fr->nbv != NULL && fr->nbv->bUseGPU)
{
nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+
+ /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+ * also sharing texture references. To keep the code simple, we don't
+ * treat texture references as shared resources, but this means that
+ * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+ * Hence, to ensure that the non-bonded kernels don't start before all
+ * texture binding operations are finished, we need to wait for all ranks
+ * to arrive here before continuing.
+ *
+ * Note that we could omit this barrier if GPUs are not shared (or
+ * texture objects are used), but as this is initialization code, there
+ * is not point in complicating things.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
}
bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
static void init_nb_verlet(FILE *fp,
nonbonded_verlet_t **nb_verlet,
+ gmx_bool bFEP_NonBonded,
const t_inputrec *ir,
const t_forcerec *fr,
const t_commrec *cr,
pick_nbnxn_resources(cr, fr->hwinfo,
fr->bNonbonded,
&nbv->bUseGPU,
- &bEmulateGPU);
+ &bEmulateGPU,
+ fr->gpu_opt);
nbv->nbs = NULL;
if (i == 0) /* local */
{
- pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration,
+ pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
nbv->bUseGPU, bEmulateGPU, ir,
&nbv->grp[i].kernel_type,
&nbv->grp[i].ewald_excl,
if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
{
/* Use GPU for local, select a CPU kernel for non-local */
- pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration,
+ pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
FALSE, FALSE, ir,
&nbv->grp[i].kernel_type,
&nbv->grp[i].ewald_excl,
/* init the NxN GPU data; the last argument tells whether we'll have
* both local and non-local NB calculation on GPU */
nbnxn_cuda_init(fp, &nbv->cu_nbv,
- &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
+ &fr->hwinfo->gpu_info, fr->gpu_opt,
+ cr->rank_pp_intranode,
(nbv->ngrp > 1) && !bHybridGPURun);
if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
nbnxn_init_search(&nbv->nbs,
DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
- gmx_omp_nthreads_get(emntNonbonded));
+ bFEP_NonBonded,
+ gmx_omp_nthreads_get(emntPairsearch));
for (i = 0; i < nbv->ngrp; i++)
{
if (i == 0 ||
nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
{
+ gmx_bool bSimpleList;
+ int enbnxninitcombrule;
+
+ bSimpleList = nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type);
+
+ if (bSimpleList && (fr->vdwtype == evdwCUT && (fr->vdw_modifier == eintmodNONE || fr->vdw_modifier == eintmodPOTSHIFT)))
+ {
+ /* Plain LJ cut-off: we can optimize with combination rules */
+ enbnxninitcombrule = enbnxninitcombruleDETECT;
+ }
+ else if (fr->vdwtype == evdwPME)
+ {
+ /* LJ-PME: we need to use a combination rule for the grid */
+ if (fr->ljpme_combination_rule == eljpmeGEOM)
+ {
+ enbnxninitcombrule = enbnxninitcombruleGEOM;
+ }
+ else
+ {
+ enbnxninitcombrule = enbnxninitcombruleLB;
+ }
+ }
+ else
+ {
+ /* We use a full combination matrix: no rule required */
+ enbnxninitcombrule = enbnxninitcombruleNONE;
+ }
+
+
snew(nbv->grp[i].nbat, 1);
nbnxn_atomdata_init(fp,
nbv->grp[i].nbat,
nbv->grp[i].kernel_type,
+ enbnxninitcombrule,
fr->ntype, fr->nbfp,
ir->opts.ngener,
- nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
+ bSimpleList ? gmx_omp_nthreads_get(emntNonbonded) : 1,
nb_alloc, nb_free);
}
else
real rtab;
char *env;
double dbl;
- rvec box_size;
const t_block *cgs;
gmx_bool bGenericKernelOnly;
- gmx_bool bTab, bSep14tab, bNormalnblists;
+ gmx_bool bMakeTables, bMakeSeparate14Table, bSomeNormalNbListsAreInUse;
+ gmx_bool bFEP_NonBonded;
t_nblists *nbl;
int *nm_ind, egp_flags;
* In mdrun, hwinfo has already been set before calling init_forcerec.
* Here we ignore GPUs, as tools will not use them anyhow.
*/
- fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE, FALSE, NULL);
+ fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE);
}
- /* By default we turn acceleration on, but it might be turned off further down... */
- fr->use_cpu_acceleration = TRUE;
+ /* By default we turn SIMD kernels on, but it might be turned off further down... */
+ fr->use_simd_kernels = TRUE;
fr->bDomDec = DOMAINDECOMP(cr);
bNoSolvOpt = TRUE;
}
- if ( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
+ if ( (getenv("GMX_DISABLE_SIMD_KERNELS") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
{
- fr->use_cpu_acceleration = FALSE;
+ fr->use_simd_kernels = FALSE;
if (fp != NULL)
{
fprintf(fp,
- "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n"
- "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
+ "\nFound environment variable GMX_DISABLE_SIMD_KERNELS.\n"
+ "Disabling the usage of any SIMD-specific kernel routines (e.g. SSE2/SSE4.1/AVX).\n\n");
}
}
fr->AllvsAll_workgb = NULL;
/* All-vs-all kernels have not been implemented in 4.6, and
- * the SIMD group kernels are also buggy in this case. Non-accelerated
+ * the SIMD group kernels are also buggy in this case. Non-SIMD
* group kernels are OK. See Redmine #1249. */
if (fr->bAllvsAll)
{
fr->bAllvsAll = FALSE;
- fr->use_cpu_acceleration = FALSE;
+ fr->use_simd_kernels = FALSE;
if (fp != NULL)
{
fprintf(fp,
fr->bGrid = (ir->ns_type == ensGRID);
fr->ePBC = ir->ePBC;
+ if (fr->cutoff_scheme == ecutsGROUP)
+ {
+ const char *note = "NOTE: This file uses the deprecated 'group' cutoff_scheme. This will be\n"
+ "removed in a future release when 'verlet' supports all interaction forms.\n";
+
+ if (MASTER(cr))
+ {
+ fprintf(stderr, "\n%s\n", note);
+ }
+ if (fp != NULL)
+ {
+ fprintf(fp, "\n%s\n", note);
+ }
+ }
+
/* Determine if we will do PBC for distances in bonded interactions */
if (fr->ePBC == epbcNONE)
{
fr->rc_scaling = ir->refcoord_scaling;
copy_rvec(ir->posres_com, fr->posres_com);
copy_rvec(ir->posres_comB, fr->posres_comB);
- fr->rlist = cutoff_inf(ir->rlist);
- fr->rlistlong = cutoff_inf(ir->rlistlong);
- fr->eeltype = ir->coulombtype;
- fr->vdwtype = ir->vdwtype;
+ fr->rlist = cutoff_inf(ir->rlist);
+ fr->rlistlong = cutoff_inf(ir->rlistlong);
+ fr->eeltype = ir->coulombtype;
+ fr->vdwtype = ir->vdwtype;
+ fr->ljpme_combination_rule = ir->ljpme_combination_rule;
fr->coulomb_modifier = ir->coulomb_modifier;
fr->vdw_modifier = ir->vdw_modifier;
fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
}
break;
+ case evdwPME:
+ fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LJEWALD;
+ break;
case evdwSWITCH:
case evdwSHIFT:
fr->nbkernel_elec_modifier = fr->coulomb_modifier;
fr->nbkernel_vdw_modifier = fr->vdw_modifier;
+ fr->rvdw = cutoff_inf(ir->rvdw);
+ fr->rvdw_switch = ir->rvdw_switch;
+ fr->rcoulomb = cutoff_inf(ir->rcoulomb);
+ fr->rcoulomb_switch = ir->rcoulomb_switch;
+
fr->bTwinRange = fr->rlistlong > fr->rlist;
fr->bEwald = (EEL_PME(fr->eeltype) || fr->eeltype == eelEWALD);
if (ir->cutoff_scheme == ecutsGROUP)
{
- fr->bvdwtab = (fr->vdwtype != evdwCUT ||
- !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS));
+ fr->bvdwtab = ((fr->vdwtype != evdwCUT || !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
+ && !EVDW_PME(fr->vdwtype));
/* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
fr->bcoultab = !(fr->eeltype == eelCUT ||
fr->eeltype == eelEWALD ||
/* If the user absolutely wants different switch/shift settings for coul/vdw, it is likely
* going to be faster to tabulate the interaction than calling the generic kernel.
+ * However, if generic kernels have been requested we keep things analytically.
*/
- if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH && fr->nbkernel_vdw_modifier == eintmodPOTSWITCH)
+ if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH &&
+ fr->nbkernel_vdw_modifier == eintmodPOTSWITCH &&
+ bGenericKernelOnly == FALSE)
{
if ((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
{
fr->bcoultab = TRUE;
+ /* Once we tabulate electrostatics, we can use the switch function for LJ,
+ * which would otherwise need two tables.
+ */
}
}
else if ((fr->nbkernel_elec_modifier == eintmodPOTSHIFT && fr->nbkernel_vdw_modifier == eintmodPOTSHIFT) ||
fr->nbkernel_elec_modifier == eintmodEXACTCUTOFF &&
(fr->nbkernel_vdw_modifier == eintmodPOTSWITCH || fr->nbkernel_vdw_modifier == eintmodPOTSHIFT))))
{
- if (fr->rcoulomb != fr->rvdw)
+ if ((fr->rcoulomb != fr->rvdw) && (bGenericKernelOnly == FALSE))
{
fr->bcoultab = TRUE;
}
}
+ if (fr->nbkernel_elec_modifier == eintmodFORCESWITCH)
+ {
+ fr->bcoultab = TRUE;
+ }
+ if (fr->nbkernel_vdw_modifier == eintmodFORCESWITCH)
+ {
+ fr->bvdwtab = TRUE;
+ }
+
if (getenv("GMX_REQUIRE_TABLES"))
{
fr->bvdwtab = TRUE;
{
if (fp)
{
- fprintf(fp, "Will do PME sum in reciprocal space.\n");
+ fprintf(fp, "Will do PME sum in reciprocal space for electrostatic interactions.\n");
}
if (ir->coulombtype == eelP3M_AD)
{
please_cite(fp, "In-Chul99a");
}
}
- fr->ewaldcoeff = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
+ fr->ewaldcoeff_q = calc_ewaldcoeff_q(ir->rcoulomb, ir->ewald_rtol);
init_ewald_tab(&(fr->ewald_table), ir, fp);
if (fp)
{
fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for Ewald\n",
- 1/fr->ewaldcoeff);
+ 1/fr->ewaldcoeff_q);
+ }
+ }
+
+ if (EVDW_PME(ir->vdwtype))
+ {
+ if (fp)
+ {
+ fprintf(fp, "Will do PME sum in reciprocal space for LJ dispersion interactions.\n");
+ }
+ please_cite(fp, "Essmann95a");
+ fr->ewaldcoeff_lj = calc_ewaldcoeff_lj(ir->rvdw, ir->ewald_rtol_lj);
+ if (fp)
+ {
+ fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for LJ Ewald\n",
+ 1/fr->ewaldcoeff_lj);
}
}
fr->epsilon_r = ir->epsilon_r;
fr->epsilon_rf = ir->epsilon_rf;
fr->fudgeQQ = mtop->ffparams.fudgeQQ;
- fr->rcoulomb_switch = ir->rcoulomb_switch;
- fr->rcoulomb = cutoff_inf(ir->rcoulomb);
/* Parameters for generalized RF */
fr->zsquare = 0.0;
{
init_generalized_rf(fp, mtop, ir, fr);
}
- else if (fr->eeltype == eelSHIFT)
- {
- for (m = 0; (m < DIM); m++)
- {
- box_size[m] = box[m][m];
- }
-
- if ((fr->eeltype == eelSHIFT && fr->rcoulomb > fr->rcoulomb_switch))
- {
- set_shift_consts(fr->rcoulomb_switch, fr->rcoulomb, box_size);
- }
- }
- fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
+ fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype) ||
gmx_mtop_ftype_count(mtop, F_POSRES) > 0 ||
gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 ||
IR_ELEC_FIELD(*ir) ||
{
fr->ntype = mtop->ffparams.atnr;
fr->nbfp = mk_nbfp(&mtop->ffparams, fr->bBHAM);
+ if (EVDW_PME(fr->vdwtype))
+ {
+ fr->ljpme_c6grid = make_ljpme_c6grid(&mtop->ffparams, fr);
+ }
}
/* Copy the energy group exclusions */
fr->egp_flags = ir->opts.egp_flags;
/* Van der Waals stuff */
- fr->rvdw = cutoff_inf(ir->rvdw);
- fr->rvdw_switch = ir->rvdw_switch;
if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM)
{
if (fr->rvdw_switch >= fr->rvdw)
}
}
+ if (fr->bBHAM && EVDW_PME(fr->vdwtype))
+ {
+ gmx_fatal(FARGS, "LJ PME not supported with Buckingham");
+ }
+
if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
{
gmx_fatal(FARGS, "Switch/shift interaction not supported with Buckingham");
}
+ if (fr->bBHAM && fr->cutoff_scheme == ecutsVERLET)
+ {
+ gmx_fatal(FARGS, "Verlet cutoff-scheme is not supported with Buckingham");
+ }
+
if (fp)
{
fprintf(fp, "Cut-off's: NS: %g Coulomb: %g %s: %g\n",
fr->gbtabr = 100;
fr->gbtab = make_gb_table(oenv, fr);
- init_gb(&fr->born, cr, fr, ir, mtop, ir->gb_algorithm);
+ init_gb(&fr->born, fr, ir, mtop, ir->gb_algorithm);
/* Copy local gb data (for dd, this is done in dd_partition_system) */
if (!DOMAINDECOMP(cr))
&fr->kappa, &fr->k_rf, &fr->c_rf);
}
+ /*This now calculates sum for q and c6*/
set_chargesum(fp, fr, mtop);
/* if we are using LR electrostatics, and they are tabulated,
* A little unnecessary to make both vdw and coul tables sometimes,
* but what the heck... */
- bTab = fr->bcoultab || fr->bvdwtab || fr->bEwald;
+ bMakeTables = fr->bcoultab || fr->bvdwtab || fr->bEwald ||
+ (ir->eDispCorr != edispcNO && ir_vdw_switched(ir));
- bSep14tab = ((!bTab || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
- fr->bBHAM || fr->bEwald) &&
- (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
- gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
- gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
+ bMakeSeparate14Table = ((!bMakeTables || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
+ fr->coulomb_modifier != eintmodNONE ||
+ fr->vdw_modifier != eintmodNONE ||
+ fr->bBHAM || fr->bEwald) &&
+ (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
negp_pp = ir->opts.ngener - ir->nwall;
negptable = 0;
- if (!bTab)
+ if (!bMakeTables)
{
- bNormalnblists = TRUE;
- fr->nnblists = 1;
+ bSomeNormalNbListsAreInUse = TRUE;
+ fr->nnblists = 1;
}
else
{
- bNormalnblists = (ir->eDispCorr != edispcNO);
+ bSomeNormalNbListsAreInUse = (ir->eDispCorr != edispcNO);
for (egi = 0; egi < negp_pp; egi++)
{
for (egj = egi; egj < negp_pp; egj++)
}
else
{
- bNormalnblists = TRUE;
+ bSomeNormalNbListsAreInUse = TRUE;
}
}
}
}
- if (bNormalnblists)
+ if (bSomeNormalNbListsAreInUse)
{
fr->nnblists = negptable + 1;
}
*/
rtab = ir->rlistlong + ir->tabext;
- if (bTab)
+ if (bMakeTables)
{
/* make tables for ordinary interactions */
- if (bNormalnblists)
+ if (bSomeNormalNbListsAreInUse)
{
make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
if (ir->adress)
{
make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[fr->nnblists/2]);
}
- if (!bSep14tab)
+ if (!bMakeSeparate14Table)
{
fr->tab14 = fr->nblists[0].table_elec_vdw;
}
}
}
}
- if (bSep14tab)
+ else if ((fr->eDispCorr != edispcNO) &&
+ ((fr->vdw_modifier == eintmodPOTSWITCH) ||
+ (fr->vdw_modifier == eintmodFORCESWITCH) ||
+ (fr->vdw_modifier == eintmodPOTSHIFT)))
+ {
+ /* Tables might not be used for the potential modifier interactions per se, but
+ * we still need them to evaluate switch/shift dispersion corrections in this case.
+ */
+ make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
+ }
+
+ if (bMakeSeparate14Table)
{
/* generate extra tables with plain Coulomb for 1-4 interactions only */
fr->tab14 = make_tables(fp, oenv, fr, MASTER(cr), tabpfn, rtab,
/* Set all the static charge group info */
fr->cginfo_mb = init_cginfo_mb(fp, mtop, fr, bNoSolvOpt,
+ &bFEP_NonBonded,
&fr->bExcl_IntraCGAll_InterCGNone);
if (DOMAINDECOMP(cr))
{
if (!DOMAINDECOMP(cr))
{
- /* When using particle decomposition, the effect of the second argument,
- * which sets fr->hcg, is corrected later in do_md and init_em.
- */
forcerec_set_ranges(fr, ncg_mtop(mtop), ncg_mtop(mtop),
mtop->natoms, mtop->natoms, mtop->natoms);
}
gmx_fatal(FARGS, "With Verlet lists rcoulomb and rvdw should be identical");
}
- init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
+ init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
}
/* fr->ic is used both by verlet and group kernels (to some extent) now */
- init_interaction_const(fp, &fr->ic, fr, rtab);
+ init_interaction_const(fp, cr, &fr->ic, fr, rtab);
+
if (ir->eDispCorr != edispcNO)
{
calc_enervirdiff(fp, ir->eDispCorr, fr);
fflush(fp);
}
-void forcerec_set_excl_load(t_forcerec *fr,
- const gmx_localtop_t *top, const t_commrec *cr)
+void forcerec_set_excl_load(t_forcerec *fr,
+ const gmx_localtop_t *top)
{
const int *ind, *a;
int t, i, j, ntot, n, ntarget;
- if (cr != NULL && PARTDECOMP(cr))
- {
- /* No OpenMP with particle decomposition */
- pd_at_range(cr,
- &fr->excl_load[0],
- &fr->excl_load[1]);
-
- return;
- }
-
ind = top->excls.index;
a = top->excls.a;
fr->excl_load[t] = i;
}
}
+
+
+/* Frees GPU memory and destroys the CUDA context.
+ *
+ * Note that this function needs to be called even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
+ */
+void free_gpu_resources(const t_forcerec *fr,
+ const t_commrec *cr)
+{
+ gmx_bool bIsPPrankUsingGPU;
+ char gpu_err_str[STRLEN];
+
+ /* Only a particle-particle (PP) rank that actually ran non-bonded
+ * work on a GPU has anything to deallocate; PME-only ranks (and runs
+ * without GPU-accelerated non-bondeds) fall straight through. */
+ bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr && fr->nbv && fr->nbv->bUseGPU;
+
+ if (bIsPPrankUsingGPU)
+ {
+ /* free nbnxn data in GPU memory */
+ nbnxn_cuda_free(fr->nbv->cu_nbv);
+
+ /* With tMPI we need to wait for all ranks to finish deallocation before
+ * destroying the context in free_gpu() as some ranks may be sharing a
+ * GPU and context.
+ * Note: only PP ranks need to free GPU resources, so it is safe to
+ * not call the barrier on PME ranks.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
+
+ /* uninitialize GPU (by destroying the context); on failure we only
+ * warn rather than abort, since the simulation output is already safe
+ * at this point and we are merely cleaning up. */
+ if (!free_gpu(gpu_err_str))
+ {
+ gmx_warning("On rank %d failed to free GPU #%d: %s",
+ cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+ }
+ }
+}