2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
47 #include "gromacs/math/utilities.h"
51 #include "gmx_fatal.h"
52 #include "gmx_fatal_collective.h"
56 #include "nonbonded.h"
65 #include "md_support.h"
66 #include "md_logging.h"
70 #include "mtop_util.h"
71 #include "nbnxn_simd.h"
72 #include "nbnxn_search.h"
73 #include "nbnxn_atomdata.h"
74 #include "nbnxn_consts.h"
75 #include "gmx_omp_nthreads.h"
76 #include "gmx_detect_hardware.h"
80 /* MSVC definition for __cpuid() */
84 #include "types/nbnxn_cuda_types_ext.h"
85 #include "gpu_utils.h"
86 #include "nbnxn_cuda_data_mgmt.h"
87 #include "pmalloc_cuda.h"
/* Allocate and return a fresh t_forcerec structure (body elided in this view;
 * presumably snew-allocated and zero-initialized — confirm against full source). */
89 t_forcerec *mk_forcerec(void)
/* Debug helper: print the atnr x atnr nonbonded parameter matrix to fp.
 * For Buckingham (bBHAM) prints a/b/c per type pair, otherwise LJ c6/c12.
 * The stored c, c6 and c12 values include the 6.0/12.0 derivative
 * prefactors (see mk_nbfp), which are divided out again for printing.
 */
99 static void pr_nbfp(FILE *fp, real *nbfp, gmx_bool bBHAM, int atnr)
103 for (i = 0; (i < atnr); i++)
105 for (j = 0; (j < atnr); j++)
107 fprintf(fp, "%2d - %2d", i, j);
110 fprintf(fp, " a=%10g, b=%10g, c=%10g\n", BHAMA(nbfp, atnr, i, j),
111 BHAMB(nbfp, atnr, i, j), BHAMC(nbfp, atnr, i, j)/6.0);
115 fprintf(fp, " c6=%10g, c12=%10g\n", C6(nbfp, atnr, i, j)/6.0,
116 C12(nbfp, atnr, i, j)/12.0);
/* Build the flat nonbonded parameter matrix from the force-field params.
 * Buckingham: 3 reals per type pair (a, b, c*6.0); LJ: 2 reals per pair
 * (c6*6.0, c12*12.0). The 6.0/12.0 derivative prefactors are baked into
 * the table here so kernels can use the values directly.
 * Returns a newly snew-allocated array; caller owns it.
 */
123 static real *mk_nbfp(const gmx_ffparams_t *idef, gmx_bool bBHAM)
131 snew(nbfp, 3*atnr*atnr);
132 for (i = k = 0; (i < atnr); i++)
134 for (j = 0; (j < atnr); j++, k++)
136 BHAMA(nbfp, atnr, i, j) = idef->iparams[k].bham.a;
137 BHAMB(nbfp, atnr, i, j) = idef->iparams[k].bham.b;
138 /* nbfp now includes the 6.0 derivative prefactor */
139 BHAMC(nbfp, atnr, i, j) = idef->iparams[k].bham.c*6.0;
145 snew(nbfp, 2*atnr*atnr);
146 for (i = k = 0; (i < atnr); i++)
148 for (j = 0; (j < atnr); j++, k++)
150 /* nbfp now includes the 6.0/12.0 derivative prefactors */
151 C6(nbfp, atnr, i, j) = idef->iparams[k].lj.c6*6.0;
152 C12(nbfp, atnr, i, j) = idef->iparams[k].lj.c12*12.0;
/* Build an LJ parameter matrix from a combination rule instead of the
 * pair-specific parameters, using only the diagonal (i,i) entries:
 * geometric mean of c6/c12 by default; for eCOMB_ARITHMETIC
 * (Lorentz-Berthelot) sigma and epsilon are derived from the diagonal
 * c6/c12 and recombined — only when both c6 and c12 are non-zero, since
 * sigma/epsilon are undefined otherwise. As in mk_nbfp(), the stored
 * values carry the 6.0/12.0 derivative prefactors.
 */
160 static real *mk_nbfp_combination_rule(const gmx_ffparams_t *idef, int comb_rule)
164 real c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
168 snew(nbfp, 2*atnr*atnr);
169 for (i = 0; i < atnr; ++i)
171 for (j = 0; j < atnr; ++j)
173 c6i = idef->iparams[i*(atnr+1)].lj.c6;
174 c12i = idef->iparams[i*(atnr+1)].lj.c12;
175 c6j = idef->iparams[j*(atnr+1)].lj.c6;
176 c12j = idef->iparams[j*(atnr+1)].lj.c12;
177 c6 = sqrt(c6i * c6j);
178 c12 = sqrt(c12i * c12j);
179 if (comb_rule == eCOMB_ARITHMETIC
180 && !gmx_numzero(c6) && !gmx_numzero(c12))
182 sigmai = pow(c12i / c6i, 1.0/6.0);
183 sigmaj = pow(c12j / c6j, 1.0/6.0);
184 epsi = c6i * c6i / c12i;
185 epsj = c6j * c6j / c12j;
186 c6 = epsi * epsj * pow(0.5*(sigmai+sigmaj), 6);
187 c12 = epsi * epsj * pow(0.5*(sigmai+sigmaj), 12);
189 C6(nbfp, atnr, i, j) = c6*6.0;
190 C12(nbfp, atnr, i, j) = c12*12.0;
196 /* This routine sets fr->solvent_opt to the most common solvent in the
197 * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in
198 * the fr->solvent_type array with the correct type (or esolNO).
200 * Charge groups that fulfill the conditions but are not identical to the
201 * most common one will be marked as esolNO in the solvent_type array.
203 * TIP3p is identical to SPC for these purposes, so we call it
204 * SPC in the arrays (Apologies to Bill Jorgensen ;-)
206 * NOTE: QM particle should not
207 * become an optimized solvent. Not even if there is only one charge
217 } solvent_parameters_t;
/* Classify one charge group as a candidate optimized solvent (SPC-like
 * 3-site or TIP4P-like 4-site water). On a match with a previously seen
 * solvent type, that type's count is incremented by nmol and its index is
 * written to *cg_sp; a new type is appended to *solvent_parameters_p when
 * the group fulfills the SPC/TIP4P charge and VdW requirements. Groups
 * with inter-cg exclusions, incomplete intra-cg exclusions, QM atoms,
 * perturbed (free-energy) atoms, or the wrong atom count are rejected and
 * stay marked as non-optimized.
 */
220 check_solvent_cg(const gmx_moltype_t *molt,
223 const unsigned char *qm_grpnr,
224 const t_grps *qm_grps,
226 int *n_solvent_parameters,
227 solvent_parameters_t **solvent_parameters_p,
231 const t_blocka * excl;
242 solvent_parameters_t *solvent_parameters;
244 /* We use a list with parameters for each solvent type.
245 * Every time we discover a new molecule that fulfills the basic
246 * conditions for a solvent we compare with the previous entries
247 * in these lists. If the parameters are the same we just increment
248 * the counter for that type, and otherwise we create a new type
249 * based on the current molecule.
251 * Once we've finished going through all molecules we check which
252 * solvent is most common, and mark all those molecules while we
253 * clear the flag on all others.
256 solvent_parameters = *solvent_parameters_p;
258 /* Mark the cg first as non optimized */
261 /* Check if this cg has no exclusions with atoms in other charge groups
262 * and all atoms inside the charge group excluded.
263 * We only have 3 or 4 atom solvent loops.
265 if (GET_CGINFO_EXCL_INTER(cginfo) ||
266 !GET_CGINFO_EXCL_INTRA(cginfo))
271 /* Get the indices of the first atom in this charge group */
272 j0 = molt->cgs.index[cg0];
273 j1 = molt->cgs.index[cg0+1];
275 /* Number of atoms in our molecule */
281 "Moltype '%s': there are %d atoms in this charge group\n",
285 /* Check if it could be an SPC (3 atoms) or TIP4p (4) water,
288 if (nj < 3 || nj > 4)
293 /* Check if we are doing QM on this group */
295 if (qm_grpnr != NULL)
297 for (j = j0; j < j1 && !qm; j++)
299 qm = (qm_grpnr[j] < qm_grps->nr - 1);
302 /* Cannot use solvent optimization with QM */
308 atom = molt->atoms.atom;
310 /* Still looks like a solvent, time to check parameters */
312 /* If it is perturbed (free energy) we can't use the solvent loops,
313 * so then we just skip to the next molecule.
317 for (j = j0; j < j1 && !perturbed; j++)
319 perturbed = PERTURBED(atom[j]);
327 /* Now it's only a question if the VdW and charge parameters
328 * are OK. Before doing the check we compare and see if they are
329 * identical to a possible previous solvent type.
330 * First we assign the current types and charges.
332 for (j = 0; j < nj; j++)
334 tmp_vdwtype[j] = atom[j0+j].type;
335 tmp_charge[j] = atom[j0+j].q;
338 /* Does it match any previous solvent type? */
339 for (k = 0; k < *n_solvent_parameters; k++)
344 /* We can only match SPC with 3 atoms and TIP4p with 4 atoms */
345 if ( (solvent_parameters[k].model == esolSPC && nj != 3) ||
346 (solvent_parameters[k].model == esolTIP4P && nj != 4) )
351 /* Check that types & charges match for all atoms in molecule */
352 for (j = 0; j < nj && match == TRUE; j++)
354 if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
358 if (tmp_charge[j] != solvent_parameters[k].charge[j])
365 /* Congratulations! We have a matched solvent.
366 * Flag it with this type for later processing.
369 solvent_parameters[k].count += nmol;
371 /* We are done with this charge group */
376 /* If we get here, we have a tentative new solvent type.
377 * Before we add it we must check that it fulfills the requirements
378 * of the solvent optimized loops. First determine which atoms have
381 for (j = 0; j < nj; j++)
384 tjA = tmp_vdwtype[j];
386 /* Go through all other tpes and see if any have non-zero
387 * VdW parameters when combined with this one.
389 for (k = 0; k < fr->ntype && (has_vdw[j] == FALSE); k++)
391 /* We already checked that the atoms weren't perturbed,
392 * so we only need to check state A now.
396 has_vdw[j] = (has_vdw[j] ||
397 (BHAMA(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
398 (BHAMB(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
399 (BHAMC(fr->nbfp, fr->ntype, tjA, k) != 0.0));
404 has_vdw[j] = (has_vdw[j] ||
405 (C6(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
406 (C12(fr->nbfp, fr->ntype, tjA, k) != 0.0));
411 /* Now we know all we need to make the final check and assignment. */
415 * For this we require that all atoms have charge,
416 * the charges on atom 2 & 3 should be the same, and only
417 * atom 1 might have VdW.
419 if (has_vdw[1] == FALSE &&
420 has_vdw[2] == FALSE &&
421 tmp_charge[0] != 0 &&
422 tmp_charge[1] != 0 &&
423 tmp_charge[2] == tmp_charge[1])
425 srenew(solvent_parameters, *n_solvent_parameters+1);
426 solvent_parameters[*n_solvent_parameters].model = esolSPC;
427 solvent_parameters[*n_solvent_parameters].count = nmol;
428 for (k = 0; k < 3; k++)
430 solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
431 solvent_parameters[*n_solvent_parameters].charge[k] = tmp_charge[k];
434 *cg_sp = *n_solvent_parameters;
435 (*n_solvent_parameters)++;
440 /* Or could it be a TIP4P?
441 * For this we require that atoms 2,3,4 have charge, but not atom 1.
442 * Only atom 1 might have VdW.
444 if (has_vdw[1] == FALSE &&
445 has_vdw[2] == FALSE &&
446 has_vdw[3] == FALSE &&
447 tmp_charge[0] == 0 &&
448 tmp_charge[1] != 0 &&
449 tmp_charge[2] == tmp_charge[1] &&
452 srenew(solvent_parameters, *n_solvent_parameters+1);
453 solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
454 solvent_parameters[*n_solvent_parameters].count = nmol;
455 for (k = 0; k < 4; k++)
457 solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
458 solvent_parameters[*n_solvent_parameters].charge[k] = tmp_charge[k];
461 *cg_sp = *n_solvent_parameters;
462 (*n_solvent_parameters)++;
466 *solvent_parameters_p = solvent_parameters;
/* Scan all molecule blocks (per molecule, to catch QMMM particles) with
 * check_solvent_cg(), then pick the solvent type with the highest count
 * and flag all matching charge groups with it in cginfo; all other groups
 * are set to esolNO. The winning model is stored in fr->solvent_opt and
 * reported to fp when one was found.
 */
470 check_solvent(FILE * fp,
471 const gmx_mtop_t * mtop,
473 cginfo_mb_t *cginfo_mb)
476 const t_block * mols;
477 const gmx_moltype_t *molt;
478 int mb, mol, cg_mol, at_offset, cg_offset, am, cgm, i, nmol_ch, nmol;
479 int n_solvent_parameters;
480 solvent_parameters_t *solvent_parameters;
486 fprintf(debug, "Going to determine what solvent types we have.\n");
491 n_solvent_parameters = 0;
492 solvent_parameters = NULL;
493 /* Allocate temporary array for solvent type */
494 snew(cg_sp, mtop->nmolblock);
498 for (mb = 0; mb < mtop->nmolblock; mb++)
500 molt = &mtop->moltype[mtop->molblock[mb].type];
502 /* Here we have to loop over all individual molecules
503 * because we need to check for QMMM particles.
505 snew(cg_sp[mb], cginfo_mb[mb].cg_mod);
506 nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr;
507 nmol = mtop->molblock[mb].nmol/nmol_ch;
508 for (mol = 0; mol < nmol_ch; mol++)
511 am = mol*cgs->index[cgs->nr];
512 for (cg_mol = 0; cg_mol < cgs->nr; cg_mol++)
514 check_solvent_cg(molt, cg_mol, nmol,
515 mtop->groups.grpnr[egcQMMM] ?
516 mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
517 &mtop->groups.grps[egcQMMM],
519 &n_solvent_parameters, &solvent_parameters,
520 cginfo_mb[mb].cginfo[cgm+cg_mol],
521 &cg_sp[mb][cgm+cg_mol]);
524 cg_offset += cgs->nr;
525 at_offset += cgs->index[cgs->nr];
528 /* Puh! We finished going through all charge groups.
529 * Now find the most common solvent model.
532 /* Most common solvent this far */
534 for (i = 0; i < n_solvent_parameters; i++)
537 solvent_parameters[i].count > solvent_parameters[bestsp].count)
545 bestsol = solvent_parameters[bestsp].model;
552 #ifdef DISABLE_WATER_NLIST
557 for (mb = 0; mb < mtop->nmolblock; mb++)
559 cgs = &mtop->moltype[mtop->molblock[mb].type].cgs;
560 nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
561 for (i = 0; i < cginfo_mb[mb].cg_mod; i++)
563 if (cg_sp[mb][i] == bestsp)
565 SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], bestsol);
570 SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], esolNO);
577 if (bestsol != esolNO && fp != NULL)
579 fprintf(fp, "\nEnabling %s-like water optimization for %d molecules.\n\n",
581 solvent_parameters[bestsp].count);
584 sfree(solvent_parameters);
585 fr->solvent_opt = bestsol;
589 acNONE = 0, acCONSTRAINT, acSETTLE
/* Build per-molblock charge-group info (energy-group id, intra/inter-cg
 * exclusion flags, constraint/SETTLE flags, has-VdW/has-charge flags and
 * group size). When all molecules in a block are identical (same energy
 * and QMMM group assignments) only one molecule's worth of cginfo is
 * stored (cg_mod) to save memory. Also runs the solvent optimization
 * check (unless disabled) and reports via *bExcl_IntraCGAll_InterCGNone
 * whether every cg has all-intra and no inter exclusions.
 */
592 static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop,
593 t_forcerec *fr, gmx_bool bNoSolvOpt,
594 gmx_bool *bExcl_IntraCGAll_InterCGNone)
597 const t_blocka *excl;
598 const gmx_moltype_t *molt;
599 const gmx_molblock_t *molb;
600 cginfo_mb_t *cginfo_mb;
603 int cg_offset, a_offset, cgm, am;
604 int mb, m, ncg_tot, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
608 gmx_bool bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ;
610 ncg_tot = ncg_mtop(mtop);
611 snew(cginfo_mb, mtop->nmolblock);
613 snew(type_VDW, fr->ntype);
614 for (ai = 0; ai < fr->ntype; ai++)
616 type_VDW[ai] = FALSE;
617 for (j = 0; j < fr->ntype; j++)
619 type_VDW[ai] = type_VDW[ai] ||
621 C6(fr->nbfp, fr->ntype, ai, j) != 0 ||
622 C12(fr->nbfp, fr->ntype, ai, j) != 0;
626 *bExcl_IntraCGAll_InterCGNone = TRUE;
629 snew(bExcl, excl_nalloc);
632 for (mb = 0; mb < mtop->nmolblock; mb++)
634 molb = &mtop->molblock[mb];
635 molt = &mtop->moltype[molb->type];
639 /* Check if the cginfo is identical for all molecules in this block.
640 * If so, we only need an array of the size of one molecule.
641 * Otherwise we make an array of #mol times #cgs per molecule.
645 for (m = 0; m < molb->nmol; m++)
647 am = m*cgs->index[cgs->nr];
648 for (cg = 0; cg < cgs->nr; cg++)
651 a1 = cgs->index[cg+1];
652 if (ggrpnr(&mtop->groups, egcENER, a_offset+am+a0) !=
653 ggrpnr(&mtop->groups, egcENER, a_offset +a0))
657 if (mtop->groups.grpnr[egcQMMM] != NULL)
659 for (ai = a0; ai < a1; ai++)
661 if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
662 mtop->groups.grpnr[egcQMMM][a_offset +ai])
671 cginfo_mb[mb].cg_start = cg_offset;
672 cginfo_mb[mb].cg_end = cg_offset + molb->nmol*cgs->nr;
673 cginfo_mb[mb].cg_mod = (bId ? 1 : molb->nmol)*cgs->nr;
674 snew(cginfo_mb[mb].cginfo, cginfo_mb[mb].cg_mod);
675 cginfo = cginfo_mb[mb].cginfo;
677 /* Set constraints flags for constrained atoms */
678 snew(a_con, molt->atoms.nr);
679 for (ftype = 0; ftype < F_NRE; ftype++)
681 if (interaction_function[ftype].flags & IF_CONSTRAINT)
686 for (ia = 0; ia < molt->ilist[ftype].nr; ia += 1+nral)
690 for (a = 0; a < nral; a++)
692 a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
693 (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
699 for (m = 0; m < (bId ? 1 : molb->nmol); m++)
702 am = m*cgs->index[cgs->nr];
703 for (cg = 0; cg < cgs->nr; cg++)
706 a1 = cgs->index[cg+1];
708 /* Store the energy group in cginfo */
709 gid = ggrpnr(&mtop->groups, egcENER, a_offset+am+a0);
710 SET_CGINFO_GID(cginfo[cgm+cg], gid);
712 /* Check the intra/inter charge group exclusions */
713 if (a1-a0 > excl_nalloc)
715 excl_nalloc = a1 - a0;
716 srenew(bExcl, excl_nalloc);
718 /* bExclIntraAll: all intra cg interactions excluded
719 * bExclInter: any inter cg interactions excluded
721 bExclIntraAll = TRUE;
725 for (ai = a0; ai < a1; ai++)
727 /* Check VDW and electrostatic interactions */
728 bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
729 type_VDW[molt->atoms.atom[ai].typeB]);
730 bHaveQ = bHaveQ || (molt->atoms.atom[ai].q != 0 ||
731 molt->atoms.atom[ai].qB != 0);
733 /* Clear the exclusion list for atom ai */
734 for (aj = a0; aj < a1; aj++)
736 bExcl[aj-a0] = FALSE;
738 /* Loop over all the exclusions of atom ai */
739 for (j = excl->index[ai]; j < excl->index[ai+1]; j++)
742 if (aj < a0 || aj >= a1)
751 /* Check if ai excludes a0 to a1 */
752 for (aj = a0; aj < a1; aj++)
756 bExclIntraAll = FALSE;
763 SET_CGINFO_CONSTR(cginfo[cgm+cg]);
766 SET_CGINFO_SETTLE(cginfo[cgm+cg]);
774 SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
778 SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
780 if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
782 /* The size in cginfo is currently only read with DD */
783 gmx_fatal(FARGS, "A charge group has size %d which is larger than the limit of %d atoms", a1-a0, MAX_CHARGEGROUP_SIZE);
787 SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
791 SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
793 /* Store the charge group size */
794 SET_CGINFO_NATOMS(cginfo[cgm+cg], a1-a0);
796 if (!bExclIntraAll || bExclInter)
798 *bExcl_IntraCGAll_InterCGNone = FALSE;
805 cg_offset += molb->nmol*cgs->nr;
806 a_offset += molb->nmol*cgs->index[cgs->nr];
810 /* the solvent optimizer is called after the QM is initialized,
811 * because we don't want to have the QM subsystem to become an
815 check_solvent(fplog, mtop, fr, cginfo_mb);
817 if (getenv("GMX_NO_SOLV_OPT"))
821 fprintf(fplog, "Found environment variable GMX_NO_SOLV_OPT.\n"
822 "Disabling all solvent optimization\n");
824 fr->solvent_opt = esolNO;
828 fr->solvent_opt = esolNO;
830 if (!fr->solvent_opt)
832 for (mb = 0; mb < mtop->nmolblock; mb++)
834 for (cg = 0; cg < cginfo_mb[mb].cg_mod; cg++)
836 SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg], esolNO);
/* Expand the per-molblock cginfo arrays into one flat array with one
 * entry per charge group in the whole system. The modulo by cg_mod
 * handles blocks that store only a single molecule's worth of cginfo
 * (all molecules identical). Returns a newly allocated array.
 */
844 static int *cginfo_expand(int nmb, cginfo_mb_t *cgi_mb)
849 ncg = cgi_mb[nmb-1].cg_end;
852 for (cg = 0; cg < ncg; cg++)
854 while (cg >= cgi_mb[mb].cg_end)
859 cgi_mb[mb].cginfo[(cg - cgi_mb[mb].cg_start) % cgi_mb[mb].cg_mod];
/* Accumulate system totals of charge q, q^2 and the diagonal C6 parameter
 * over all atoms, for topology state A and (with free energy) state B;
 * without free energy the B sums are copied from A. Results are stored in
 * fr->qsum/q2sum/c6sum and the total charge is reported to the log.
 */
865 static void set_chargesum(FILE *log, t_forcerec *fr, const gmx_mtop_t *mtop)
867 /*This now calculates sum for q and c6*/
868 double qsum, q2sum, q, c6sum, c6;
870 const t_atoms *atoms;
875 for (mb = 0; mb < mtop->nmolblock; mb++)
877 nmol = mtop->molblock[mb].nmol;
878 atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
879 for (i = 0; i < atoms->nr; i++)
881 q = atoms->atom[i].q;
884 c6 = mtop->ffparams.iparams[atoms->atom[i].type*(mtop->ffparams.atnr+1)].lj.c6;
889 fr->q2sum[0] = q2sum;
890 fr->c6sum[0] = c6sum;
892 if (fr->efep != efepNO)
897 for (mb = 0; mb < mtop->nmolblock; mb++)
899 nmol = mtop->molblock[mb].nmol;
900 atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
901 for (i = 0; i < atoms->nr; i++)
903 q = atoms->atom[i].qB;
906 c6 = mtop->ffparams.iparams[atoms->atom[i].typeB*(mtop->ffparams.atnr+1)].lj.c6;
910 fr->q2sum[1] = q2sum;
911 fr->c6sum[1] = c6sum;
916 fr->qsum[1] = fr->qsum[0];
917 fr->q2sum[1] = fr->q2sum[0];
918 fr->c6sum[1] = fr->c6sum[0];
922 if (fr->efep == efepNO)
924 fprintf(log, "System total charge: %.3f\n", fr->qsum[0]);
928 fprintf(log, "System total charge, top. A: %.3f top. B: %.3f\n",
929 fr->qsum[0], fr->qsum[1]);
/* Update box-dependent force-field constants: for generalized reaction
 * field (eelGRF), recompute kappa, k_rf and c_rf from the current box.
 */
934 void update_forcerec(t_forcerec *fr, matrix box)
936 if (fr->eeltype == eelGRF)
938 calc_rffac(NULL, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
939 fr->rcoulomb, fr->temp, fr->zsquare, box,
940 &fr->kappa, &fr->k_rf, &fr->c_rf);
/* Compute the average C6 and C12 parameters over all atom pairs in the
 * system, for the long-range dispersion correction. Excluded pairs are
 * subtracted so that never-occurring combinations cannot bias the
 * averages; with test-particle insertion (n_tpi) only test-particle vs.
 * system pairs are counted; with LJ-PME the relevant C6 is the difference
 * between the actual value and the combination-rule value used by the
 * grid part. Results are stored in fr->avcsix[q]/fr->avctwelve[q] for
 * state A and, with free energy, state B.
 */
944 void set_avcsixtwelve(FILE *fplog, t_forcerec *fr, const gmx_mtop_t *mtop)
946 const t_atoms *atoms, *atoms_tpi;
947 const t_blocka *excl;
948 int mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, n, nexcl, q;
949 gmx_int64_t npair, npair_ij, tmpi, tmpj;
950 double csix, ctwelve;
954 real *nbfp_comb = NULL;
960 /* For LJ-PME, we want to correct for the difference between the
961 * actual C6 values and the C6 values used by the LJ-PME based on
962 * combination rules. */
964 if (EVDW_PME(fr->vdwtype))
966 nbfp_comb = mk_nbfp_combination_rule(&mtop->ffparams,
967 (fr->ljpme_combination_rule == eljpmeLB) ? eCOMB_ARITHMETIC : eCOMB_GEOMETRIC);
968 for (tpi = 0; tpi < ntp; ++tpi)
970 for (tpj = 0; tpj < ntp; ++tpj)
972 C6(nbfp_comb, ntp, tpi, tpj) =
973 C6(nbfp, ntp, tpi, tpj) - C6(nbfp_comb, ntp, tpi, tpj);
974 C12(nbfp_comb, ntp, tpi, tpj) = C12(nbfp, ntp, tpi, tpj);
979 for (q = 0; q < (fr->efep == efepNO ? 1 : 2); q++)
987 /* Count the types so we avoid natoms^2 operations */
988 snew(typecount, ntp);
989 gmx_mtop_count_atomtypes(mtop, q, typecount);
991 for (tpi = 0; tpi < ntp; tpi++)
993 for (tpj = tpi; tpj < ntp; tpj++)
995 tmpi = typecount[tpi];
996 tmpj = typecount[tpj];
999 npair_ij = tmpi*tmpj;
1003 npair_ij = tmpi*(tmpi - 1)/2;
1007 /* nbfp now includes the 6.0 derivative prefactor */
1008 csix += npair_ij*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
1012 /* nbfp now includes the 6.0/12.0 derivative prefactors */
1013 csix += npair_ij* C6(nbfp, ntp, tpi, tpj)/6.0;
1014 ctwelve += npair_ij* C12(nbfp, ntp, tpi, tpj)/12.0;
1020 /* Subtract the excluded pairs.
1021 * The main reason for subtracting exclusions is that in some cases
1022 * some combinations might never occur and the parameters could have
1023 * any value. These unused values should not influence the dispersion
1026 for (mb = 0; mb < mtop->nmolblock; mb++)
1028 nmol = mtop->molblock[mb].nmol;
1029 atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
1030 excl = &mtop->moltype[mtop->molblock[mb].type].excls;
1031 for (i = 0; (i < atoms->nr); i++)
1035 tpi = atoms->atom[i].type;
1039 tpi = atoms->atom[i].typeB;
1041 j1 = excl->index[i];
1042 j2 = excl->index[i+1];
1043 for (j = j1; j < j2; j++)
1050 tpj = atoms->atom[k].type;
1054 tpj = atoms->atom[k].typeB;
1058 /* nbfp now includes the 6.0 derivative prefactor */
1059 csix -= nmol*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
1063 /* nbfp now includes the 6.0/12.0 derivative prefactors */
1064 csix -= nmol*C6 (nbfp, ntp, tpi, tpj)/6.0;
1065 ctwelve -= nmol*C12(nbfp, ntp, tpi, tpj)/12.0;
1075 /* Only correct for the interaction of the test particle
1076 * with the rest of the system.
1079 &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
1082 for (mb = 0; mb < mtop->nmolblock; mb++)
1084 nmol = mtop->molblock[mb].nmol;
1085 atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
1086 for (j = 0; j < atoms->nr; j++)
1089 /* Remove the interaction of the test charge group
1092 if (mb == mtop->nmolblock-1)
1096 if (mb == 0 && nmol == 1)
1098 gmx_fatal(FARGS, "Old format tpr with TPI, please generate a new tpr file");
1103 tpj = atoms->atom[j].type;
1107 tpj = atoms->atom[j].typeB;
1109 for (i = 0; i < fr->n_tpi; i++)
1113 tpi = atoms_tpi->atom[i].type;
1117 tpi = atoms_tpi->atom[i].typeB;
1121 /* nbfp now includes the 6.0 derivative prefactor */
1122 csix += nmolc*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
1126 /* nbfp now includes the 6.0/12.0 derivative prefactors */
1127 csix += nmolc*C6 (nbfp, ntp, tpi, tpj)/6.0;
1128 ctwelve += nmolc*C12(nbfp, ntp, tpi, tpj)/12.0;
1135 if (npair - nexcl <= 0 && fplog)
1137 fprintf(fplog, "\nWARNING: There are no atom pairs for dispersion correction\n\n");
1143 csix /= npair - nexcl;
1144 ctwelve /= npair - nexcl;
1148 fprintf(debug, "Counted %d exclusions\n", nexcl);
1149 fprintf(debug, "Average C6 parameter is: %10g\n", (double)csix);
1150 fprintf(debug, "Average C12 parameter is: %10g\n", (double)ctwelve);
1152 fr->avcsix[q] = csix;
1153 fr->avctwelve[q] = ctwelve;
1156 if (EVDW_PME(fr->vdwtype))
1163 if (fr->eDispCorr == edispcAllEner ||
1164 fr->eDispCorr == edispcAllEnerPres)
1166 fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
1167 fr->avcsix[0], fr->avctwelve[0]);
1171 fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e\n", fr->avcsix[0]);
/* Find the largest Buckingham b parameter over all atom-type pairs that
 * actually occur in the topology (and the smallest, for reporting). The
 * maximum is stored in fr->bham_b_max and is needed to choose a safe
 * table range. Fatal error on an atom type index out of range.
 */
1177 static void set_bham_b_max(FILE *fplog, t_forcerec *fr,
1178 const gmx_mtop_t *mtop)
1180 const t_atoms *at1, *at2;
1181 int mt1, mt2, i, j, tpi, tpj, ntypes;
1187 fprintf(fplog, "Determining largest Buckingham b parameter for table\n");
1194 for (mt1 = 0; mt1 < mtop->nmoltype; mt1++)
1196 at1 = &mtop->moltype[mt1].atoms;
1197 for (i = 0; (i < at1->nr); i++)
1199 tpi = at1->atom[i].type;
1202 gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", i, tpi, ntypes);
1205 for (mt2 = mt1; mt2 < mtop->nmoltype; mt2++)
1207 at2 = &mtop->moltype[mt2].atoms;
1208 for (j = 0; (j < at2->nr); j++)
1210 tpj = at2->atom[j].type;
1213 gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", j, tpj, ntypes);
1215 b = BHAMB(nbfp, ntypes, tpi, tpj);
1216 if (b > fr->bham_b_max)
1220 if ((b < bmin) || (bmin == -1))
1230 fprintf(fplog, "Buckingham b parameters, min: %g, max: %g\n",
1231 bmin, fr->bham_b_max);
/* Read the combined electrostatics+VdW table for one energy-group pair
 * (the group names eg1/eg2 are appended to the table file name before the
 * .xvg extension) and split it into separate 32-byte-aligned elec and vdw
 * tables for better cache performance: per table point, 4 reals go to the
 * elec table and 8 reals to the vdw table out of the 12 in the combined one.
 */
1235 static void make_nbf_tables(FILE *fp, const output_env_t oenv,
1236 t_forcerec *fr, real rtab,
1237 const t_commrec *cr,
1238 const char *tabfn, char *eg1, char *eg2,
1248 fprintf(debug, "No table file name passed, can not read table, can not do non-bonded interactions\n");
1253 sprintf(buf, "%s", tabfn);
1256 /* Append the two energy group names */
1257 sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "_%s_%s.%s",
1258 eg1, eg2, ftp2ext(efXVG));
1260 nbl->table_elec_vdw = make_tables(fp, oenv, fr, MASTER(cr), buf, rtab, 0);
1261 /* Copy the contents of the table to separate coulomb and LJ tables too,
1262 * to improve cache performance.
1264 /* For performance reasons we want
1265 * the table data to be aligned to 16-byte. The pointers could be freed
1266 * but currently aren't.
1268 nbl->table_elec.interaction = GMX_TABLE_INTERACTION_ELEC;
1269 nbl->table_elec.format = nbl->table_elec_vdw.format;
1270 nbl->table_elec.r = nbl->table_elec_vdw.r;
1271 nbl->table_elec.n = nbl->table_elec_vdw.n;
1272 nbl->table_elec.scale = nbl->table_elec_vdw.scale;
1273 nbl->table_elec.scale_exp = nbl->table_elec_vdw.scale_exp;
1274 nbl->table_elec.formatsize = nbl->table_elec_vdw.formatsize;
1275 nbl->table_elec.ninteractions = 1;
1276 nbl->table_elec.stride = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
1277 snew_aligned(nbl->table_elec.data, nbl->table_elec.stride*(nbl->table_elec.n+1), 32);
1279 nbl->table_vdw.interaction = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
1280 nbl->table_vdw.format = nbl->table_elec_vdw.format;
1281 nbl->table_vdw.r = nbl->table_elec_vdw.r;
1282 nbl->table_vdw.n = nbl->table_elec_vdw.n;
1283 nbl->table_vdw.scale = nbl->table_elec_vdw.scale;
1284 nbl->table_vdw.scale_exp = nbl->table_elec_vdw.scale_exp;
1285 nbl->table_vdw.formatsize = nbl->table_elec_vdw.formatsize;
1286 nbl->table_vdw.ninteractions = 2;
1287 nbl->table_vdw.stride = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
1288 snew_aligned(nbl->table_vdw.data, nbl->table_vdw.stride*(nbl->table_vdw.n+1), 32);
1290 for (i = 0; i <= nbl->table_elec_vdw.n; i++)
1292 for (j = 0; j < 4; j++)
1294 nbl->table_elec.data[4*i+j] = nbl->table_elec_vdw.data[12*i+j];
1296 for (j = 0; j < 8; j++)
1298 nbl->table_vdw.data[8*i+j] = nbl->table_elec_vdw.data[12*i+4+j];
/* Count, across all molecule types, how many interactions of ftype1 or
 * ftype2 reference each bonded table number. Grows *count (srenew) as
 * larger table numbers are found; *ncount returns the number of table
 * slots. A negative table number is a fatal error.
 */
1303 static void count_tables(int ftype1, int ftype2, const gmx_mtop_t *mtop,
1304 int *ncount, int **count)
1306 const gmx_moltype_t *molt;
1308 int mt, ftype, stride, i, j, tabnr;
1310 for (mt = 0; mt < mtop->nmoltype; mt++)
1312 molt = &mtop->moltype[mt];
1313 for (ftype = 0; ftype < F_NRE; ftype++)
1315 if (ftype == ftype1 || ftype == ftype2)
1317 il = &molt->ilist[ftype];
1318 stride = 1 + NRAL(ftype);
1319 for (i = 0; i < il->nr; i += stride)
1321 tabnr = mtop->ffparams.iparams[il->iatoms[i]].tab.table;
1324 gmx_fatal(FARGS, "A bonded table number is smaller than 0: %d\n", tabnr);
1326 if (tabnr >= *ncount)
1328 srenew(*count, tabnr+1);
1329 for (j = *ncount; j < tabnr+1; j++)
/* Read all bonded interaction tables used by interactions of ftype1 or
 * ftype2. File names are derived from basefn by inserting "_<tabext><n>"
 * before the .xvg extension for table number n. Returns an array of
 * bondedtable_t, one entry per table number counted by count_tables().
 */
1342 static bondedtable_t *make_bonded_tables(FILE *fplog,
1343 int ftype1, int ftype2,
1344 const gmx_mtop_t *mtop,
1345 const char *basefn, const char *tabext)
1347 int i, ncount, *count;
1355 count_tables(ftype1, ftype2, mtop, &ncount, &count);
1360 for (i = 0; i < ncount; i++)
1364 sprintf(tabfn, "%s", basefn);
1365 sprintf(tabfn + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1, "_%s%d.%s",
1366 tabext, i, ftp2ext(efXVG));
1367 tab[i] = make_bonded_table(fplog, tabfn, NRAL(ftype1)-2);
/* Store the charge-group and atom counts this rank handles and (re)grow
 * the force buffers accordingly, including the twin-range buffer and,
 * when fr->bF_NoVirSum is set, the separate no-virial-summation force
 * array sized for natoms_f_novirsum.
 */
1376 void forcerec_set_ranges(t_forcerec *fr,
1377 int ncg_home, int ncg_force,
1379 int natoms_force_constr, int natoms_f_novirsum)
1384 /* fr->ncg_force is unused in the standard code,
1385 * but it can be useful for modified code dealing with charge groups.
1387 fr->ncg_force = ncg_force;
1388 fr->natoms_force = natoms_force;
1389 fr->natoms_force_constr = natoms_force_constr;
1391 if (fr->natoms_force_constr > fr->nalloc_force)
1393 fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
1397 srenew(fr->f_twin, fr->nalloc_force);
1401 if (fr->bF_NoVirSum)
1403 fr->f_novirsum_n = natoms_f_novirsum;
1404 if (fr->f_novirsum_n > fr->f_novirsum_nalloc)
1406 fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
1407 srenew(fr->f_novirsum_alloc, fr->f_novirsum_nalloc);
1412 fr->f_novirsum_n = 0;
/* Map a cut-off of 0 (meaning "no cut-off") to GMX_CUTOFF_INF so later
 * comparisons can treat it as effectively infinite. */
1416 static real cutoff_inf(real cutoff)
1420 cutoff = GMX_CUTOFF_INF;
/* Read the AdResS thermodynamic-force table for each TF energy group into
 * fr->atf_tabs. File names are derived from tabfn by inserting
 * "tf_<groupname>" before the .xvg extension; fatal error when no table
 * file was supplied.
 */
1426 static void make_adress_tf_tables(FILE *fp, const output_env_t oenv,
1427 t_forcerec *fr, const t_inputrec *ir,
1428 const char *tabfn, const gmx_mtop_t *mtop,
1436 gmx_fatal(FARGS, "No thermoforce table file given. Use -tabletf to specify a file\n");
1440 snew(fr->atf_tabs, ir->adress->n_tf_grps);
1442 sprintf(buf, "%s", tabfn);
1443 for (i = 0; i < ir->adress->n_tf_grps; i++)
1445 j = ir->adress->tf_table_index[i]; /* get energy group index */
1446 sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "tf_%s.%s",
1447 *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[j]]), ftp2ext(efXVG));
1450 fprintf(fp, "loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[i], buf);
1452 fr->atf_tabs[i] = make_atf_table(fp, oenv, fr, buf, box);
/* Decide whether the accelerated all-vs-all kernels can be used:
 * requires zero (infinite) cut-offs, no PBC, plain cut-off vdW and
 * Coulomb, no free energy, and no (or a supported GBSA) implicit
 * solvent; can be vetoed via the GMX_NO_ALLVSALL environment variable.
 * With more than one energy monitor group a performance note is printed
 * (stderr and/or log, per bPrintNote/MASTER).
 */
1457 gmx_bool can_use_allvsall(const t_inputrec *ir, gmx_bool bPrintNote, t_commrec *cr, FILE *fp)
1464 ir->rcoulomb == 0 &&
1466 ir->ePBC == epbcNONE &&
1467 ir->vdwtype == evdwCUT &&
1468 ir->coulombtype == eelCUT &&
1469 ir->efep == efepNO &&
1470 (ir->implicit_solvent == eisNO ||
1471 (ir->implicit_solvent == eisGBSA && (ir->gb_algorithm == egbSTILL ||
1472 ir->gb_algorithm == egbHCT ||
1473 ir->gb_algorithm == egbOBC))) &&
1474 getenv("GMX_NO_ALLVSALL") == NULL
1477 if (bAllvsAll && ir->opts.ngener > 1)
1479 const char *note = "NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
1485 fprintf(stderr, "\n%s\n", note);
1489 fprintf(fp, "\n%s\n", note);
1495 if (bAllvsAll && fp && MASTER(cr))
1497 fprintf(fp, "\nUsing SIMD all-vs-all kernels.\n\n");
/* Allocate per-thread force/shift-force/energy accumulation buffers for
 * the bonded interactions (emntBonded thread count). Thread 0 uses the
 * global arrays, so only threads 1..nthreads-1 get private storage here.
 */
1504 static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp)
1508 /* These thread local data structures are used for bondeds only */
1509 fr->nthreads = gmx_omp_nthreads_get(emntBonded);
1511 if (fr->nthreads > 1)
1513 snew(fr->f_t, fr->nthreads);
1514 /* Thread 0 uses the global force and energy arrays */
1515 for (t = 1; t < fr->nthreads; t++)
1517 fr->f_t[t].f = NULL;
1518 fr->f_t[t].f_nalloc = 0;
1519 snew(fr->f_t[t].fshift, SHIFTS);
1520 fr->f_t[t].grpp.nener = nenergrp*nenergrp;
1521 for (i = 0; i < egNR; i++)
1523 snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener);
/* Check whether the requested nonbonded interactions are supported by the
 * accelerated nbnxn kernels (GPU or SIMD, per bGPU). Warns and indicates
 * fallback for LJ switch modifiers / LJ-PME on GPUs and for LJ-PME with
 * Lorentz-Berthelot combination rules (unsupported in both back ends).
 */
1530 gmx_bool nbnxn_acceleration_supported(FILE *fplog,
1531 const t_commrec *cr,
1532 const t_inputrec *ir,
1535 /* TODO: remove these GPU specific restrictions by implementing CUDA kernels */
1538 if (ir->vdw_modifier == eintmodFORCESWITCH ||
1539 ir->vdw_modifier == eintmodPOTSWITCH ||
1540 ir->vdwtype == evdwPME)
1542 md_print_warn(cr, fplog, "LJ switch functions and LJ-PME are not yet supported on the GPU, falling back to CPU only\n");
1547 if (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB)
1549 md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with %s, falling back to %s\n",
1550 bGPU ? "GPUs" : "SIMD kernels",
1551 bGPU ? "CPU only" : "plain-C kernels");
/* Select the CPU nbnxn kernel flavor (*kernel_type) and the Ewald
 * exclusion-correction method (*ewald_excl) based on compile-time SIMD
 * support, the interaction settings in ir, and override environment
 * variables (GMX_NBNXN_SIMD_4XN, GMX_NBNXN_SIMD_2XNN,
 * GMX_NBNXN_EWALD_TABLE, GMX_NBNXN_EWALD_ANALYTICAL).
 * Defaults to the plain-C 4x4 kernel with tabulated exclusions. */
1559 static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
1563 *kernel_type = nbnxnk4x4_PlainC;
1564 *ewald_excl = ewaldexclTable;
1566 #ifdef GMX_NBNXN_SIMD
1568 #ifdef GMX_NBNXN_SIMD_4XN
1569 *kernel_type = nbnxnk4xN_SIMD_4xN;
1571 #ifdef GMX_NBNXN_SIMD_2XNN
1572 *kernel_type = nbnxnk4xN_SIMD_2xNN;
1575 #if defined GMX_NBNXN_SIMD_2XNN && defined GMX_NBNXN_SIMD_4XN
1576 /* We need to choose if we want 2x(N+N) or 4xN kernels.
1577 * Currently this is based on the SIMD acceleration choice,
1578 * but it might be better to decide this at runtime based on CPU.
1580 * 4xN calculates more (zero) interactions, but has less pair-search
1581 * work and much better kernel instruction scheduling.
1583 * Up till now we have only seen that on Intel Sandy/Ivy Bridge,
1584 * which doesn't have FMA, both the analytical and tabulated Ewald
1585 * kernels have similar pair rates for 4x8 and 2x(4+4), so we choose
1586 * 2x(4+4) because it results in significantly fewer pairs.
1587 * For RF, the raw pair rate of the 4x8 kernel is higher than 2x(4+4),
1588 * 10% with HT, 50% without HT. As we currently don't detect the actual
1589 * use of HT, use 4x8 to avoid a potential performance hit.
1590 * On Intel Haswell 4x8 is always faster.
1592 *kernel_type = nbnxnk4xN_SIMD_4xN;
1594 #ifndef GMX_SIMD_HAVE_FMA
1595 if (EEL_PME(ir->coulombtype) || EEL_EWALD(ir->coulombtype) ||
1596 EVDW_PME(ir->vdwtype))
1598 /* We have Ewald kernels without FMA (Intel Sandy/Ivy Bridge).
1599 * There are enough instructions to make 2x(4+4) efficient.
1601 *kernel_type = nbnxnk4xN_SIMD_2xNN;
1604 #endif /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */
/* Environment-variable overrides; fatal if the requested flavor was
 * not compiled in */
1607 if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
1609 #ifdef GMX_NBNXN_SIMD_4XN
1610 *kernel_type = nbnxnk4xN_SIMD_4xN;
1612 gmx_fatal(FARGS, "SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
1615 if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
1617 #ifdef GMX_NBNXN_SIMD_2XNN
1618 *kernel_type = nbnxnk4xN_SIMD_2xNN;
1620 gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
1624 /* Analytical Ewald exclusion correction is only an option in
1626 * Since table lookup's don't parallelize with SIMD, analytical
1627 * will probably always be faster for a SIMD width of 8 or more.
1628 * With FMA analytical is sometimes faster for a width of 4 as well.
1629 * On BlueGene/Q, this is faster regardless of precision.
1630 * In single precision, this is faster on Bulldozer.
1632 #if GMX_SIMD_REAL_WIDTH >= 8 || \
1633 (GMX_SIMD_REAL_WIDTH >= 4 && defined GMX_SIMD_HAVE_FMA && !defined GMX_DOUBLE) || \
1634 defined GMX_SIMD_IBM_QPX
1635 *ewald_excl = ewaldexclAnalytical;
1637 if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
1639 *ewald_excl = ewaldexclTable;
1641 if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
1643 *ewald_excl = ewaldexclAnalytical;
1647 #endif /* GMX_NBNXN_SIMD */
/* Return a human-readable name for an nbnxn kernel type, used for log
 * output. For the SIMD kernel types the name reflects the SIMD
 * instruction set selected at compile time. Calls gmx_fatal on an
 * unknown kernel type. */
1651 const char *lookup_nbnxn_kernel_name(int kernel_type)
1653 const char *returnvalue = NULL;
1654 switch (kernel_type)
1657 returnvalue = "not set";
1659 case nbnxnk4x4_PlainC:
1660 returnvalue = "plain C";
1662 case nbnxnk4xN_SIMD_4xN:
1663 case nbnxnk4xN_SIMD_2xNN:
1664 #ifdef GMX_NBNXN_SIMD
/* Map the compile-time SIMD flavor to its display name */
1665 #if defined GMX_SIMD_X86_SSE2
1666 returnvalue = "SSE2";
1667 #elif defined GMX_SIMD_X86_SSE4_1
1668 returnvalue = "SSE4.1";
1669 #elif defined GMX_SIMD_X86_AVX_128_FMA
1670 returnvalue = "AVX_128_FMA";
1671 #elif defined GMX_SIMD_X86_AVX_256
1672 returnvalue = "AVX_256";
1673 #elif defined GMX_SIMD_X86_AVX2_256
1674 returnvalue = "AVX2_256";
/* Fallback label for other SIMD architectures */
1676 returnvalue = "SIMD";
1678 #else /* GMX_NBNXN_SIMD */
1679 returnvalue = "not available";
1680 #endif /* GMX_NBNXN_SIMD */
1682 case nbnxnk8x8x8_CUDA: returnvalue = "CUDA"; break;
1683 case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
1687 gmx_fatal(FARGS, "Illegal kernel type selected");
/* Top-level nbnxn kernel selection: choose GPU (8x8x8 CUDA), GPU
 * emulation (8x8x8 plain C), SIMD CPU, or plain-C CPU kernels, filling
 * *kernel_type and *ewald_excl. The chosen kernel is reported to the
 * log file when non-bonded interactions are actually computed. */
1694 static void pick_nbnxn_kernel(FILE *fp,
1695 const t_commrec *cr,
1696 gmx_bool use_simd_kernels,
1698 gmx_bool bEmulateGPU,
1699 const t_inputrec *ir,
1702 gmx_bool bDoNonbonded)
1704 assert(kernel_type);
1706 *kernel_type = nbnxnkNotSet;
1707 *ewald_excl = ewaldexclTable;
/* GPU emulation runs the GPU pair layout with the plain-C kernel */
1711 *kernel_type = nbnxnk8x8x8_PlainC;
1715 md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
1720 *kernel_type = nbnxnk8x8x8_CUDA;
1723 if (*kernel_type == nbnxnkNotSet)
1725 /* LJ PME with LB combination rule does 7 mesh operations.
1726 * This is so slow that we don't compile SIMD non-bonded kernels for that.
/* Use SIMD kernels only when enabled and supported for these settings;
 * otherwise fall back to the plain-C 4x4 kernel */
1728 if (use_simd_kernels &&
1729 nbnxn_acceleration_supported(fp, cr, ir, FALSE))
1731 pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl);
1735 *kernel_type = nbnxnk4x4_PlainC;
1739 if (bDoNonbonded && fp != NULL)
1741 fprintf(fp, "\nUsing %s %dx%d non-bonded kernels\n\n",
1742 lookup_nbnxn_kernel_name(*kernel_type),
1743 nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
1744 nbnxn_kernel_to_cj_size(*kernel_type));
/* Decide which non-bonded compute resources this rank uses: GPU
 * emulation (*bEmulateGPU) or real GPU acceleration, and initialize
 * the assigned GPU device when hardware acceleration is enabled.
 * Initialization failure at this point is fatal. */
1748 static void pick_nbnxn_resources(const t_commrec *cr,
1749 const gmx_hw_info_t *hwinfo,
1750 gmx_bool bDoNonbonded,
1752 gmx_bool *bEmulateGPU,
1753 const gmx_gpu_opt_t *gpu_opt)
1755 gmx_bool bEmulateGPUEnvVarSet;
1756 char gpu_err_str[STRLEN];
1760 bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
1762 /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. Because
1763 * GPUs (currently) only handle non-bonded calculations, we will
1764 * automatically switch to emulation if non-bonded calculations are
1765 * turned off via GMX_NO_NONBONDED - this is the simple and elegant
1766 * way to turn off GPU initialization, data movement, and cleanup.
1768 * GPU emulation can be useful to assess the performance one can expect by
1769 * adding GPU(s) to the machine. The conditional below allows this even
1770 * if mdrun is compiled without GPU acceleration support.
1771 * Note that you should freeze the system as otherwise it will explode.
1773 *bEmulateGPU = (bEmulateGPUEnvVarSet ||
1775 gpu_opt->ncuda_dev_use > 0));
1777 /* Enable GPU mode when GPUs are available or no GPU emulation is requested.
1779 if (gpu_opt->ncuda_dev_use > 0 && !(*bEmulateGPU))
1781 /* Each PP node will use the intra-node id-th device from the
1782 * list of detected/selected GPUs. */
1783 if (!init_gpu(cr->rank_pp_intranode, gpu_err_str,
1784 &hwinfo->gpu_info, gpu_opt))
1786 /* At this point the init should never fail as we made sure that
1787 * we have all the GPUs we need. If it still does, we'll bail. */
1788 gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
1790 get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
1791 cr->rank_pp_intranode),
1795 /* Here we actually turn on hardware GPU acceleration */
/* Return whether "simple" (FDV0-layout) interaction tables are used
 * for the given cutoff scheme. With the group scheme this is always
 * TRUE; with the Verlet scheme it depends on the kernel type of the
 * selected nbnxn group (group < 0 selects group 0, otherwise the last
 * group is used). Other schemes are an internal inconsistency. */
1800 gmx_bool uses_simple_tables(int cutoff_scheme,
1801 nonbonded_verlet_t *nbv,
1804 gmx_bool bUsesSimpleTables = TRUE;
1807 switch (cutoff_scheme)
1810 bUsesSimpleTables = TRUE;
1813 assert(NULL != nbv && NULL != nbv->grp);
1814 grp_index = (group < 0) ? 0 : (nbv->ngrp - 1);
1815 bUsesSimpleTables = nbnxn_kernel_pairlist_simple(nbv->grp[grp_index].kernel_type);
1818 gmx_incons("unimplemented");
1820 return bUsesSimpleTables;
/* (Re)build the tabulated Ewald correction force/potential tables in
 * ic. For simple (CPU) tables the spacing comes from the spline table
 * accuracy estimate and the table covers max(rtab, rcoulomb); for GPU
 * tables a fixed size is used and the scale is derived from it. Any
 * previously allocated tables are freed first. */
1823 static void init_ewald_f_table(interaction_const_t *ic,
1824 gmx_bool bUsesSimpleTables,
1829 if (bUsesSimpleTables)
1831 /* With a spacing of 0.0005 we are at the force summation accuracy
1832 * for the SSE kernels for "normal" atomistic simulations.
1834 ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff_q,
1837 maxr = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb;
/* +2 entries of headroom for the cubic-spline lookup at the table end */
1838 ic->tabq_size = (int)(maxr*ic->tabq_scale) + 2;
1842 ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
1843 /* Subtract 2 iso 1 to avoid access out of range due to rounding */
1844 ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
/* Free any existing tables before reallocating at the new size */
1847 sfree_aligned(ic->tabq_coul_FDV0);
1848 sfree_aligned(ic->tabq_coul_F);
1849 sfree_aligned(ic->tabq_coul_V);
1851 /* Create the original table data in FDV0 */
1852 snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
1853 snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
1854 snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
1855 table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
1856 ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff_q);
/* Initialize the interaction-constant lookup tables: for Ewald or PME
 * electrostatics, build the Ewald correction tables and report their
 * spacing and size to the log file. */
1859 void init_interaction_const_tables(FILE *fp,
1860 interaction_const_t *ic,
1861 gmx_bool bUsesSimpleTables,
1866 if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
1868 init_ewald_f_table(ic, bUsesSimpleTables, rtab);
1872 fprintf(fp, "Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
1873 1/ic->tabq_scale, ic->tabq_size);
/* Reset the force-switch shift constants in sc (body elided from this
 * listing; presumably zeroes c2/c3/cpot — confirm against the full
 * source). */
1878 static void clear_force_switch_constants(shift_consts_t *sc)
/* Compute the force-switch coefficients c2, c3 and the potential shift
 * cpot for an r^-p interaction (p = 6 for dispersion, 12 for
 * repulsion) switched to zero between rsw and the cut-off rc. */
1885 static void force_switch_constants(real p,
1889 /* Here we determine the coefficient for shifting the force to zero
1890 * between distance rsw and the cut-off rc.
1891 * For a potential of r^-p, we have force p*r^-(p+1).
1892 * But to save flops we absorb p in the coefficient.
1894 * force/p = r^-(p+1) + c2*r^2 + c3*r^3
1895 * potential = r^-p + c2/3*r^3 + c3/4*r^4 + cpot
1897 sc->c2 = ((p + 1)*rsw - (p + 4)*rc)/(pow(rc, p + 2)*pow(rc - rsw, 2));
1898 sc->c3 = -((p + 1)*rsw - (p + 3)*rc)/(pow(rc, p + 2)*pow(rc - rsw, 3));
/* cpot makes the switched potential zero at the cut-off */
1899 sc->cpot = -pow(rc, -p) + p*sc->c2/3*pow(rc - rsw, 3) + p*sc->c3/4*pow(rc - rsw, 4);
/* Compute the quintic potential-switch coefficients c3, c4, c5 for a
 * switch that is 1 at rsw, 0 at rc, with zero first and second
 * derivatives at both ends. */
1902 static void potential_switch_constants(real rsw, real rc,
1903 switch_consts_t *sc)
1905 /* The switch function is 1 at rsw and 0 at rc.
1906 * The derivative and second derivative are zero at both ends.
1907 * rsw = max(r - r_switch, 0)
1908 * sw = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
1909 * dsw = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
1910 * force = force*dsw - potential*sw
1913 sc->c3 = -10*pow(rc - rsw, -3);
1914 sc->c4 = 15*pow(rc - rsw, -4);
1915 sc->c5 = -6*pow(rc - rsw, -5);
/* Allocate and fill an interaction_const_t from the settings already
 * stored in fr: cut-offs, Van der Waals modifier shift/switch
 * constants, electrostatics parameters (Ewald/PME, reaction-field),
 * then hand the constants to the GPU module (when in use) and build
 * the Ewald correction tables. *interaction_const receives the new
 * struct; ownership stays with the caller. */
1919 init_interaction_const(FILE *fp,
1920 const t_commrec gmx_unused *cr,
1921 interaction_const_t **interaction_const,
1922 const t_forcerec *fr,
1925 interaction_const_t *ic;
1926 gmx_bool bUsesSimpleTables = TRUE;
1930 /* Just allocate something so we can free it */
1931 snew_aligned(ic->tabq_coul_FDV0, 16, 32);
1932 snew_aligned(ic->tabq_coul_F, 16, 32);
1933 snew_aligned(ic->tabq_coul_V, 16, 32);
1935 ic->rlist = fr->rlist;
1936 ic->rlistlong = fr->rlistlong;
/* Van der Waals settings are copied from the forcerec */
1939 ic->vdwtype = fr->vdwtype;
1940 ic->vdw_modifier = fr->vdw_modifier;
1941 ic->rvdw = fr->rvdw;
1942 ic->rvdw_switch = fr->rvdw_switch;
1943 ic->ewaldcoeff_lj = fr->ewaldcoeff_lj;
1944 ic->ljpme_comb_rule = fr->ljpme_combination_rule;
1945 ic->sh_lj_ewald = 0;
1946 clear_force_switch_constants(&ic->dispersion_shift);
1947 clear_force_switch_constants(&ic->repulsion_shift);
/* Fill the shift/switch constants according to the VdW modifier */
1949 switch (ic->vdw_modifier)
1951 case eintmodPOTSHIFT:
1952 /* Only shift the potential, don't touch the force */
1953 ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0);
1954 ic->repulsion_shift.cpot = -pow(ic->rvdw, -12.0);
1955 if (EVDW_PME(ic->vdwtype))
1959 if (fr->cutoff_scheme == ecutsGROUP)
1961 gmx_fatal(FARGS, "Potential-shift is not (yet) implemented for LJ-PME with cutoff-scheme=group");
/* LJ-PME grid contribution at the cut-off, used as potential shift */
1963 crc2 = sqr(ic->ewaldcoeff_lj*ic->rvdw);
1964 ic->sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);
1967 case eintmodFORCESWITCH:
1968 /* Switch the force, switch and shift the potential */
1969 force_switch_constants(6.0, ic->rvdw_switch, ic->rvdw,
1970 &ic->dispersion_shift);
1971 force_switch_constants(12.0, ic->rvdw_switch, ic->rvdw,
1972 &ic->repulsion_shift);
1974 case eintmodPOTSWITCH:
1975 /* Switch the potential and force */
1976 potential_switch_constants(ic->rvdw_switch, ic->rvdw,
1980 case eintmodEXACTCUTOFF:
1981 /* Nothing to do here */
1984 gmx_incons("unimplemented potential modifier");
1987 ic->sh_invrc6 = -ic->dispersion_shift.cpot;
1989 /* Electrostatics */
1990 ic->eeltype = fr->eeltype;
1991 ic->coulomb_modifier = fr->coulomb_modifier;
1992 ic->rcoulomb = fr->rcoulomb;
1993 ic->epsilon_r = fr->epsilon_r;
1994 ic->epsfac = fr->epsfac;
1995 ic->ewaldcoeff_q = fr->ewaldcoeff_q;
/* Ewald: shift the direct-space potential by its value at the cut-off */
1997 if (fr->coulomb_modifier == eintmodPOTSHIFT)
1999 ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb);
2006 /* Reaction-field */
2007 if (EEL_RF(ic->eeltype))
2009 ic->epsilon_rf = fr->epsilon_rf;
2010 ic->k_rf = fr->k_rf;
2011 ic->c_rf = fr->c_rf;
2015 /* For plain cut-off we might use the reaction-field kernels */
2016 ic->epsilon_rf = ic->epsilon_r;
2018 if (fr->coulomb_modifier == eintmodPOTSHIFT)
2020 ic->c_rf = 1/ic->rcoulomb;
/* Report the effective potential shifts to the log file */
2030 real dispersion_shift;
2032 dispersion_shift = ic->dispersion_shift.cpot;
2033 if (EVDW_PME(ic->vdwtype))
2035 dispersion_shift -= ic->sh_lj_ewald;
2037 fprintf(fp, "Potential shift: LJ r^-12: %.3e r^-6: %.3e",
2038 ic->repulsion_shift.cpot, dispersion_shift);
2040 if (ic->eeltype == eelCUT)
2042 fprintf(fp, ", Coulomb %.e", -ic->c_rf);
2044 else if (EEL_PME(ic->eeltype))
2046 fprintf(fp, ", Ewald %.3e", -ic->sh_ewald);
2051 *interaction_const = ic;
2053 if (fr->nbv != NULL && fr->nbv->bUseGPU)
2055 nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
2057 /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
2058 * also sharing texture references. To keep the code simple, we don't
2059 * treat texture references as shared resources, but this means that
2060 * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
2061 * Hence, to ensure that the non-bonded kernels don't start before all
2062 * texture binding operations are finished, we need to wait for all ranks
2063 * to arrive here before continuing.
2065 * Note that we could omit this barrier if GPUs are not shared (or
2066 * texture objects are used), but as this is initialization code, there
2067 * is no point in complicating things.
2069 #ifdef GMX_THREAD_MPI
2074 #endif /* GMX_THREAD_MPI */
2077 bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
2078 init_interaction_const_tables(fp, ic, bUsesSimpleTables, rtab);
/* Allocate and initialize the nonbonded_verlet_t structure: pick
 * compute resources (GPU/emulation), select kernels per locality group
 * (local and, with domain decomposition, non-local), initialize CUDA
 * data when on GPU, read the GMX_NB_MIN_CI list-balancing override,
 * set up the pair search and the per-group pair lists and atom data. */
2081 static void init_nb_verlet(FILE *fp,
2082 nonbonded_verlet_t **nb_verlet,
2083 const t_inputrec *ir,
2084 const t_forcerec *fr,
2085 const t_commrec *cr,
2086 const char *nbpu_opt)
2088 nonbonded_verlet_t *nbv;
2091 gmx_bool bEmulateGPU, bHybridGPURun = FALSE;
2093 nbnxn_alloc_t *nb_alloc;
2094 nbnxn_free_t *nb_free;
2098 pick_nbnxn_resources(cr, fr->hwinfo,
/* Two locality groups (local + non-local) with domain decomposition,
 * otherwise one */
2106 nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
2107 for (i = 0; i < nbv->ngrp; i++)
2109 nbv->grp[i].nbl_lists.nnbl = 0;
2110 nbv->grp[i].nbat = NULL;
2111 nbv->grp[i].kernel_type = nbnxnkNotSet;
2113 if (i == 0) /* local */
2115 pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
2116 nbv->bUseGPU, bEmulateGPU, ir,
2117 &nbv->grp[i].kernel_type,
2118 &nbv->grp[i].ewald_excl,
2121 else /* non-local */
2123 if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
2125 /* Use GPU for local, select a CPU kernel for non-local */
2126 pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
2128 &nbv->grp[i].kernel_type,
2129 &nbv->grp[i].ewald_excl,
2132 bHybridGPURun = TRUE;
2136 /* Use the same kernel for local and non-local interactions */
2137 nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
2138 nbv->grp[i].ewald_excl = nbv->grp[0].ewald_excl;
2145 /* init the NxN GPU data; the last argument tells whether we'll have
2146 * both local and non-local NB calculation on GPU */
2147 nbnxn_cuda_init(fp, &nbv->cu_nbv,
2148 &fr->hwinfo->gpu_info, fr->gpu_opt,
2149 cr->rank_pp_intranode,
2150 (nbv->ngrp > 1) && !bHybridGPURun);
/* Optional user override of the pair-list balancing parameter */
2152 if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
2156 nbv->min_ci_balanced = strtol(env, &end, 10);
2157 if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
2159 gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
2164 fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
2165 nbv->min_ci_balanced);
2170 nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
2173 fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
2174 nbv->min_ci_balanced);
2180 nbv->min_ci_balanced = 0;
2185 nbnxn_init_search(&nbv->nbs,
2186 DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
2187 DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
2188 gmx_omp_nthreads_get(emntNonbonded));
2190 for (i = 0; i < nbv->ngrp; i++)
/* GPU kernels need page-locked (pinned) host memory for transfers */
2192 if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
2194 nb_alloc = &pmalloc;
2203 nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
2204 nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
2205 /* 8x8x8 "non-simple" lists are ATM always combined */
2206 !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
2210 nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
2212 gmx_bool bSimpleList;
2213 int enbnxninitcombrule;
2215 bSimpleList = nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type);
2217 if (bSimpleList && (fr->vdwtype == evdwCUT && (fr->vdw_modifier == eintmodNONE || fr->vdw_modifier == eintmodPOTSHIFT)))
2219 /* Plain LJ cut-off: we can optimize with combination rules */
2220 enbnxninitcombrule = enbnxninitcombruleDETECT;
2222 else if (fr->vdwtype == evdwPME)
2224 /* LJ-PME: we need to use a combination rule for the grid */
2225 if (fr->ljpme_combination_rule == eljpmeGEOM)
2227 enbnxninitcombrule = enbnxninitcombruleGEOM;
2231 enbnxninitcombrule = enbnxninitcombruleLB;
2236 /* We use a full combination matrix: no rule required */
2237 enbnxninitcombrule = enbnxninitcombruleNONE;
2241 snew(nbv->grp[i].nbat, 1);
2242 nbnxn_atomdata_init(fp,
2244 nbv->grp[i].kernel_type,
2246 fr->ntype, fr->nbfp,
2248 bSimpleList ? gmx_omp_nthreads_get(emntNonbonded) : 1,
/* Groups with identical kernel types share the same atom data */
2253 nbv->grp[i].nbat = nbv->grp[0].nbat;
2258 void init_forcerec(FILE *fp,
2259 const output_env_t oenv,
2262 const t_inputrec *ir,
2263 const gmx_mtop_t *mtop,
2264 const t_commrec *cr,
2270 const char *nbpu_opt,
2271 gmx_bool bNoSolvOpt,
2274 int i, j, m, natoms, ngrp, negp_pp, negptable, egi, egj;
2279 gmx_bool bGenericKernelOnly;
2280 gmx_bool bMakeTables, bMakeSeparate14Table, bSomeNormalNbListsAreInUse;
2282 int *nm_ind, egp_flags;
2284 if (fr->hwinfo == NULL)
2286 /* Detect hardware, gather information.
2287 * In mdrun, hwinfo has already been set before calling init_forcerec.
2288 * Here we ignore GPUs, as tools will not use them anyhow.
2290 fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE);
2293 /* By default we turn SIMD kernels on, but it might be turned off further down... */
2294 fr->use_simd_kernels = TRUE;
2296 fr->bDomDec = DOMAINDECOMP(cr);
2298 natoms = mtop->natoms;
2300 if (check_box(ir->ePBC, box))
2302 gmx_fatal(FARGS, check_box(ir->ePBC, box));
2305 /* Test particle insertion ? */
2308 /* Set to the size of the molecule to be inserted (the last one) */
2309 /* Because of old style topologies, we have to use the last cg
2310 * instead of the last molecule type.
2312 cgs = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
2313 fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
2314 if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1])
2316 gmx_fatal(FARGS, "The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
2324 /* Copy AdResS parameters */
2327 fr->adress_type = ir->adress->type;
2328 fr->adress_const_wf = ir->adress->const_wf;
2329 fr->adress_ex_width = ir->adress->ex_width;
2330 fr->adress_hy_width = ir->adress->hy_width;
2331 fr->adress_icor = ir->adress->icor;
2332 fr->adress_site = ir->adress->site;
2333 fr->adress_ex_forcecap = ir->adress->ex_forcecap;
2334 fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
2337 snew(fr->adress_group_explicit, ir->adress->n_energy_grps);
2338 for (i = 0; i < ir->adress->n_energy_grps; i++)
2340 fr->adress_group_explicit[i] = ir->adress->group_explicit[i];
2343 fr->n_adress_tf_grps = ir->adress->n_tf_grps;
2344 snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
2345 for (i = 0; i < fr->n_adress_tf_grps; i++)
2347 fr->adress_tf_table_index[i] = ir->adress->tf_table_index[i];
2349 copy_rvec(ir->adress->refs, fr->adress_refs);
2353 fr->adress_type = eAdressOff;
2354 fr->adress_do_hybridpairs = FALSE;
2357 /* Copy the user determined parameters */
2358 fr->userint1 = ir->userint1;
2359 fr->userint2 = ir->userint2;
2360 fr->userint3 = ir->userint3;
2361 fr->userint4 = ir->userint4;
2362 fr->userreal1 = ir->userreal1;
2363 fr->userreal2 = ir->userreal2;
2364 fr->userreal3 = ir->userreal3;
2365 fr->userreal4 = ir->userreal4;
2368 fr->fc_stepsize = ir->fc_stepsize;
2371 fr->efep = ir->efep;
2372 fr->sc_alphavdw = ir->fepvals->sc_alpha;
2373 if (ir->fepvals->bScCoul)
2375 fr->sc_alphacoul = ir->fepvals->sc_alpha;
2376 fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min, 6);
2380 fr->sc_alphacoul = 0;
2381 fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
2383 fr->sc_power = ir->fepvals->sc_power;
2384 fr->sc_r_power = ir->fepvals->sc_r_power;
2385 fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma, 6);
2387 env = getenv("GMX_SCSIGMA_MIN");
2391 sscanf(env, "%lf", &dbl);
2392 fr->sc_sigma6_min = pow(dbl, 6);
2395 fprintf(fp, "Setting the minimum soft core sigma to %g nm\n", dbl);
2399 fr->bNonbonded = TRUE;
2400 if (getenv("GMX_NO_NONBONDED") != NULL)
2402 /* turn off non-bonded calculations */
2403 fr->bNonbonded = FALSE;
2404 md_print_warn(cr, fp,
2405 "Found environment variable GMX_NO_NONBONDED.\n"
2406 "Disabling nonbonded calculations.\n");
2409 bGenericKernelOnly = FALSE;
2411 /* We now check in the NS code whether a particular combination of interactions
2412 * can be used with water optimization, and disable it if that is not the case.
2415 if (getenv("GMX_NB_GENERIC") != NULL)
2420 "Found environment variable GMX_NB_GENERIC.\n"
2421 "Disabling all interaction-specific nonbonded kernels, will only\n"
2422 "use the slow generic ones in src/gmxlib/nonbonded/nb_generic.c\n\n");
2424 bGenericKernelOnly = TRUE;
2427 if (bGenericKernelOnly == TRUE)
2432 if ( (getenv("GMX_DISABLE_SIMD_KERNELS") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
2434 fr->use_simd_kernels = FALSE;
2438 "\nFound environment variable GMX_DISABLE_SIMD_KERNELS.\n"
2439 "Disabling the usage of any SIMD-specific kernel routines (e.g. SSE2/SSE4.1/AVX).\n\n");
2443 fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
2445 /* Check if we can/should do all-vs-all kernels */
2446 fr->bAllvsAll = can_use_allvsall(ir, FALSE, NULL, NULL);
2447 fr->AllvsAll_work = NULL;
2448 fr->AllvsAll_workgb = NULL;
2450 /* All-vs-all kernels have not been implemented in 4.6, and
2451 * the SIMD group kernels are also buggy in this case. Non-SIMD
2452 * group kernels are OK. See Redmine #1249. */
2455 fr->bAllvsAll = FALSE;
2456 fr->use_simd_kernels = FALSE;
2460 "\nYour simulation settings would have triggered the efficient all-vs-all\n"
2461 "kernels in GROMACS 4.5, but these have not been implemented in GROMACS\n"
2462 "4.6. Also, we can't use the accelerated SIMD kernels here because\n"
2463 "of an unfixed bug. The reference C kernels are correct, though, so\n"
2464 "we are proceeding by disabling all CPU architecture-specific\n"
2465 "(e.g. SSE2/SSE4/AVX) routines. If performance is important, please\n"
2466 "use GROMACS 4.5.7 or try cutoff-scheme = Verlet.\n\n");
2470 /* Neighbour searching stuff */
2471 fr->cutoff_scheme = ir->cutoff_scheme;
2472 fr->bGrid = (ir->ns_type == ensGRID);
2473 fr->ePBC = ir->ePBC;
2475 if (fr->cutoff_scheme == ecutsGROUP)
2477 const char *note = "NOTE: This file uses the deprecated 'group' cutoff_scheme. This will be\n"
2478 "removed in a future release when 'verlet' supports all interaction forms.\n";
2482 fprintf(stderr, "\n%s\n", note);
2486 fprintf(fp, "\n%s\n", note);
2490 /* Determine if we will do PBC for distances in bonded interactions */
2491 if (fr->ePBC == epbcNONE)
2493 fr->bMolPBC = FALSE;
2497 if (!DOMAINDECOMP(cr))
2499 /* The group cut-off scheme and SHAKE assume charge groups
2500 * are whole, but not using molpbc is faster in most cases.
2502 if (fr->cutoff_scheme == ecutsGROUP ||
2503 (ir->eConstrAlg == econtSHAKE &&
2504 (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
2505 gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0)))
2507 fr->bMolPBC = ir->bPeriodicMols;
2512 if (getenv("GMX_USE_GRAPH") != NULL)
2514 fr->bMolPBC = FALSE;
2517 fprintf(fp, "\nGMX_MOLPBC is set, using the graph for bonded interactions\n\n");
2524 fr->bMolPBC = dd_bonded_molpbc(cr->dd, fr->ePBC);
2527 fr->bGB = (ir->implicit_solvent == eisGBSA);
2529 fr->rc_scaling = ir->refcoord_scaling;
2530 copy_rvec(ir->posres_com, fr->posres_com);
2531 copy_rvec(ir->posres_comB, fr->posres_comB);
2532 fr->rlist = cutoff_inf(ir->rlist);
2533 fr->rlistlong = cutoff_inf(ir->rlistlong);
2534 fr->eeltype = ir->coulombtype;
2535 fr->vdwtype = ir->vdwtype;
2536 fr->ljpme_combination_rule = ir->ljpme_combination_rule;
2538 fr->coulomb_modifier = ir->coulomb_modifier;
2539 fr->vdw_modifier = ir->vdw_modifier;
2541 /* Electrostatics: Translate from interaction-setting-in-mdp-file to kernel interaction format */
2542 switch (fr->eeltype)
2545 fr->nbkernel_elec_interaction = (fr->bGB) ? GMX_NBKERNEL_ELEC_GENERALIZEDBORN : GMX_NBKERNEL_ELEC_COULOMB;
2551 fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
2555 fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
2556 fr->coulomb_modifier = eintmodEXACTCUTOFF;
2565 case eelPMEUSERSWITCH:
2566 fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
2571 fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
2575 gmx_fatal(FARGS, "Unsupported electrostatic interaction: %s", eel_names[fr->eeltype]);
2579 /* Vdw: Translate from mdp settings to kernel format */
2580 switch (fr->vdwtype)
2586 fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
2590 fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
2597 case evdwENCADSHIFT:
2598 fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
2602 gmx_fatal(FARGS, "Unsupported vdw interaction: %s", evdw_names[fr->vdwtype]);
2606 /* These start out identical to ir, but might be altered if we e.g. tabulate the interaction in the kernel */
2607 fr->nbkernel_elec_modifier = fr->coulomb_modifier;
2608 fr->nbkernel_vdw_modifier = fr->vdw_modifier;
2610 fr->bTwinRange = fr->rlistlong > fr->rlist;
2611 fr->bEwald = (EEL_PME(fr->eeltype) || fr->eeltype == eelEWALD);
2613 fr->reppow = mtop->ffparams.reppow;
2615 if (ir->cutoff_scheme == ecutsGROUP)
2617 fr->bvdwtab = (fr->vdwtype != evdwCUT ||
2618 !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS));
2619 /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
2620 fr->bcoultab = !(fr->eeltype == eelCUT ||
2621 fr->eeltype == eelEWALD ||
2622 fr->eeltype == eelPME ||
2623 fr->eeltype == eelRF ||
2624 fr->eeltype == eelRF_ZERO);
2626 /* If the user absolutely wants different switch/shift settings for coul/vdw, it is likely
2627 * going to be faster to tabulate the interaction than calling the generic kernel.
2629 if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH && fr->nbkernel_vdw_modifier == eintmodPOTSWITCH)
2631 if ((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
2633 fr->bcoultab = TRUE;
2636 else if ((fr->nbkernel_elec_modifier == eintmodPOTSHIFT && fr->nbkernel_vdw_modifier == eintmodPOTSHIFT) ||
2637 ((fr->nbkernel_elec_interaction == GMX_NBKERNEL_ELEC_REACTIONFIELD &&
2638 fr->nbkernel_elec_modifier == eintmodEXACTCUTOFF &&
2639 (fr->nbkernel_vdw_modifier == eintmodPOTSWITCH || fr->nbkernel_vdw_modifier == eintmodPOTSHIFT))))
2641 if (fr->rcoulomb != fr->rvdw)
2643 fr->bcoultab = TRUE;
2647 if (getenv("GMX_REQUIRE_TABLES"))
2650 fr->bcoultab = TRUE;
2655 fprintf(fp, "Table routines are used for coulomb: %s\n", bool_names[fr->bcoultab]);
2656 fprintf(fp, "Table routines are used for vdw: %s\n", bool_names[fr->bvdwtab ]);
2659 if (fr->bvdwtab == TRUE)
2661 fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
2662 fr->nbkernel_vdw_modifier = eintmodNONE;
2664 if (fr->bcoultab == TRUE)
2666 fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
2667 fr->nbkernel_elec_modifier = eintmodNONE;
2671 if (ir->cutoff_scheme == ecutsVERLET)
2673 if (!gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
2675 gmx_fatal(FARGS, "Cut-off scheme %S only supports LJ repulsion power 12", ecutscheme_names[ir->cutoff_scheme]);
2677 fr->bvdwtab = FALSE;
2678 fr->bcoultab = FALSE;
2681 /* Tables are used for direct ewald sum */
2684 if (EEL_PME(ir->coulombtype))
2688 fprintf(fp, "Will do PME sum in reciprocal space for electrostatic interactions.\n");
2690 if (ir->coulombtype == eelP3M_AD)
2692 please_cite(fp, "Hockney1988");
2693 please_cite(fp, "Ballenegger2012");
2697 please_cite(fp, "Essmann95a");
2700 if (ir->ewald_geometry == eewg3DC)
2704 fprintf(fp, "Using the Ewald3DC correction for systems with a slab geometry.\n");
2706 please_cite(fp, "In-Chul99a");
2709 fr->ewaldcoeff_q = calc_ewaldcoeff_q(ir->rcoulomb, ir->ewald_rtol);
2710 init_ewald_tab(&(fr->ewald_table), ir, fp);
2713 fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for Ewald\n",
2714 1/fr->ewaldcoeff_q);
2718 if (EVDW_PME(ir->vdwtype))
2722 fprintf(fp, "Will do PME sum in reciprocal space for LJ dispersion interactions.\n");
2724 please_cite(fp, "Essmann95a");
2725 fr->ewaldcoeff_lj = calc_ewaldcoeff_lj(ir->rvdw, ir->ewald_rtol_lj);
2728 fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for LJ Ewald\n",
2729 1/fr->ewaldcoeff_lj);
2733 /* Electrostatics */
2734 fr->epsilon_r = ir->epsilon_r;
2735 fr->epsilon_rf = ir->epsilon_rf;
2736 fr->fudgeQQ = mtop->ffparams.fudgeQQ;
2737 fr->rcoulomb_switch = ir->rcoulomb_switch;
2738 fr->rcoulomb = cutoff_inf(ir->rcoulomb);
2740 /* Parameters for generalized RF */
2744 if (fr->eeltype == eelGRF)
2746 init_generalized_rf(fp, mtop, ir, fr);
2749 fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype) ||
2750 gmx_mtop_ftype_count(mtop, F_POSRES) > 0 ||
2751 gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 ||
2752 IR_ELEC_FIELD(*ir) ||
2753 (fr->adress_icor != eAdressICOff)
2756 if (fr->cutoff_scheme == ecutsGROUP &&
2757 ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
2759 /* Count the total number of charge groups */
2760 fr->cg_nalloc = ncg_mtop(mtop);
2761 srenew(fr->cg_cm, fr->cg_nalloc);
2763 if (fr->shift_vec == NULL)
2765 snew(fr->shift_vec, SHIFTS);
2768 if (fr->fshift == NULL)
2770 snew(fr->fshift, SHIFTS);
2773 if (fr->nbfp == NULL)
2775 fr->ntype = mtop->ffparams.atnr;
2776 fr->nbfp = mk_nbfp(&mtop->ffparams, fr->bBHAM);
2779 /* Copy the energy group exclusions */
2780 fr->egp_flags = ir->opts.egp_flags;
2782 /* Van der Waals stuff */
2783 fr->rvdw = cutoff_inf(ir->rvdw);
2784 fr->rvdw_switch = ir->rvdw_switch;
2785 if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM)
2787 if (fr->rvdw_switch >= fr->rvdw)
2789 gmx_fatal(FARGS, "rvdw_switch (%f) must be < rvdw (%f)",
2790 fr->rvdw_switch, fr->rvdw);
2794 fprintf(fp, "Using %s Lennard-Jones, switch between %g and %g nm\n",
2795 (fr->eeltype == eelSWITCH) ? "switched" : "shifted",
2796 fr->rvdw_switch, fr->rvdw);
2800 if (fr->bBHAM && EVDW_PME(fr->vdwtype))
2802 gmx_fatal(FARGS, "LJ PME not supported with Buckingham");
2805 if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
2807 gmx_fatal(FARGS, "Switch/shift interaction not supported with Buckingham");
2812 fprintf(fp, "Cut-off's: NS: %g Coulomb: %g %s: %g\n",
2813 fr->rlist, fr->rcoulomb, fr->bBHAM ? "BHAM" : "LJ", fr->rvdw);
2816 fr->eDispCorr = ir->eDispCorr;
2817 if (ir->eDispCorr != edispcNO)
2819 set_avcsixtwelve(fp, fr, mtop);
2824 set_bham_b_max(fp, fr, mtop);
2827 fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
2829 /* Copy the GBSA data (radius, volume and surftens for each
2830 * atomtype) from the topology atomtype section to forcerec.
2832 snew(fr->atype_radius, fr->ntype);
2833 snew(fr->atype_vol, fr->ntype);
2834 snew(fr->atype_surftens, fr->ntype);
2835 snew(fr->atype_gb_radius, fr->ntype);
2836 snew(fr->atype_S_hct, fr->ntype);
2838 if (mtop->atomtypes.nr > 0)
2840 for (i = 0; i < fr->ntype; i++)
2842 fr->atype_radius[i] = mtop->atomtypes.radius[i];
2844 for (i = 0; i < fr->ntype; i++)
2846 fr->atype_vol[i] = mtop->atomtypes.vol[i];
2848 for (i = 0; i < fr->ntype; i++)
2850 fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
2852 for (i = 0; i < fr->ntype; i++)
2854 fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
2856 for (i = 0; i < fr->ntype; i++)
2858 fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
2862 /* Generate the GB table if needed */
2866 fr->gbtabscale = 2000;
2868 fr->gbtabscale = 500;
2872 fr->gbtab = make_gb_table(oenv, fr);
2874 init_gb(&fr->born, fr, ir, mtop, ir->gb_algorithm);
2876 /* Copy local gb data (for dd, this is done in dd_partition_system) */
2877 if (!DOMAINDECOMP(cr))
2879 make_local_gb(cr, fr->born, ir->gb_algorithm);
2883 /* Set the charge scaling */
2884 if (fr->epsilon_r != 0)
2886 fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
2890 /* eps = 0 is infinite dieletric: no coulomb interactions */
2894 /* Reaction field constants */
2895 if (EEL_RF(fr->eeltype))
2897 calc_rffac(fp, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
2898 fr->rcoulomb, fr->temp, fr->zsquare, box,
2899 &fr->kappa, &fr->k_rf, &fr->c_rf);
2902 /*This now calculates sum for q and c6*/
2903 set_chargesum(fp, fr, mtop);
2905 /* if we are using LR electrostatics, and they are tabulated,
2906 * the tables will contain modified coulomb interactions.
2907 * Since we want to use the non-shifted ones for 1-4
2908 * coulombic interactions, we must have an extra set of tables.
2911 /* Construct tables.
2912 * A little unnecessary to make both vdw and coul tables sometimes,
2913 * but what the heck... */
2915 bMakeTables = fr->bcoultab || fr->bvdwtab || fr->bEwald ||
2916 (ir->eDispCorr != edispcNO && ir_vdw_switched(ir));
2918 bMakeSeparate14Table = ((!bMakeTables || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
2919 fr->bBHAM || fr->bEwald) &&
2920 (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
2921 gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
2922 gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
2924 negp_pp = ir->opts.ngener - ir->nwall;
2928 bSomeNormalNbListsAreInUse = TRUE;
2933 bSomeNormalNbListsAreInUse = (ir->eDispCorr != edispcNO);
2934 for (egi = 0; egi < negp_pp; egi++)
2936 for (egj = egi; egj < negp_pp; egj++)
2938 egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
2939 if (!(egp_flags & EGP_EXCL))
2941 if (egp_flags & EGP_TABLE)
2947 bSomeNormalNbListsAreInUse = TRUE;
2952 if (bSomeNormalNbListsAreInUse)
2954 fr->nnblists = negptable + 1;
2958 fr->nnblists = negptable;
2960 if (fr->nnblists > 1)
2962 snew(fr->gid2nblists, ir->opts.ngener*ir->opts.ngener);
2971 snew(fr->nblists, fr->nnblists);
2973 /* This code automatically gives table length tabext without cut-off's,
2974 * in that case grompp should already have checked that we do not need
2975 * normal tables and we only generate tables for 1-4 interactions.
2977 rtab = ir->rlistlong + ir->tabext;
2981 /* make tables for ordinary interactions */
2982 if (bSomeNormalNbListsAreInUse)
2984 make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
2987 make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[fr->nnblists/2]);
2989 if (!bMakeSeparate14Table)
2991 fr->tab14 = fr->nblists[0].table_elec_vdw;
3001 /* Read the special tables for certain energy group pairs */
3002 nm_ind = mtop->groups.grps[egcENER].nm_ind;
3003 for (egi = 0; egi < negp_pp; egi++)
3005 for (egj = egi; egj < negp_pp; egj++)
3007 egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
3008 if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL))
3010 nbl = &(fr->nblists[m]);
3011 if (fr->nnblists > 1)
3013 fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = m;
3015 /* Read the table file with the two energy groups names appended */
3016 make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
3017 *mtop->groups.grpname[nm_ind[egi]],
3018 *mtop->groups.grpname[nm_ind[egj]],
3022 make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
3023 *mtop->groups.grpname[nm_ind[egi]],
3024 *mtop->groups.grpname[nm_ind[egj]],
3025 &fr->nblists[fr->nnblists/2+m]);
3029 else if (fr->nnblists > 1)
3031 fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = 0;
3037 if (bMakeSeparate14Table)
3039 /* generate extra tables with plain Coulomb for 1-4 interactions only */
3040 fr->tab14 = make_tables(fp, oenv, fr, MASTER(cr), tabpfn, rtab,
3041 GMX_MAKETABLES_14ONLY);
3044 /* Read AdResS Thermo Force table if needed */
3045 if (fr->adress_icor == eAdressICThermoForce)
3047 /* old todo replace */
3049 if (ir->adress->n_tf_grps > 0)
3051 make_adress_tf_tables(fp, oenv, fr, ir, tabfn, mtop, box);
3056 /* load the default table */
3057 snew(fr->atf_tabs, 1);
3058 fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp, oenv, fr, tabafn, box);
3063 fr->nwall = ir->nwall;
3064 if (ir->nwall && ir->wall_type == ewtTABLE)
3066 make_wall_tables(fp, oenv, ir, tabfn, &mtop->groups, fr);
3071 fcd->bondtab = make_bonded_tables(fp,
3072 F_TABBONDS, F_TABBONDSNC,
3074 fcd->angletab = make_bonded_tables(fp,
3077 fcd->dihtab = make_bonded_tables(fp,
3085 fprintf(debug, "No fcdata or table file name passed, can not read table, can not do bonded interactions\n");
3089 /* QM/MM initialization if requested
3093 fprintf(stderr, "QM/MM calculation requested.\n");
3096 fr->bQMMM = ir->bQMMM;
3097 fr->qr = mk_QMMMrec();
3099 /* Set all the static charge group info */
3100 fr->cginfo_mb = init_cginfo_mb(fp, mtop, fr, bNoSolvOpt,
3101 &fr->bExcl_IntraCGAll_InterCGNone);
3102 if (DOMAINDECOMP(cr))
3108 fr->cginfo = cginfo_expand(mtop->nmolblock, fr->cginfo_mb);
3111 if (!DOMAINDECOMP(cr))
3113 forcerec_set_ranges(fr, ncg_mtop(mtop), ncg_mtop(mtop),
3114 mtop->natoms, mtop->natoms, mtop->natoms);
3117 fr->print_force = print_force;
3120 /* coarse load balancing vars */
3125 /* Initialize neighbor search */
3126 init_ns(fp, cr, &fr->ns, fr, mtop);
3128 if (cr->duty & DUTY_PP)
3130 gmx_nonbonded_setup(fr, bGenericKernelOnly);
3134 gmx_setup_adress_kernels(fp,bGenericKernelOnly);
3139 /* Initialize the thread working data for bonded interactions */
3140 init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr);
3142 snew(fr->excl_load, fr->nthreads+1);
3144 if (fr->cutoff_scheme == ecutsVERLET)
3146 if (ir->rcoulomb != ir->rvdw)
3148 gmx_fatal(FARGS, "With Verlet lists rcoulomb and rvdw should be identical");
3151 init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
3154 /* fr->ic is used both by verlet and group kernels (to some extent) now */
3155 init_interaction_const(fp, cr, &fr->ic, fr, rtab);
3157 if (ir->eDispCorr != edispcNO)
3159 calc_enervirdiff(fp, ir->eDispCorr, fr);
/* Debug-print helper macros: each expands to an fprintf that stringizes
 * its second argument (#r / #i / #b) so the field's source-code name is
 * printed alongside its value, e.g. "fr->rlist: 1.000000e+00".
 * pr_bool maps the value through the bool_names[] table (declared
 * elsewhere — presumably yields "TRUE"/"FALSE"; confirm against its
 * definition).
 * NOTE(review): pr_real passes fp unparenthesized while pr_int/pr_bool
 * use (fp) — harmless for simple arguments, but inconsistent. */
3163 #define pr_real(fp, r) fprintf(fp, "%s: %e\n",#r, r)
3164 #define pr_int(fp, i) fprintf((fp), "%s: %d\n",#i, i)
3165 #define pr_bool(fp, b) fprintf((fp), "%s: %s\n",#b, bool_names[b])
/* Dump selected fields of a t_forcerec to fp for debugging, using the
 * pr_real/pr_int/pr_bool macros above (each prints "name: value"). */
3167 void pr_forcerec(FILE *fp, t_forcerec *fr)
     /* Cut-off radii, 1-4 electrostatics scaling, and search/list flags. */
3171 pr_real(fp, fr->rlist);
3172 pr_real(fp, fr->rcoulomb);
3173 pr_real(fp, fr->fudgeQQ);
3174 pr_bool(fp, fr->bGrid);
3175 pr_bool(fp, fr->bTwinRange);
3176 /*pr_int(fp,fr->cg0);
3177 pr_int(fp,fr->hcg);*/
     /* Table size (number of points) of each non-bonded list's combined
      * electrostatics+VdW table. */
3178 for (i = 0; i < fr->nnblists; i++)
3180 pr_int(fp, fr->nblists[i].table_elec_vdw.n);
     /* NOTE(review): fr->rcoulomb is printed a second time here (already
      * printed above); this pair may have been intended as
      * rvdw_switch/rvdw — confirm before changing, since it only affects
      * debug output. */
3182 pr_real(fp, fr->rcoulomb_switch);
3183 pr_real(fp, fr->rcoulomb);
3188 void forcerec_set_excl_load(t_forcerec *fr,
3189 const gmx_localtop_t *top)
3192 int t, i, j, ntot, n, ntarget;
3194 ind = top->excls.index;
3198 for (i = 0; i < top->excls.nr; i++)
3200 for (j = ind[i]; j < ind[i+1]; j++)
3209 fr->excl_load[0] = 0;
3212 for (t = 1; t <= fr->nthreads; t++)
3214 ntarget = (ntot*t)/fr->nthreads;
3215 while (i < top->excls.nr && n < ntarget)
3217 for (j = ind[i]; j < ind[i+1]; j++)
3226 fr->excl_load[t] = i;