src/gromacs/mdlib/forcerec.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #ifdef HAVE_CONFIG_H
  38 #include <config.h>
  39 #endif
  40
  41 #include <math.h>
  42 #include <string.h>
  43 #include <assert.h>
  44 #include "sysstuff.h"
  45 #include "typedefs.h"
  46 #include "types/commrec.h"
  47 #include "vec.h"
  48 #include "gromacs/math/utilities.h"
  49 #include "macros.h"
  50 #include "smalloc.h"
  51 #include "macros.h"
  52 #include "gmx_fatal.h"
  53 #include "gmx_fatal_collective.h"
  54 #include "physics.h"
  55 #include "force.h"
  56 #include "tables.h"
  57 #include "nonbonded.h"
  58 #include "invblock.h"
  59 #include "names.h"
  60 #include "network.h"
  61 #include "pbc.h"
  62 #include "ns.h"
  63 #include "mshift.h"
  64 #include "txtdump.h"
  65 #include "coulomb.h"
  66 #include "md_support.h"
  67 #include "md_logging.h"
  68 #include "domdec.h"
  69 #include "qmmm.h"
  70 #include "copyrite.h"
  71 #include "mtop_util.h"
  72 #include "nbnxn_simd.h"
  73 #include "nbnxn_search.h"
  74 #include "nbnxn_atomdata.h"
  75 #include "nbnxn_consts.h"
  76 #include "gmx_omp_nthreads.h"
  77 #include "gmx_detect_hardware.h"
  78 #include "inputrec.h"
  79
  80 #ifdef _MSC_VER
  81 /* MSVC definition for __cpuid() */
  82 #include <intrin.h>
  83 #endif
  84
  85 #include "types/nbnxn_cuda_types_ext.h"
  86 #include "gpu_utils.h"
  87 #include "nbnxn_cuda_data_mgmt.h"
  88 #include "pmalloc_cuda.h"
  89
  90 t_forcerec *mk_forcerec(void)
  91 {
  92     t_forcerec *fr;
  93
  94     snew(fr, 1);
  95
  96     return fr;
  97 }
  98
  99 #ifdef DEBUG
 100 static void pr_nbfp(FILE *fp, real *nbfp, gmx_bool bBHAM, int atnr)
 101 {
 102     int i, j;
 103
 104     for (i = 0; (i < atnr); i++)
 105     {
 106         for (j = 0; (j < atnr); j++)
 107         {
 108             fprintf(fp, "%2d - %2d", i, j);
 109             if (bBHAM)
 110             {
 111                 fprintf(fp, "  a=%10g, b=%10g, c=%10g\n", BHAMA(nbfp, atnr, i, j),
 112                         BHAMB(nbfp, atnr, i, j), BHAMC(nbfp, atnr, i, j)/6.0);
 113             }
 114             else
 115             {
 116                 fprintf(fp, "  c6=%10g, c12=%10g\n", C6(nbfp, atnr, i, j)/6.0,
 117                         C12(nbfp, atnr, i, j)/12.0);
 118             }
 119         }
 120     }
 121 }
 122 #endif
 123
 124 static real *mk_nbfp(const gmx_ffparams_t *idef, gmx_bool bBHAM)
 125 {
 126     real *nbfp;
 127     int   i, j, k, atnr;
 128
 129     atnr = idef->atnr;
 130     if (bBHAM)
 131     {
 132         snew(nbfp, 3*atnr*atnr);
 133         for (i = k = 0; (i < atnr); i++)
 134         {
 135             for (j = 0; (j < atnr); j++, k++)
 136             {
 137                 BHAMA(nbfp, atnr, i, j) = idef->iparams[k].bham.a;
 138                 BHAMB(nbfp, atnr, i, j) = idef->iparams[k].bham.b;
 139                 /* nbfp now includes the 6.0 derivative prefactor */
 140                 BHAMC(nbfp, atnr, i, j) = idef->iparams[k].bham.c*6.0;
 141             }
 142         }
 143     }
 144     else
 145     {
 146         snew(nbfp, 2*atnr*atnr);
 147         for (i = k = 0; (i < atnr); i++)
 148         {
 149             for (j = 0; (j < atnr); j++, k++)
 150             {
 151                 /* nbfp now includes the 6.0/12.0 derivative prefactors */
 152                 C6(nbfp, atnr, i, j)   = idef->iparams[k].lj.c6*6.0;
 153                 C12(nbfp, atnr, i, j)  = idef->iparams[k].lj.c12*12.0;
 154             }
 155         }
 156     }
 157
 158     return nbfp;
 159 }
 160
 161 static real *make_ljpme_c6grid(const gmx_ffparams_t *idef, t_forcerec *fr)
 162 {
 163     int   i, j, k, atnr;
 164     real  c6, c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
 165     real *grid;
 166
 167     /* For LJ-PME simulations, we correct the energies with the reciprocal space
 168      * inside of the cut-off. To do this the non-bonded kernels needs to have
 169      * access to the C6-values used on the reciprocal grid in pme.c
 170      */
 171
 172     atnr = idef->atnr;
 173     snew(grid, 2*atnr*atnr);
 174     for (i = k = 0; (i < atnr); i++)
 175     {
 176         for (j = 0; (j < atnr); j++, k++)
 177         {
 178             c6i  = idef->iparams[i*(atnr+1)].lj.c6;
 179             c12i = idef->iparams[i*(atnr+1)].lj.c12;
 180             c6j  = idef->iparams[j*(atnr+1)].lj.c6;
 181             c12j = idef->iparams[j*(atnr+1)].lj.c12;
 182             c6   = sqrt(c6i * c6j);
 183             if (fr->ljpme_combination_rule == eljpmeLB
 184                 && !gmx_numzero(c6) && !gmx_numzero(c12i) && !gmx_numzero(c12j))
 185             {
 186                 sigmai = pow(c12i / c6i, 1.0/6.0);
 187                 sigmaj = pow(c12j / c6j, 1.0/6.0);
 188                 epsi   = c6i * c6i / c12i;
 189                 epsj   = c6j * c6j / c12j;
 190                 c6     = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 6);
 191             }
 192             /* Store the elements at the same relative positions as C6 in nbfp in order
 193              * to simplify access in the kernels
 194              */
 195             grid[2*(atnr*i+j)] = c6*6.0;
 196         }
 197     }
 198     return grid;
 199 }
 200
 201 static real *mk_nbfp_combination_rule(const gmx_ffparams_t *idef, int comb_rule)
 202 {
 203     real *nbfp;
 204     int   i, j, k, atnr;
 205     real  c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
 206     real  c6, c12;
 207
 208     atnr = idef->atnr;
 209     snew(nbfp, 2*atnr*atnr);
 210     for (i = 0; i < atnr; ++i)
 211     {
 212         for (j = 0; j < atnr; ++j)
 213         {
 214             c6i  = idef->iparams[i*(atnr+1)].lj.c6;
 215             c12i = idef->iparams[i*(atnr+1)].lj.c12;
 216             c6j  = idef->iparams[j*(atnr+1)].lj.c6;
 217             c12j = idef->iparams[j*(atnr+1)].lj.c12;
 218             c6   = sqrt(c6i  * c6j);
 219             c12  = sqrt(c12i * c12j);
 220             if (comb_rule == eCOMB_ARITHMETIC
 221                 && !gmx_numzero(c6) && !gmx_numzero(c12))
 222             {
 223                 sigmai = pow(c12i / c6i, 1.0/6.0);
 224                 sigmaj = pow(c12j / c6j, 1.0/6.0);
 225                 epsi   = c6i * c6i / c12i;
 226                 epsj   = c6j * c6j / c12j;
 227                 c6     = epsi * epsj * pow(0.5*(sigmai+sigmaj), 6);
 228                 c12    = epsi * epsj * pow(0.5*(sigmai+sigmaj), 12);
 229             }
 230             C6(nbfp, atnr, i, j)   = c6*6.0;
 231             C12(nbfp, atnr, i, j)  = c12*12.0;
 232         }
 233     }
 234     return nbfp;
 235 }
 236
 237 /* This routine sets fr->solvent_opt to the most common solvent in the
 238  * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in
 239  * the fr->solvent_type array with the correct type (or esolNO).
 240  *
 241  * Charge groups that fulfill the conditions but are not identical to the
 242  * most common one will be marked as esolNO in the solvent_type array.
 243  *
 244  * TIP3p is identical to SPC for these purposes, so we call it
 245  * SPC in the arrays (Apologies to Bill Jorgensen ;-)
 246  *
 247  * NOTE: QM particle should not
 248  * become an optimized solvent. Not even if there is only one charge
 249  * group in the Qm
 250  */
 251
 252 typedef struct
 253 {
 254     int    model;
 255     int    count;
 256     int    vdwtype[4];
 257     real   charge[4];
 258 } solvent_parameters_t;
 259
 260 static void
 261 check_solvent_cg(const gmx_moltype_t    *molt,
 262                  int                     cg0,
 263                  int                     nmol,
 264                  const unsigned char    *qm_grpnr,
 265                  const t_grps           *qm_grps,
 266                  t_forcerec   *          fr,
 267                  int                    *n_solvent_parameters,
 268                  solvent_parameters_t  **solvent_parameters_p,
 269                  int                     cginfo,
 270                  int                    *cg_sp)
 271 {
 272     const t_blocka       *excl;
 273     t_atom               *atom;
 274     int                   j, k;
 275     int                   j0, j1, nj;
 276     gmx_bool              perturbed;
 277     gmx_bool              has_vdw[4];
 278     gmx_bool              match;
 279     real                  tmp_charge[4]  = { 0.0 }; /* init to zero to make gcc4.8 happy */
 280     int                   tmp_vdwtype[4] = { 0 };   /* init to zero to make gcc4.8 happy */
 281     int                   tjA;
 282     gmx_bool              qm;
 283     solvent_parameters_t *solvent_parameters;
 284
 285     /* We use a list with parameters for each solvent type.
 286      * Every time we discover a new molecule that fulfills the basic
 287      * conditions for a solvent we compare with the previous entries
 288      * in these lists. If the parameters are the same we just increment
 289      * the counter for that type, and otherwise we create a new type
 290      * based on the current molecule.
 291      *
 292      * Once we've finished going through all molecules we check which
 293      * solvent is most common, and mark all those molecules while we
 294      * clear the flag on all others.
 295      */
 296
 297     solvent_parameters = *solvent_parameters_p;
 298
 299     /* Mark the cg first as non optimized */
 300     *cg_sp = -1;
 301
 302     /* Check if this cg has no exclusions with atoms in other charge groups
 303      * and all atoms inside the charge group excluded.
 304      * We only have 3 or 4 atom solvent loops.
 305      */
 306     if (GET_CGINFO_EXCL_INTER(cginfo) ||
 307         !GET_CGINFO_EXCL_INTRA(cginfo))
 308     {
 309         return;
 310     }
 311
 312     /* Get the indices of the first atom in this charge group */
 313     j0     = molt->cgs.index[cg0];
 314     j1     = molt->cgs.index[cg0+1];
 315
 316     /* Number of atoms in our molecule */
 317     nj     = j1 - j0;
 318
 319     if (debug)
 320     {
 321         fprintf(debug,
 322                 "Moltype '%s': there are %d atoms in this charge group\n",
 323                 *molt->name, nj);
 324     }
 325
 326     /* Check if it could be an SPC (3 atoms) or TIP4p (4) water,
 327      * otherwise skip it.
 328      */
 329     if (nj < 3 || nj > 4)
 330     {
 331         return;
 332     }
 333
 334     /* Check if we are doing QM on this group */
 335     qm = FALSE;
 336     if (qm_grpnr != NULL)
 337     {
 338         for (j = j0; j < j1 && !qm; j++)
 339         {
 340             qm = (qm_grpnr[j] < qm_grps->nr - 1);
 341         }
 342     }
 343     /* Cannot use solvent optimization with QM */
 344     if (qm)
 345     {
 346         return;
 347     }
 348
 349     atom = molt->atoms.atom;
 350
 351     /* Still looks like a solvent, time to check parameters */
 352
 353     /* If it is perturbed (free energy) we can't use the solvent loops,
 354      * so then we just skip to the next molecule.
 355      */
 356     perturbed = FALSE;
 357
 358     for (j = j0; j < j1 && !perturbed; j++)
 359     {
 360         perturbed = PERTURBED(atom[j]);
 361     }
 362
 363     if (perturbed)
 364     {
 365         return;
 366     }
 367
 368     /* Now it's only a question if the VdW and charge parameters
 369      * are OK. Before doing the check we compare and see if they are
 370      * identical to a possible previous solvent type.
 371      * First we assign the current types and charges.
 372      */
 373     for (j = 0; j < nj; j++)
 374     {
 375         tmp_vdwtype[j] = atom[j0+j].type;
 376         tmp_charge[j]  = atom[j0+j].q;
 377     }
 378
 379     /* Does it match any previous solvent type? */
 380     for (k = 0; k < *n_solvent_parameters; k++)
 381     {
 382         match = TRUE;
 383
 384
 385         /* We can only match SPC with 3 atoms and TIP4p with 4 atoms */
 386         if ( (solvent_parameters[k].model == esolSPC   && nj != 3)  ||
 387              (solvent_parameters[k].model == esolTIP4P && nj != 4) )
 388         {
 389             match = FALSE;
 390         }
 391
 392         /* Check that types & charges match for all atoms in molecule */
 393         for (j = 0; j < nj && match == TRUE; j++)
 394         {
 395             if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
 396             {
 397                 match = FALSE;
 398             }
 399             if (tmp_charge[j] != solvent_parameters[k].charge[j])
 400             {
 401                 match = FALSE;
 402             }
 403         }
 404         if (match == TRUE)
 405         {
 406             /* Congratulations! We have a matched solvent.
 407              * Flag it with this type for later processing.
 408              */
 409             *cg_sp = k;
 410             solvent_parameters[k].count += nmol;
 411
 412             /* We are done with this charge group */
 413             return;
 414         }
 415     }
 416
 417     /* If we get here, we have a tentative new solvent type.
 418      * Before we add it we must check that it fulfills the requirements
 419      * of the solvent optimized loops. First determine which atoms have
 420      * VdW interactions.
 421      */
 422     for (j = 0; j < nj; j++)
 423     {
 424         has_vdw[j] = FALSE;
 425         tjA        = tmp_vdwtype[j];
 426
 427         /* Go through all other tpes and see if any have non-zero
 428          * VdW parameters when combined with this one.
 429          */
 430         for (k = 0; k < fr->ntype && (has_vdw[j] == FALSE); k++)
 431         {
 432             /* We already checked that the atoms weren't perturbed,
 433              * so we only need to check state A now.
 434              */
 435             if (fr->bBHAM)
 436             {
 437                 has_vdw[j] = (has_vdw[j] ||
 438                               (BHAMA(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
 439                               (BHAMB(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
 440                               (BHAMC(fr->nbfp, fr->ntype, tjA, k) != 0.0));
 441             }
 442             else
 443             {
 444                 /* Standard LJ */
 445                 has_vdw[j] = (has_vdw[j] ||
 446                               (C6(fr->nbfp, fr->ntype, tjA, k)  != 0.0) ||
 447                               (C12(fr->nbfp, fr->ntype, tjA, k) != 0.0));
 448             }
 449         }
 450     }
 451
 452     /* Now we know all we need to make the final check and assignment. */
 453     if (nj == 3)
 454     {
 455         /* So, is it an SPC?
 456          * For this we require thatn all atoms have charge,
 457          * the charges on atom 2 & 3 should be the same, and only
 458          * atom 1 might have VdW.
 459          */
 460         if (has_vdw[1] == FALSE &&
 461             has_vdw[2] == FALSE &&
 462             tmp_charge[0]  != 0 &&
 463             tmp_charge[1]  != 0 &&
 464             tmp_charge[2]  == tmp_charge[1])
 465         {
 466             srenew(solvent_parameters, *n_solvent_parameters+1);
 467             solvent_parameters[*n_solvent_parameters].model = esolSPC;
 468             solvent_parameters[*n_solvent_parameters].count = nmol;
 469             for (k = 0; k < 3; k++)
 470             {
 471                 solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 472                 solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 473             }
 474
 475             *cg_sp = *n_solvent_parameters;
 476             (*n_solvent_parameters)++;
 477         }
 478     }
 479     else if (nj == 4)
 480     {
 481         /* Or could it be a TIP4P?
 482          * For this we require thatn atoms 2,3,4 have charge, but not atom 1.
 483          * Only atom 1 mght have VdW.
 484          */
 485         if (has_vdw[1] == FALSE &&
 486             has_vdw[2] == FALSE &&
 487             has_vdw[3] == FALSE &&
 488             tmp_charge[0]  == 0 &&
 489             tmp_charge[1]  != 0 &&
 490             tmp_charge[2]  == tmp_charge[1] &&
 491             tmp_charge[3]  != 0)
 492         {
 493             srenew(solvent_parameters, *n_solvent_parameters+1);
 494             solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
 495             solvent_parameters[*n_solvent_parameters].count = nmol;
 496             for (k = 0; k < 4; k++)
 497             {
 498                 solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 499                 solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 500             }
 501
 502             *cg_sp = *n_solvent_parameters;
 503             (*n_solvent_parameters)++;
 504         }
 505     }
 506
 507     *solvent_parameters_p = solvent_parameters;
 508 }
 509
 510 static void
 511 check_solvent(FILE  *                fp,
 512               const gmx_mtop_t  *    mtop,
 513               t_forcerec  *          fr,
 514               cginfo_mb_t           *cginfo_mb)
 515 {
 516     const t_block     *   cgs;
 517     const t_block     *   mols;
 518     const gmx_moltype_t  *molt;
 519     int                   mb, mol, cg_mol, at_offset, cg_offset, am, cgm, i, nmol_ch, nmol;
 520     int                   n_solvent_parameters;
 521     solvent_parameters_t *solvent_parameters;
 522     int                 **cg_sp;
 523     int                   bestsp, bestsol;
 524
 525     if (debug)
 526     {
 527         fprintf(debug, "Going to determine what solvent types we have.\n");
 528     }
 529
 530     mols = &mtop->mols;
 531
 532     n_solvent_parameters = 0;
 533     solvent_parameters   = NULL;
 534     /* Allocate temporary array for solvent type */
 535     snew(cg_sp, mtop->nmolblock);
 536
 537     cg_offset = 0;
 538     at_offset = 0;
 539     for (mb = 0; mb < mtop->nmolblock; mb++)
 540     {
 541         molt = &mtop->moltype[mtop->molblock[mb].type];
 542         cgs  = &molt->cgs;
 543         /* Here we have to loop over all individual molecules
 544          * because we need to check for QMMM particles.
 545          */
 546         snew(cg_sp[mb], cginfo_mb[mb].cg_mod);
 547         nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr;
 548         nmol    = mtop->molblock[mb].nmol/nmol_ch;
 549         for (mol = 0; mol < nmol_ch; mol++)
 550         {
 551             cgm = mol*cgs->nr;
 552             am  = mol*cgs->index[cgs->nr];
 553             for (cg_mol = 0; cg_mol < cgs->nr; cg_mol++)
 554             {
 555                 check_solvent_cg(molt, cg_mol, nmol,
 556                                  mtop->groups.grpnr[egcQMMM] ?
 557                                  mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
 558                                  &mtop->groups.grps[egcQMMM],
 559                                  fr,
 560                                  &n_solvent_parameters, &solvent_parameters,
 561                                  cginfo_mb[mb].cginfo[cgm+cg_mol],
 562                                  &cg_sp[mb][cgm+cg_mol]);
 563             }
 564         }
 565         cg_offset += cgs->nr;
 566         at_offset += cgs->index[cgs->nr];
 567     }
 568
 569     /* Puh! We finished going through all charge groups.
 570      * Now find the most common solvent model.
 571      */
 572
 573     /* Most common solvent this far */
 574     bestsp = -2;
 575     for (i = 0; i < n_solvent_parameters; i++)
 576     {
 577         if (bestsp == -2 ||
 578             solvent_parameters[i].count > solvent_parameters[bestsp].count)
 579         {
 580             bestsp = i;
 581         }
 582     }
 583
 584     if (bestsp >= 0)
 585     {
 586         bestsol = solvent_parameters[bestsp].model;
 587     }
 588     else
 589     {
 590         bestsol = esolNO;
 591     }
 592
 593 #ifdef DISABLE_WATER_NLIST
 594     bestsol = esolNO;
 595 #endif
 596
 597     fr->nWatMol = 0;
 598     for (mb = 0; mb < mtop->nmolblock; mb++)
 599     {
 600         cgs  = &mtop->moltype[mtop->molblock[mb].type].cgs;
 601         nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
 602         for (i = 0; i < cginfo_mb[mb].cg_mod; i++)
 603         {
 604             if (cg_sp[mb][i] == bestsp)
 605             {
 606                 SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], bestsol);
 607                 fr->nWatMol += nmol;
 608             }
 609             else
 610             {
 611                 SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], esolNO);
 612             }
 613         }
 614         sfree(cg_sp[mb]);
 615     }
 616     sfree(cg_sp);
 617
 618     if (bestsol != esolNO && fp != NULL)
 619     {
 620         fprintf(fp, "\nEnabling %s-like water optimization for %d molecules.\n\n",
 621                 esol_names[bestsol],
 622                 solvent_parameters[bestsp].count);
 623     }
 624
 625     sfree(solvent_parameters);
 626     fr->solvent_opt = bestsol;
 627 }
 628
 629 enum {
 630     acNONE = 0, acCONSTRAINT, acSETTLE
 631 };
 632
 633 static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop,
 634                                    t_forcerec *fr, gmx_bool bNoSolvOpt,
 635                                    gmx_bool *bFEP_NonBonded,
 636                                    gmx_bool *bExcl_IntraCGAll_InterCGNone)
 637 {
 638     const t_block        *cgs;
 639     const t_blocka       *excl;
 640     const gmx_moltype_t  *molt;
 641     const gmx_molblock_t *molb;
 642     cginfo_mb_t          *cginfo_mb;
 643     gmx_bool             *type_VDW;
 644     int                  *cginfo;
 645     int                   cg_offset, a_offset, cgm, am;
 646     int                   mb, m, ncg_tot, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
 647     int                  *a_con;
 648     int                   ftype;
 649     int                   ia;
 650     gmx_bool              bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ, bHavePerturbedAtoms;
 651
 652     ncg_tot = ncg_mtop(mtop);
 653     snew(cginfo_mb, mtop->nmolblock);
 654
 655     snew(type_VDW, fr->ntype);
 656     for (ai = 0; ai < fr->ntype; ai++)
 657     {
 658         type_VDW[ai] = FALSE;
 659         for (j = 0; j < fr->ntype; j++)
 660         {
 661             type_VDW[ai] = type_VDW[ai] ||
 662                 fr->bBHAM ||
 663                 C6(fr->nbfp, fr->ntype, ai, j) != 0 ||
 664                 C12(fr->nbfp, fr->ntype, ai, j) != 0;
 665         }
 666     }
 667
 668     *bFEP_NonBonded               = FALSE;
 669     *bExcl_IntraCGAll_InterCGNone = TRUE;
 670
 671     excl_nalloc = 10;
 672     snew(bExcl, excl_nalloc);
 673     cg_offset = 0;
 674     a_offset  = 0;
 675     for (mb = 0; mb < mtop->nmolblock; mb++)
 676     {
 677         molb = &mtop->molblock[mb];
 678         molt = &mtop->moltype[molb->type];
 679         cgs  = &molt->cgs;
 680         excl = &molt->excls;
 681
 682         /* Check if the cginfo is identical for all molecules in this block.
 683          * If so, we only need an array of the size of one molecule.
 684          * Otherwise we make an array of #mol times #cgs per molecule.
 685          */
 686         bId = TRUE;
 687         am  = 0;
 688         for (m = 0; m < molb->nmol; m++)
 689         {
 690             am = m*cgs->index[cgs->nr];
 691             for (cg = 0; cg < cgs->nr; cg++)
 692             {
 693                 a0 = cgs->index[cg];
 694                 a1 = cgs->index[cg+1];
 695                 if (ggrpnr(&mtop->groups, egcENER, a_offset+am+a0) !=
 696                     ggrpnr(&mtop->groups, egcENER, a_offset   +a0))
 697                 {
 698                     bId = FALSE;
 699                 }
 700                 if (mtop->groups.grpnr[egcQMMM] != NULL)
 701                 {
 702                     for (ai = a0; ai < a1; ai++)
 703                     {
 704                         if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
 705                             mtop->groups.grpnr[egcQMMM][a_offset   +ai])
 706                         {
 707                             bId = FALSE;
 708                         }
 709                     }
 710                 }
 711             }
 712         }
 713
 714         cginfo_mb[mb].cg_start = cg_offset;
 715         cginfo_mb[mb].cg_end   = cg_offset + molb->nmol*cgs->nr;
 716         cginfo_mb[mb].cg_mod   = (bId ? 1 : molb->nmol)*cgs->nr;
 717         snew(cginfo_mb[mb].cginfo, cginfo_mb[mb].cg_mod);
 718         cginfo = cginfo_mb[mb].cginfo;
 719
 720         /* Set constraints flags for constrained atoms */
 721         snew(a_con, molt->atoms.nr);
 722         for (ftype = 0; ftype < F_NRE; ftype++)
 723         {
 724             if (interaction_function[ftype].flags & IF_CONSTRAINT)
 725             {
 726                 int nral;
 727
 728                 nral = NRAL(ftype);
 729                 for (ia = 0; ia < molt->ilist[ftype].nr; ia += 1+nral)
 730                 {
 731                     int a;
 732
 733                     for (a = 0; a < nral; a++)
 734                     {
 735                         a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
 736                             (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
 737                     }
 738                 }
 739             }
 740         }
 741
 742         for (m = 0; m < (bId ? 1 : molb->nmol); m++)
 743         {
 744             cgm = m*cgs->nr;
 745             am  = m*cgs->index[cgs->nr];
 746             for (cg = 0; cg < cgs->nr; cg++)
 747             {
 748                 a0 = cgs->index[cg];
 749                 a1 = cgs->index[cg+1];
 750
 751                 /* Store the energy group in cginfo */
 752                 gid = ggrpnr(&mtop->groups, egcENER, a_offset+am+a0);
 753                 SET_CGINFO_GID(cginfo[cgm+cg], gid);
 754
 755                 /* Check the intra/inter charge group exclusions */
 756                 if (a1-a0 > excl_nalloc)
 757                 {
 758                     excl_nalloc = a1 - a0;
 759                     srenew(bExcl, excl_nalloc);
 760                 }
 761                 /* bExclIntraAll: all intra cg interactions excluded
 762                  * bExclInter:    any inter cg interactions excluded
 763                  */
 764                 bExclIntraAll       = TRUE;
 765                 bExclInter          = FALSE;
 766                 bHaveVDW            = FALSE;
 767                 bHaveQ              = FALSE;
 768                 bHavePerturbedAtoms = FALSE;
 769                 for (ai = a0; ai < a1; ai++)
 770                 {
 771                     /* Check VDW and electrostatic interactions */
 772                     bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
 773                                             type_VDW[molt->atoms.atom[ai].typeB]);
 774                     bHaveQ  = bHaveQ    || (molt->atoms.atom[ai].q != 0 ||
 775                                             molt->atoms.atom[ai].qB != 0);
 776
 777                     bHavePerturbedAtoms = bHavePerturbedAtoms || (PERTURBED(molt->atoms.atom[ai]) != 0);
 778
 779                     /* Clear the exclusion list for atom ai */
 780                     for (aj = a0; aj < a1; aj++)
 781                     {
 782                         bExcl[aj-a0] = FALSE;
 783                     }
 784                     /* Loop over all the exclusions of atom ai */
 785                     for (j = excl->index[ai]; j < excl->index[ai+1]; j++)
 786                     {
 787                         aj = excl->a[j];
 788                         if (aj < a0 || aj >= a1)
 789                         {
 790                             bExclInter = TRUE;
 791                         }
 792                         else
 793                         {
 794                             bExcl[aj-a0] = TRUE;
 795                         }
 796                     }
 797                     /* Check if ai excludes a0 to a1 */
 798                     for (aj = a0; aj < a1; aj++)
 799                     {
 800                         if (!bExcl[aj-a0])
 801                         {
 802                             bExclIntraAll = FALSE;
 803                         }
 804                     }
 805
 806                     switch (a_con[ai])
 807                     {
 808                         case acCONSTRAINT:
 809                             SET_CGINFO_CONSTR(cginfo[cgm+cg]);
 810                             break;
 811                         case acSETTLE:
 812                             SET_CGINFO_SETTLE(cginfo[cgm+cg]);
 813                             break;
 814                         default:
 815                             break;
 816                     }
 817                 }
 818                 if (bExclIntraAll)
 819                 {
 820                     SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
 821                 }
 822                 if (bExclInter)
 823                 {
 824                     SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
 825                 }
 826                 if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
 827                 {
 828                     /* The size in cginfo is currently only read with DD */
 829                     gmx_fatal(FARGS, "A charge group has size %d which is larger than the limit of %d atoms", a1-a0, MAX_CHARGEGROUP_SIZE);
 830                 }
 831                 if (bHaveVDW)
 832                 {
 833                     SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
 834                 }
 835                 if (bHaveQ)
 836                 {
 837                     SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
 838                 }
 839                 if (bHavePerturbedAtoms && fr->efep != efepNO)
 840                 {
 841                     SET_CGINFO_FEP(cginfo[cgm+cg]);
 842                     *bFEP_NonBonded = TRUE;
 843                 }
 844                 /* Store the charge group size */
 845                 SET_CGINFO_NATOMS(cginfo[cgm+cg], a1-a0);
 846
 847                 if (!bExclIntraAll || bExclInter)
 848                 {
 849                     *bExcl_IntraCGAll_InterCGNone = FALSE;
 850                 }
 851             }
 852         }
 853
 854         sfree(a_con);
 855
 856         cg_offset += molb->nmol*cgs->nr;
 857         a_offset  += molb->nmol*cgs->index[cgs->nr];
 858     }
 859     sfree(bExcl);
 860
 861     /* the solvent optimizer is called after the QM is initialized,
 862      * because we don't want to have the QM subsystemto become an
 863      * optimized solvent
 864      */
 865
 866     check_solvent(fplog, mtop, fr, cginfo_mb);
 867
 868     if (getenv("GMX_NO_SOLV_OPT"))
 869     {
 870         if (fplog)
 871         {
 872             fprintf(fplog, "Found environment variable GMX_NO_SOLV_OPT.\n"
 873                     "Disabling all solvent optimization\n");
 874         }
 875         fr->solvent_opt = esolNO;
 876     }
 877     if (bNoSolvOpt)
 878     {
 879         fr->solvent_opt = esolNO;
 880     }
 881     if (!fr->solvent_opt)
 882     {
 883         for (mb = 0; mb < mtop->nmolblock; mb++)
 884         {
 885             for (cg = 0; cg < cginfo_mb[mb].cg_mod; cg++)
 886             {
 887                 SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg], esolNO);
 888             }
 889         }
 890     }
 891
 892     return cginfo_mb;
 893 }
 894
 895 static int *cginfo_expand(int nmb, cginfo_mb_t *cgi_mb)
 896 {
 897     int  ncg, mb, cg;
 898     int *cginfo;
 899
 900     ncg = cgi_mb[nmb-1].cg_end;
 901     snew(cginfo, ncg);
 902     mb = 0;
 903     for (cg = 0; cg < ncg; cg++)
 904     {
 905         while (cg >= cgi_mb[mb].cg_end)
 906         {
 907             mb++;
 908         }
 909         cginfo[cg] =
 910             cgi_mb[mb].cginfo[(cg - cgi_mb[mb].cg_start) % cgi_mb[mb].cg_mod];
 911     }
 912
 913     return cginfo;
 914 }
 915
 916 static void set_chargesum(FILE *log, t_forcerec *fr, const gmx_mtop_t *mtop)
 917 {
 918     /*This now calculates sum for q and c6*/
 919     double         qsum, q2sum, q, c6sum, c6;
 920     int            mb, nmol, i;
 921     const t_atoms *atoms;
 922
 923     qsum   = 0;
 924     q2sum  = 0;
 925     c6sum  = 0;
 926     for (mb = 0; mb < mtop->nmolblock; mb++)
 927     {
 928         nmol  = mtop->molblock[mb].nmol;
 929         atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 930         for (i = 0; i < atoms->nr; i++)
 931         {
 932             q       = atoms->atom[i].q;
 933             qsum   += nmol*q;
 934             q2sum  += nmol*q*q;
 935             c6      = mtop->ffparams.iparams[atoms->atom[i].type*(mtop->ffparams.atnr+1)].lj.c6;
 936             c6sum  += nmol*c6;
 937         }
 938     }
 939     fr->qsum[0]   = qsum;
 940     fr->q2sum[0]  = q2sum;
 941     fr->c6sum[0]  = c6sum;
 942
 943     if (fr->efep != efepNO)
 944     {
 945         qsum   = 0;
 946         q2sum  = 0;
 947         c6sum  = 0;
 948         for (mb = 0; mb < mtop->nmolblock; mb++)
 949         {
 950             nmol  = mtop->molblock[mb].nmol;
 951             atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 952             for (i = 0; i < atoms->nr; i++)
 953             {
 954                 q       = atoms->atom[i].qB;
 955                 qsum   += nmol*q;
 956                 q2sum  += nmol*q*q;
 957                 c6      = mtop->ffparams.iparams[atoms->atom[i].typeB*(mtop->ffparams.atnr+1)].lj.c6;
 958                 c6sum  += nmol*c6;
 959             }
 960             fr->qsum[1]   = qsum;
 961             fr->q2sum[1]  = q2sum;
 962             fr->c6sum[1]  = c6sum;
 963         }
 964     }
 965     else
 966     {
 967         fr->qsum[1]   = fr->qsum[0];
 968         fr->q2sum[1]  = fr->q2sum[0];
 969         fr->c6sum[1]  = fr->c6sum[0];
 970     }
 971     if (log)
 972     {
 973         if (fr->efep == efepNO)
 974         {
 975             fprintf(log, "System total charge: %.3f\n", fr->qsum[0]);
 976         }
 977         else
 978         {
 979             fprintf(log, "System total charge, top. A: %.3f top. B: %.3f\n",
 980                     fr->qsum[0], fr->qsum[1]);
 981         }
 982     }
 983 }
 984
 985 void update_forcerec(t_forcerec *fr, matrix box)
 986 {
 987     if (fr->eeltype == eelGRF)
 988     {
 989         calc_rffac(NULL, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
 990                    fr->rcoulomb, fr->temp, fr->zsquare, box,
 991                    &fr->kappa, &fr->k_rf, &fr->c_rf);
 992     }
 993 }
 994
 995 void set_avcsixtwelve(FILE *fplog, t_forcerec *fr, const gmx_mtop_t *mtop)
 996 {
 997     const t_atoms  *atoms, *atoms_tpi;
 998     const t_blocka *excl;
 999     int             mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, n, nexcl, q;
1000     gmx_int64_t     npair, npair_ij, tmpi, tmpj;
1001     double          csix, ctwelve;
1002     int             ntp, *typecount;
1003     gmx_bool        bBHAM;
1004     real           *nbfp;
1005     real           *nbfp_comb = NULL;
1006
1007     ntp   = fr->ntype;
1008     bBHAM = fr->bBHAM;
1009     nbfp  = fr->nbfp;
1010
1011     /* For LJ-PME, we want to correct for the difference between the
1012      * actual C6 values and the C6 values used by the LJ-PME based on
1013      * combination rules. */
1014
1015     if (EVDW_PME(fr->vdwtype))
1016     {
1017         nbfp_comb = mk_nbfp_combination_rule(&mtop->ffparams,
1018                                              (fr->ljpme_combination_rule == eljpmeLB) ? eCOMB_ARITHMETIC : eCOMB_GEOMETRIC);
1019         for (tpi = 0; tpi < ntp; ++tpi)
1020         {
1021             for (tpj = 0; tpj < ntp; ++tpj)
1022             {
1023                 C6(nbfp_comb, ntp, tpi, tpj) =
1024                     C6(nbfp, ntp, tpi, tpj) - C6(nbfp_comb, ntp, tpi, tpj);
1025                 C12(nbfp_comb, ntp, tpi, tpj) = C12(nbfp, ntp, tpi, tpj);
1026             }
1027         }
1028         nbfp = nbfp_comb;
1029     }
1030     for (q = 0; q < (fr->efep == efepNO ? 1 : 2); q++)
1031     {
1032         csix    = 0;
1033         ctwelve = 0;
1034         npair   = 0;
1035         nexcl   = 0;
1036         if (!fr->n_tpi)
1037         {
1038             /* Count the types so we avoid natoms^2 operations */
1039             snew(typecount, ntp);
1040             gmx_mtop_count_atomtypes(mtop, q, typecount);
1041
1042             for (tpi = 0; tpi < ntp; tpi++)
1043             {
1044                 for (tpj = tpi; tpj < ntp; tpj++)
1045                 {
1046                     tmpi = typecount[tpi];
1047                     tmpj = typecount[tpj];
1048                     if (tpi != tpj)
1049                     {
1050                         npair_ij = tmpi*tmpj;
1051                     }
1052                     else
1053                     {
1054                         npair_ij = tmpi*(tmpi - 1)/2;
1055                     }
1056                     if (bBHAM)
1057                     {
1058                         /* nbfp now includes the 6.0 derivative prefactor */
1059                         csix    += npair_ij*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
1060                     }
1061                     else
1062                     {
1063                         /* nbfp now includes the 6.0/12.0 derivative prefactors */
1064                         csix    += npair_ij*   C6(nbfp, ntp, tpi, tpj)/6.0;
1065                         ctwelve += npair_ij*  C12(nbfp, ntp, tpi, tpj)/12.0;
1066                     }
1067                     npair += npair_ij;
1068                 }
1069             }
1070             sfree(typecount);
1071             /* Subtract the excluded pairs.
1072              * The main reason for substracting exclusions is that in some cases
1073              * some combinations might never occur and the parameters could have
1074              * any value. These unused values should not influence the dispersion
1075              * correction.
1076              */
1077             for (mb = 0; mb < mtop->nmolblock; mb++)
1078             {
1079                 nmol  = mtop->molblock[mb].nmol;
1080                 atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
1081                 excl  = &mtop->moltype[mtop->molblock[mb].type].excls;
1082                 for (i = 0; (i < atoms->nr); i++)
1083                 {
1084                     if (q == 0)
1085                     {
1086                         tpi = atoms->atom[i].type;
1087                     }
1088                     else
1089                     {
1090                         tpi = atoms->atom[i].typeB;
1091                     }
1092                     j1  = excl->index[i];
1093                     j2  = excl->index[i+1];
1094                     for (j = j1; j < j2; j++)
1095                     {
1096                         k = excl->a[j];
1097                         if (k > i)
1098                         {
1099                             if (q == 0)
1100                             {
1101                                 tpj = atoms->atom[k].type;
1102                             }
1103                             else
1104                             {
1105                                 tpj = atoms->atom[k].typeB;
1106                             }
1107                             if (bBHAM)
1108                             {
1109                                 /* nbfp now includes the 6.0 derivative prefactor */
1110                                 csix -= nmol*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
1111                             }
1112                             else
1113                             {
1114                                 /* nbfp now includes the 6.0/12.0 derivative prefactors */
1115                                 csix    -= nmol*C6 (nbfp, ntp, tpi, tpj)/6.0;
1116                                 ctwelve -= nmol*C12(nbfp, ntp, tpi, tpj)/12.0;
1117                             }
1118                             nexcl += nmol;
1119                         }
1120                     }
1121                 }
1122             }
1123         }
1124         else
1125         {
1126             /* Only correct for the interaction of the test particle
1127              * with the rest of the system.
1128              */
1129             atoms_tpi =
1130                 &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
1131
1132             npair = 0;
1133             for (mb = 0; mb < mtop->nmolblock; mb++)
1134             {
1135                 nmol  = mtop->molblock[mb].nmol;
1136                 atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
1137                 for (j = 0; j < atoms->nr; j++)
1138                 {
1139                     nmolc = nmol;
1140                     /* Remove the interaction of the test charge group
1141                      * with itself.
1142                      */
1143                     if (mb == mtop->nmolblock-1)
1144                     {
1145                         nmolc--;
1146
1147                         if (mb == 0 && nmol == 1)
1148                         {
1149                             gmx_fatal(FARGS, "Old format tpr with TPI, please generate a new tpr file");
1150                         }
1151                     }
1152                     if (q == 0)
1153                     {
1154                         tpj = atoms->atom[j].type;
1155                     }
1156                     else
1157                     {
1158                         tpj = atoms->atom[j].typeB;
1159                     }
1160                     for (i = 0; i < fr->n_tpi; i++)
1161                     {
1162                         if (q == 0)
1163                         {
1164                             tpi = atoms_tpi->atom[i].type;
1165                         }
1166                         else
1167                         {
1168                             tpi = atoms_tpi->atom[i].typeB;
1169                         }
1170                         if (bBHAM)
1171                         {
1172                             /* nbfp now includes the 6.0 derivative prefactor */
1173                             csix    += nmolc*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
1174                         }
1175                         else
1176                         {
1177                             /* nbfp now includes the 6.0/12.0 derivative prefactors */
1178                             csix    += nmolc*C6 (nbfp, ntp, tpi, tpj)/6.0;
1179                             ctwelve += nmolc*C12(nbfp, ntp, tpi, tpj)/12.0;
1180                         }
1181                         npair += nmolc;
1182                     }
1183                 }
1184             }
1185         }
1186         if (npair - nexcl <= 0 && fplog)
1187         {
1188             fprintf(fplog, "\nWARNING: There are no atom pairs for dispersion correction\n\n");
1189             csix     = 0;
1190             ctwelve  = 0;
1191         }
1192         else
1193         {
1194             csix    /= npair - nexcl;
1195             ctwelve /= npair - nexcl;
1196         }
1197         if (debug)
1198         {
1199             fprintf(debug, "Counted %d exclusions\n", nexcl);
1200             fprintf(debug, "Average C6 parameter is: %10g\n", (double)csix);
1201             fprintf(debug, "Average C12 parameter is: %10g\n", (double)ctwelve);
1202         }
1203         fr->avcsix[q]    = csix;
1204         fr->avctwelve[q] = ctwelve;
1205     }
1206
1207     if (EVDW_PME(fr->vdwtype))
1208     {
1209         sfree(nbfp_comb);
1210     }
1211
1212     if (fplog != NULL)
1213     {
1214         if (fr->eDispCorr == edispcAllEner ||
1215             fr->eDispCorr == edispcAllEnerPres)
1216         {
1217             fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
1218                     fr->avcsix[0], fr->avctwelve[0]);
1219         }
1220         else
1221         {
1222             fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e\n", fr->avcsix[0]);
1223         }
1224     }
1225 }
1226
1227
1228 static void set_bham_b_max(FILE *fplog, t_forcerec *fr,
1229                            const gmx_mtop_t *mtop)
1230 {
1231     const t_atoms *at1, *at2;
1232     int            mt1, mt2, i, j, tpi, tpj, ntypes;
1233     real           b, bmin;
1234     real          *nbfp;
1235
1236     if (fplog)
1237     {
1238         fprintf(fplog, "Determining largest Buckingham b parameter for table\n");
1239     }
1240     nbfp   = fr->nbfp;
1241     ntypes = fr->ntype;
1242
1243     bmin           = -1;
1244     fr->bham_b_max = 0;
1245     for (mt1 = 0; mt1 < mtop->nmoltype; mt1++)
1246     {
1247         at1 = &mtop->moltype[mt1].atoms;
1248         for (i = 0; (i < at1->nr); i++)
1249         {
1250             tpi = at1->atom[i].type;
1251             if (tpi >= ntypes)
1252             {
1253                 gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", i, tpi, ntypes);
1254             }
1255
1256             for (mt2 = mt1; mt2 < mtop->nmoltype; mt2++)
1257             {
1258                 at2 = &mtop->moltype[mt2].atoms;
1259                 for (j = 0; (j < at2->nr); j++)
1260                 {
1261                     tpj = at2->atom[j].type;
1262                     if (tpj >= ntypes)
1263                     {
1264                         gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", j, tpj, ntypes);
1265                     }
1266                     b = BHAMB(nbfp, ntypes, tpi, tpj);
1267                     if (b > fr->bham_b_max)
1268                     {
1269                         fr->bham_b_max = b;
1270                     }
1271                     if ((b < bmin) || (bmin == -1))
1272                     {
1273                         bmin = b;
1274                     }
1275                 }
1276             }
1277         }
1278     }
1279     if (fplog)
1280     {
1281         fprintf(fplog, "Buckingham b parameters, min: %g, max: %g\n",
1282                 bmin, fr->bham_b_max);
1283     }
1284 }
1285
1286 static void make_nbf_tables(FILE *fp, const output_env_t oenv,
1287                             t_forcerec *fr, real rtab,
1288                             const t_commrec *cr,
1289                             const char *tabfn, char *eg1, char *eg2,
1290                             t_nblists *nbl)
1291 {
1292     char buf[STRLEN];
1293     int  i, j;
1294
1295     if (tabfn == NULL)
1296     {
1297         if (debug)
1298         {
1299             fprintf(debug, "No table file name passed, can not read table, can not do non-bonded interactions\n");
1300         }
1301         return;
1302     }
1303
1304     sprintf(buf, "%s", tabfn);
1305     if (eg1 && eg2)
1306     {
1307         /* Append the two energy group names */
1308         sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "_%s_%s.%s",
1309                 eg1, eg2, ftp2ext(efXVG));
1310     }
1311     nbl->table_elec_vdw = make_tables(fp, oenv, fr, MASTER(cr), buf, rtab, 0);
1312     /* Copy the contents of the table to separate coulomb and LJ tables too,
1313      * to improve cache performance.
1314      */
1315     /* For performance reasons we want
1316      * the table data to be aligned to 16-byte. The pointers could be freed
1317      * but currently aren't.
1318      */
1319     nbl->table_elec.interaction   = GMX_TABLE_INTERACTION_ELEC;
1320     nbl->table_elec.format        = nbl->table_elec_vdw.format;
1321     nbl->table_elec.r             = nbl->table_elec_vdw.r;
1322     nbl->table_elec.n             = nbl->table_elec_vdw.n;
1323     nbl->table_elec.scale         = nbl->table_elec_vdw.scale;
1324     nbl->table_elec.scale_exp     = nbl->table_elec_vdw.scale_exp;
1325     nbl->table_elec.formatsize    = nbl->table_elec_vdw.formatsize;
1326     nbl->table_elec.ninteractions = 1;
1327     nbl->table_elec.stride        = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
1328     snew_aligned(nbl->table_elec.data, nbl->table_elec.stride*(nbl->table_elec.n+1), 32);
1329
1330     nbl->table_vdw.interaction   = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
1331     nbl->table_vdw.format        = nbl->table_elec_vdw.format;
1332     nbl->table_vdw.r             = nbl->table_elec_vdw.r;
1333     nbl->table_vdw.n             = nbl->table_elec_vdw.n;
1334     nbl->table_vdw.scale         = nbl->table_elec_vdw.scale;
1335     nbl->table_vdw.scale_exp     = nbl->table_elec_vdw.scale_exp;
1336     nbl->table_vdw.formatsize    = nbl->table_elec_vdw.formatsize;
1337     nbl->table_vdw.ninteractions = 2;
1338     nbl->table_vdw.stride        = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
1339     snew_aligned(nbl->table_vdw.data, nbl->table_vdw.stride*(nbl->table_vdw.n+1), 32);
1340
1341     for (i = 0; i <= nbl->table_elec_vdw.n; i++)
1342     {
1343         for (j = 0; j < 4; j++)
1344         {
1345             nbl->table_elec.data[4*i+j] = nbl->table_elec_vdw.data[12*i+j];
1346         }
1347         for (j = 0; j < 8; j++)
1348         {
1349             nbl->table_vdw.data[8*i+j] = nbl->table_elec_vdw.data[12*i+4+j];
1350         }
1351     }
1352 }
1353
1354 static void count_tables(int ftype1, int ftype2, const gmx_mtop_t *mtop,
1355                          int *ncount, int **count)
1356 {
1357     const gmx_moltype_t *molt;
1358     const t_ilist       *il;
1359     int                  mt, ftype, stride, i, j, tabnr;
1360
1361     for (mt = 0; mt < mtop->nmoltype; mt++)
1362     {
1363         molt = &mtop->moltype[mt];
1364         for (ftype = 0; ftype < F_NRE; ftype++)
1365         {
1366             if (ftype == ftype1 || ftype == ftype2)
1367             {
1368                 il     = &molt->ilist[ftype];
1369                 stride = 1 + NRAL(ftype);
1370                 for (i = 0; i < il->nr; i += stride)
1371                 {
1372                     tabnr = mtop->ffparams.iparams[il->iatoms[i]].tab.table;
1373                     if (tabnr < 0)
1374                     {
1375                         gmx_fatal(FARGS, "A bonded table number is smaller than 0: %d\n", tabnr);
1376                     }
1377                     if (tabnr >= *ncount)
1378                     {
1379                         srenew(*count, tabnr+1);
1380                         for (j = *ncount; j < tabnr+1; j++)
1381                         {
1382                             (*count)[j] = 0;
1383                         }
1384                         *ncount = tabnr+1;
1385                     }
1386                     (*count)[tabnr]++;
1387                 }
1388             }
1389         }
1390     }
1391 }
1392
1393 static bondedtable_t *make_bonded_tables(FILE *fplog,
1394                                          int ftype1, int ftype2,
1395                                          const gmx_mtop_t *mtop,
1396                                          const char *basefn, const char *tabext)
1397 {
1398     int            i, ncount, *count;
1399     char           tabfn[STRLEN];
1400     bondedtable_t *tab;
1401
1402     tab = NULL;
1403
1404     ncount = 0;
1405     count  = NULL;
1406     count_tables(ftype1, ftype2, mtop, &ncount, &count);
1407
1408     if (ncount > 0)
1409     {
1410         snew(tab, ncount);
1411         for (i = 0; i < ncount; i++)
1412         {
1413             if (count[i] > 0)
1414             {
1415                 sprintf(tabfn, "%s", basefn);
1416                 sprintf(tabfn + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1, "_%s%d.%s",
1417                         tabext, i, ftp2ext(efXVG));
1418                 tab[i] = make_bonded_table(fplog, tabfn, NRAL(ftype1)-2);
1419             }
1420         }
1421         sfree(count);
1422     }
1423
1424     return tab;
1425 }
1426
1427 void forcerec_set_ranges(t_forcerec *fr,
1428                          int ncg_home, int ncg_force,
1429                          int natoms_force,
1430                          int natoms_force_constr, int natoms_f_novirsum)
1431 {
1432     fr->cg0 = 0;
1433     fr->hcg = ncg_home;
1434
1435     /* fr->ncg_force is unused in the standard code,
1436      * but it can be useful for modified code dealing with charge groups.
1437      */
1438     fr->ncg_force           = ncg_force;
1439     fr->natoms_force        = natoms_force;
1440     fr->natoms_force_constr = natoms_force_constr;
1441
1442     if (fr->natoms_force_constr > fr->nalloc_force)
1443     {
1444         fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
1445
1446         if (fr->bTwinRange)
1447         {
1448             srenew(fr->f_twin, fr->nalloc_force);
1449         }
1450     }
1451
1452     if (fr->bF_NoVirSum)
1453     {
1454         fr->f_novirsum_n = natoms_f_novirsum;
1455         if (fr->f_novirsum_n > fr->f_novirsum_nalloc)
1456         {
1457             fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
1458             srenew(fr->f_novirsum_alloc, fr->f_novirsum_nalloc);
1459         }
1460     }
1461     else
1462     {
1463         fr->f_novirsum_n = 0;
1464     }
1465 }
1466
1467 static real cutoff_inf(real cutoff)
1468 {
1469     if (cutoff == 0)
1470     {
1471         cutoff = GMX_CUTOFF_INF;
1472     }
1473
1474     return cutoff;
1475 }
1476
1477 static void make_adress_tf_tables(FILE *fp, const output_env_t oenv,
1478                                   t_forcerec *fr, const t_inputrec *ir,
1479                                   const char *tabfn, const gmx_mtop_t *mtop,
1480                                   matrix     box)
1481 {
1482     char buf[STRLEN];
1483     int  i, j;
1484
1485     if (tabfn == NULL)
1486     {
1487         gmx_fatal(FARGS, "No thermoforce table file given. Use -tabletf to specify a file\n");
1488         return;
1489     }
1490
1491     snew(fr->atf_tabs, ir->adress->n_tf_grps);
1492
1493     sprintf(buf, "%s", tabfn);
1494     for (i = 0; i < ir->adress->n_tf_grps; i++)
1495     {
1496         j = ir->adress->tf_table_index[i]; /* get energy group index */
1497         sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "tf_%s.%s",
1498                 *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[j]]), ftp2ext(efXVG));
1499         if (fp)
1500         {
1501             fprintf(fp, "loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[i], buf);
1502         }
1503         fr->atf_tabs[i] = make_atf_table(fp, oenv, fr, buf, box);
1504     }
1505
1506 }
1507
1508 gmx_bool can_use_allvsall(const t_inputrec *ir, gmx_bool bPrintNote, t_commrec *cr, FILE *fp)
1509 {
1510     gmx_bool bAllvsAll;
1511
1512     bAllvsAll =
1513         (
1514             ir->rlist == 0            &&
1515             ir->rcoulomb == 0         &&
1516             ir->rvdw == 0             &&
1517             ir->ePBC == epbcNONE      &&
1518             ir->vdwtype == evdwCUT    &&
1519             ir->coulombtype == eelCUT &&
1520             ir->efep == efepNO        &&
1521             (ir->implicit_solvent == eisNO ||
1522              (ir->implicit_solvent == eisGBSA && (ir->gb_algorithm == egbSTILL ||
1523                                                   ir->gb_algorithm == egbHCT   ||
1524                                                   ir->gb_algorithm == egbOBC))) &&
1525             getenv("GMX_NO_ALLVSALL") == NULL
1526         );
1527
1528     if (bAllvsAll && ir->opts.ngener > 1)
1529     {
1530         const char *note = "NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
1531
1532         if (bPrintNote)
1533         {
1534             if (MASTER(cr))
1535             {
1536                 fprintf(stderr, "\n%s\n", note);
1537             }
1538             if (fp != NULL)
1539             {
1540                 fprintf(fp, "\n%s\n", note);
1541             }
1542         }
1543         bAllvsAll = FALSE;
1544     }
1545
1546     if (bAllvsAll && fp && MASTER(cr))
1547     {
1548         fprintf(fp, "\nUsing SIMD all-vs-all kernels.\n\n");
1549     }
1550
1551     return bAllvsAll;
1552 }
1553
1554
1555 static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp)
1556 {
1557     int t, i;
1558
1559     /* These thread local data structures are used for bondeds only */
1560     fr->nthreads = gmx_omp_nthreads_get(emntBonded);
1561
1562     if (fr->nthreads > 1)
1563     {
1564         snew(fr->f_t, fr->nthreads);
1565         /* Thread 0 uses the global force and energy arrays */
1566         for (t = 1; t < fr->nthreads; t++)
1567         {
1568             fr->f_t[t].f        = NULL;
1569             fr->f_t[t].f_nalloc = 0;
1570             snew(fr->f_t[t].fshift, SHIFTS);
1571             fr->f_t[t].grpp.nener = nenergrp*nenergrp;
1572             for (i = 0; i < egNR; i++)
1573             {
1574                 snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener);
1575             }
1576         }
1577     }
1578 }
1579
1580
1581 gmx_bool nbnxn_acceleration_supported(FILE             *fplog,
1582                                       const t_commrec  *cr,
1583                                       const t_inputrec *ir,
1584                                       gmx_bool          bGPU)
1585 {
1586     if (!bGPU && (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB))
1587     {
1588         md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with %s, falling back to %s\n",
1589                       bGPU ? "GPUs" : "SIMD kernels",
1590                       bGPU ? "CPU only" : "plain-C kernels");
1591         return FALSE;
1592     }
1593
1594     return TRUE;
1595 }
1596
1597
1598 static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
1599                                   int                         *kernel_type,
1600                                   int                         *ewald_excl)
1601 {
1602     *kernel_type = nbnxnk4x4_PlainC;
1603     *ewald_excl  = ewaldexclTable;
1604
1605 #ifdef GMX_NBNXN_SIMD
1606     {
1607 #ifdef GMX_NBNXN_SIMD_4XN
1608         *kernel_type = nbnxnk4xN_SIMD_4xN;
1609 #endif
1610 #ifdef GMX_NBNXN_SIMD_2XNN
1611         *kernel_type = nbnxnk4xN_SIMD_2xNN;
1612 #endif
1613
1614 #if defined GMX_NBNXN_SIMD_2XNN && defined GMX_NBNXN_SIMD_4XN
1615         /* We need to choose if we want 2x(N+N) or 4xN kernels.
1616          * Currently this is based on the SIMD acceleration choice,
1617          * but it might be better to decide this at runtime based on CPU.
1618          *
1619          * 4xN calculates more (zero) interactions, but has less pair-search
1620          * work and much better kernel instruction scheduling.
1621          *
1622          * Up till now we have only seen that on Intel Sandy/Ivy Bridge,
1623          * which doesn't have FMA, both the analytical and tabulated Ewald
1624          * kernels have similar pair rates for 4x8 and 2x(4+4), so we choose
1625          * 2x(4+4) because it results in significantly fewer pairs.
1626          * For RF, the raw pair rate of the 4x8 kernel is higher than 2x(4+4),
1627          * 10% with HT, 50% without HT. As we currently don't detect the actual
1628          * use of HT, use 4x8 to avoid a potential performance hit.
1629          * On Intel Haswell 4x8 is always faster.
1630          */
1631         *kernel_type = nbnxnk4xN_SIMD_4xN;
1632
1633 #ifndef GMX_SIMD_HAVE_FMA
1634         if (EEL_PME_EWALD(ir->coulombtype) ||
1635             EVDW_PME(ir->vdwtype))
1636         {
1637             /* We have Ewald kernels without FMA (Intel Sandy/Ivy Bridge).
1638              * There are enough instructions to make 2x(4+4) efficient.
1639              */
1640             *kernel_type = nbnxnk4xN_SIMD_2xNN;
1641         }
1642 #endif
1643 #endif  /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */
1644
1645
1646         if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
1647         {
1648 #ifdef GMX_NBNXN_SIMD_4XN
1649             *kernel_type = nbnxnk4xN_SIMD_4xN;
1650 #else
1651             gmx_fatal(FARGS, "SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
1652 #endif
1653         }
1654         if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
1655         {
1656 #ifdef GMX_NBNXN_SIMD_2XNN
1657             *kernel_type = nbnxnk4xN_SIMD_2xNN;
1658 #else
1659             gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
1660 #endif
1661         }
1662
1663         /* Analytical Ewald exclusion correction is only an option in
1664          * the SIMD kernel.
1665          * Since table lookup's don't parallelize with SIMD, analytical
1666          * will probably always be faster for a SIMD width of 8 or more.
1667          * With FMA analytical is sometimes faster for a width if 4 as well.
1668          * On BlueGene/Q, this is faster regardless of precision.
1669          * In single precision, this is faster on Bulldozer.
1670          */
1671 #if GMX_SIMD_REAL_WIDTH >= 8 || \
1672         (GMX_SIMD_REAL_WIDTH >= 4 && defined GMX_SIMD_HAVE_FMA && !defined GMX_DOUBLE) || \
1673         defined GMX_SIMD_IBM_QPX
1674         *ewald_excl = ewaldexclAnalytical;
1675 #endif
1676         if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
1677         {
1678             *ewald_excl = ewaldexclTable;
1679         }
1680         if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
1681         {
1682             *ewald_excl = ewaldexclAnalytical;
1683         }
1684
1685     }
1686 #endif /* GMX_NBNXN_SIMD */
1687 }
1688
1689
1690 const char *lookup_nbnxn_kernel_name(int kernel_type)
1691 {
1692     const char *returnvalue = NULL;
1693     switch (kernel_type)
1694     {
1695         case nbnxnkNotSet:
1696             returnvalue = "not set";
1697             break;
1698         case nbnxnk4x4_PlainC:
1699             returnvalue = "plain C";
1700             break;
1701         case nbnxnk4xN_SIMD_4xN:
1702         case nbnxnk4xN_SIMD_2xNN:
1703 #ifdef GMX_NBNXN_SIMD
1704 #if defined GMX_SIMD_X86_SSE2
1705             returnvalue = "SSE2";
1706 #elif defined GMX_SIMD_X86_SSE4_1
1707             returnvalue = "SSE4.1";
1708 #elif defined GMX_SIMD_X86_AVX_128_FMA
1709             returnvalue = "AVX_128_FMA";
1710 #elif defined GMX_SIMD_X86_AVX_256
1711             returnvalue = "AVX_256";
1712 #elif defined GMX_SIMD_X86_AVX2_256
1713             returnvalue = "AVX2_256";
1714 #else
1715             returnvalue = "SIMD";
1716 #endif
1717 #else  /* GMX_NBNXN_SIMD */
1718             returnvalue = "not available";
1719 #endif /* GMX_NBNXN_SIMD */
1720             break;
1721         case nbnxnk8x8x8_CUDA: returnvalue   = "CUDA"; break;
1722         case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
1723
1724         case nbnxnkNR:
1725         default:
1726             gmx_fatal(FARGS, "Illegal kernel type selected");
1727             returnvalue = NULL;
1728             break;
1729     }
1730     return returnvalue;
1731 };
1732
1733 static void pick_nbnxn_kernel(FILE                *fp,
1734                               const t_commrec     *cr,
1735                               gmx_bool             use_simd_kernels,
1736                               gmx_bool             bUseGPU,
1737                               gmx_bool             bEmulateGPU,
1738                               const t_inputrec    *ir,
1739                               int                 *kernel_type,
1740                               int                 *ewald_excl,
1741                               gmx_bool             bDoNonbonded)
1742 {
1743     assert(kernel_type);
1744
1745     *kernel_type = nbnxnkNotSet;
1746     *ewald_excl  = ewaldexclTable;
1747
1748     if (bEmulateGPU)
1749     {
1750         *kernel_type = nbnxnk8x8x8_PlainC;
1751
1752         if (bDoNonbonded)
1753         {
1754             md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
1755         }
1756     }
1757     else if (bUseGPU)
1758     {
1759         *kernel_type = nbnxnk8x8x8_CUDA;
1760     }
1761
1762     if (*kernel_type == nbnxnkNotSet)
1763     {
1764         /* LJ PME with LB combination rule does 7 mesh operations.
1765          * This so slow that we don't compile SIMD non-bonded kernels for that.
1766          */
1767         if (use_simd_kernels &&
1768             nbnxn_acceleration_supported(fp, cr, ir, FALSE))
1769         {
1770             pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl);
1771         }
1772         else
1773         {
1774             *kernel_type = nbnxnk4x4_PlainC;
1775         }
1776     }
1777
1778     if (bDoNonbonded && fp != NULL)
1779     {
1780         fprintf(fp, "\nUsing %s %dx%d non-bonded kernels\n\n",
1781                 lookup_nbnxn_kernel_name(*kernel_type),
1782                 nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
1783                 nbnxn_kernel_to_cj_size(*kernel_type));
1784     }
1785 }
1786
1787 static void pick_nbnxn_resources(const t_commrec     *cr,
1788                                  const gmx_hw_info_t *hwinfo,
1789                                  gmx_bool             bDoNonbonded,
1790                                  gmx_bool            *bUseGPU,
1791                                  gmx_bool            *bEmulateGPU,
1792                                  const gmx_gpu_opt_t *gpu_opt)
1793 {
1794     gmx_bool bEmulateGPUEnvVarSet;
1795     char     gpu_err_str[STRLEN];
1796
1797     *bUseGPU = FALSE;
1798
1799     bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
1800
1801     /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. Because
1802      * GPUs (currently) only handle non-bonded calculations, we will
1803      * automatically switch to emulation if non-bonded calculations are
1804      * turned off via GMX_NO_NONBONDED - this is the simple and elegant
1805      * way to turn off GPU initialization, data movement, and cleanup.
1806      *
1807      * GPU emulation can be useful to assess the performance one can expect by
1808      * adding GPU(s) to the machine. The conditional below allows this even
1809      * if mdrun is compiled without GPU acceleration support.
1810      * Note that you should freezing the system as otherwise it will explode.
1811      */
1812     *bEmulateGPU = (bEmulateGPUEnvVarSet ||
1813                     (!bDoNonbonded &&
1814                      gpu_opt->ncuda_dev_use > 0));
1815
1816     /* Enable GPU mode when GPUs are available or no GPU emulation is requested.
1817      */
1818     if (gpu_opt->ncuda_dev_use > 0 && !(*bEmulateGPU))
1819     {
1820         /* Each PP node will use the intra-node id-th device from the
1821          * list of detected/selected GPUs. */
1822         if (!init_gpu(cr->rank_pp_intranode, gpu_err_str,
1823                       &hwinfo->gpu_info, gpu_opt))
1824         {
1825             /* At this point the init should never fail as we made sure that
1826              * we have all the GPUs we need. If it still does, we'll bail. */
1827             gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
1828                       cr->nodeid,
1829                       get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
1830                                         cr->rank_pp_intranode),
1831                       gpu_err_str);
1832         }
1833
1834         /* Here we actually turn on hardware GPU acceleration */
1835         *bUseGPU = TRUE;
1836     }
1837 }
1838
1839 gmx_bool uses_simple_tables(int                 cutoff_scheme,
1840                             nonbonded_verlet_t *nbv,
1841                             int                 group)
1842 {
1843     gmx_bool bUsesSimpleTables = TRUE;
1844     int      grp_index;
1845
1846     switch (cutoff_scheme)
1847     {
1848         case ecutsGROUP:
1849             bUsesSimpleTables = TRUE;
1850             break;
1851         case ecutsVERLET:
1852             assert(NULL != nbv && NULL != nbv->grp);
1853             grp_index         = (group < 0) ? 0 : (nbv->ngrp - 1);
1854             bUsesSimpleTables = nbnxn_kernel_pairlist_simple(nbv->grp[grp_index].kernel_type);
1855             break;
1856         default:
1857             gmx_incons("unimplemented");
1858     }
1859     return bUsesSimpleTables;
1860 }
1861
1862 static void init_ewald_f_table(interaction_const_t *ic,
1863                                gmx_bool             bUsesSimpleTables,
1864                                real                 rtab)
1865 {
1866     real maxr;
1867
1868     if (bUsesSimpleTables)
1869     {
1870         /* With a spacing of 0.0005 we are at the force summation accuracy
1871          * for the SSE kernels for "normal" atomistic simulations.
1872          */
1873         ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff_q,
1874                                                    ic->rcoulomb);
1875
1876         maxr           = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb;
1877         ic->tabq_size  = (int)(maxr*ic->tabq_scale) + 2;
1878     }
1879     else
1880     {
1881         ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
1882         /* Subtract 2 iso 1 to avoid access out of range due to rounding */
1883         ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
1884     }
1885
1886     sfree_aligned(ic->tabq_coul_FDV0);
1887     sfree_aligned(ic->tabq_coul_F);
1888     sfree_aligned(ic->tabq_coul_V);
1889
1890     /* Create the original table data in FDV0 */
1891     snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
1892     snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
1893     snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
1894     table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
1895                                 ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff_q);
1896 }
1897
1898 void init_interaction_const_tables(FILE                *fp,
1899                                    interaction_const_t *ic,
1900                                    gmx_bool             bUsesSimpleTables,
1901                                    real                 rtab)
1902 {
1903     real spacing;
1904
1905     if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
1906     {
1907         init_ewald_f_table(ic, bUsesSimpleTables, rtab);
1908
1909         if (fp != NULL)
1910         {
1911             fprintf(fp, "Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
1912                     1/ic->tabq_scale, ic->tabq_size);
1913         }
1914     }
1915 }
1916
1917 static void clear_force_switch_constants(shift_consts_t *sc)
1918 {
1919     sc->c2   = 0;
1920     sc->c3   = 0;
1921     sc->cpot = 0;
1922 }
1923
1924 static void force_switch_constants(real p,
1925                                    real rsw, real rc,
1926                                    shift_consts_t *sc)
1927 {
1928     /* Here we determine the coefficient for shifting the force to zero
1929      * between distance rsw and the cut-off rc.
1930      * For a potential of r^-p, we have force p*r^-(p+1).
1931      * But to save flops we absorb p in the coefficient.
1932      * Thus we get:
1933      * force/p   = r^-(p+1) + c2*r^2 + c3*r^3
1934      * potential = r^-p + c2/3*r^3 + c3/4*r^4 + cpot
1935      */
1936     sc->c2   =  ((p + 1)*rsw - (p + 4)*rc)/(pow(rc, p + 2)*pow(rc - rsw, 2));
1937     sc->c3   = -((p + 1)*rsw - (p + 3)*rc)/(pow(rc, p + 2)*pow(rc - rsw, 3));
1938     sc->cpot = -pow(rc, -p) + p*sc->c2/3*pow(rc - rsw, 3) + p*sc->c3/4*pow(rc - rsw, 4);
1939 }
1940
1941 static void potential_switch_constants(real rsw, real rc,
1942                                        switch_consts_t *sc)
1943 {
1944     /* The switch function is 1 at rsw and 0 at rc.
1945      * The derivative and second derivate are zero at both ends.
1946      * rsw        = max(r - r_switch, 0)
1947      * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
1948      * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
1949      * force      = force*dsw - potential*sw
1950      * potential *= sw
1951      */
1952     sc->c3 = -10*pow(rc - rsw, -3);
1953     sc->c4 =  15*pow(rc - rsw, -4);
1954     sc->c5 =  -6*pow(rc - rsw, -5);
1955 }
1956
1957 static void
1958 init_interaction_const(FILE                       *fp,
1959                        const t_commrec gmx_unused *cr,
1960                        interaction_const_t       **interaction_const,
1961                        const t_forcerec           *fr,
1962                        real                        rtab)
1963 {
1964     interaction_const_t *ic;
1965     gmx_bool             bUsesSimpleTables = TRUE;
1966
1967     snew(ic, 1);
1968
1969     /* Just allocate something so we can free it */
1970     snew_aligned(ic->tabq_coul_FDV0, 16, 32);
1971     snew_aligned(ic->tabq_coul_F, 16, 32);
1972     snew_aligned(ic->tabq_coul_V, 16, 32);
1973
1974     ic->rlist           = fr->rlist;
1975     ic->rlistlong       = fr->rlistlong;
1976
1977     /* Lennard-Jones */
1978     ic->vdwtype         = fr->vdwtype;
1979     ic->vdw_modifier    = fr->vdw_modifier;
1980     ic->rvdw            = fr->rvdw;
1981     ic->rvdw_switch     = fr->rvdw_switch;
1982     ic->ewaldcoeff_lj   = fr->ewaldcoeff_lj;
1983     ic->ljpme_comb_rule = fr->ljpme_combination_rule;
1984     ic->sh_lj_ewald     = 0;
1985     clear_force_switch_constants(&ic->dispersion_shift);
1986     clear_force_switch_constants(&ic->repulsion_shift);
1987
1988     switch (ic->vdw_modifier)
1989     {
1990         case eintmodPOTSHIFT:
1991             /* Only shift the potential, don't touch the force */
1992             ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0);
1993             ic->repulsion_shift.cpot  = -pow(ic->rvdw, -12.0);
1994             if (EVDW_PME(ic->vdwtype))
1995             {
1996                 real crc2;
1997
1998                 crc2            = sqr(ic->ewaldcoeff_lj*ic->rvdw);
1999                 ic->sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);
2000             }
2001             break;
2002         case eintmodFORCESWITCH:
2003             /* Switch the force, switch and shift the potential */
2004             force_switch_constants(6.0, ic->rvdw_switch, ic->rvdw,
2005                                    &ic->dispersion_shift);
2006             force_switch_constants(12.0, ic->rvdw_switch, ic->rvdw,
2007                                    &ic->repulsion_shift);
2008             break;
2009         case eintmodPOTSWITCH:
2010             /* Switch the potential and force */
2011             potential_switch_constants(ic->rvdw_switch, ic->rvdw,
2012                                        &ic->vdw_switch);
2013             break;
2014         case eintmodNONE:
2015         case eintmodEXACTCUTOFF:
2016             /* Nothing to do here */
2017             break;
2018         default:
2019             gmx_incons("unimplemented potential modifier");
2020     }
2021
2022     ic->sh_invrc6 = -ic->dispersion_shift.cpot;
2023
2024     /* Electrostatics */
2025     ic->eeltype          = fr->eeltype;
2026     ic->coulomb_modifier = fr->coulomb_modifier;
2027     ic->rcoulomb         = fr->rcoulomb;
2028     ic->epsilon_r        = fr->epsilon_r;
2029     ic->epsfac           = fr->epsfac;
2030     ic->ewaldcoeff_q     = fr->ewaldcoeff_q;
2031
2032     if (fr->coulomb_modifier == eintmodPOTSHIFT)
2033     {
2034         ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb);
2035     }
2036     else
2037     {
2038         ic->sh_ewald = 0;
2039     }
2040
2041     /* Reaction-field */
2042     if (EEL_RF(ic->eeltype))
2043     {
2044         ic->epsilon_rf = fr->epsilon_rf;
2045         ic->k_rf       = fr->k_rf;
2046         ic->c_rf       = fr->c_rf;
2047     }
2048     else
2049     {
2050         /* For plain cut-off we might use the reaction-field kernels */
2051         ic->epsilon_rf = ic->epsilon_r;
2052         ic->k_rf       = 0;
2053         if (fr->coulomb_modifier == eintmodPOTSHIFT)
2054         {
2055             ic->c_rf   = 1/ic->rcoulomb;
2056         }
2057         else
2058         {
2059             ic->c_rf   = 0;
2060         }
2061     }
2062
2063     if (fp != NULL)
2064     {
2065         real dispersion_shift;
2066
2067         dispersion_shift = ic->dispersion_shift.cpot;
2068         if (EVDW_PME(ic->vdwtype))
2069         {
2070             dispersion_shift -= ic->sh_lj_ewald;
2071         }
2072         fprintf(fp, "Potential shift: LJ r^-12: %.3e r^-6: %.3e",
2073                 ic->repulsion_shift.cpot, dispersion_shift);
2074
2075         if (ic->eeltype == eelCUT)
2076         {
2077             fprintf(fp, ", Coulomb %.e", -ic->c_rf);
2078         }
2079         else if (EEL_PME(ic->eeltype))
2080         {
2081             fprintf(fp, ", Ewald %.3e", -ic->sh_ewald);
2082         }
2083         fprintf(fp, "\n");
2084     }
2085
2086     *interaction_const = ic;
2087
2088     if (fr->nbv != NULL && fr->nbv->bUseGPU)
2089     {
2090         nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
2091
2092         /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
2093          * also sharing texture references. To keep the code simple, we don't
2094          * treat texture references as shared resources, but this means that
2095          * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
2096          * Hence, to ensure that the non-bonded kernels don't start before all
2097          * texture binding operations are finished, we need to wait for all ranks
2098          * to arrive here before continuing.
2099          *
2100          * Note that we could omit this barrier if GPUs are not shared (or
2101          * texture objects are used), but as this is initialization code, there
2102          * is not point in complicating things.
2103          */
2104 #ifdef GMX_THREAD_MPI
2105         if (PAR(cr))
2106         {
2107             gmx_barrier(cr);
2108         }
2109 #endif  /* GMX_THREAD_MPI */
2110     }
2111
2112     bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
2113     init_interaction_const_tables(fp, ic, bUsesSimpleTables, rtab);
2114 }
2115
2116 static void init_nb_verlet(FILE                *fp,
2117                            nonbonded_verlet_t **nb_verlet,
2118                            gmx_bool             bFEP_NonBonded,
2119                            const t_inputrec    *ir,
2120                            const t_forcerec    *fr,
2121                            const t_commrec     *cr,
2122                            const char          *nbpu_opt)
2123 {
2124     nonbonded_verlet_t *nbv;
2125     int                 i;
2126     char               *env;
2127     gmx_bool            bEmulateGPU, bHybridGPURun = FALSE;
2128
2129     nbnxn_alloc_t      *nb_alloc;
2130     nbnxn_free_t       *nb_free;
2131
2132     snew(nbv, 1);
2133
2134     pick_nbnxn_resources(cr, fr->hwinfo,
2135                          fr->bNonbonded,
2136                          &nbv->bUseGPU,
2137                          &bEmulateGPU,
2138                          fr->gpu_opt);
2139
2140     nbv->nbs = NULL;
2141
2142     nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
2143     for (i = 0; i < nbv->ngrp; i++)
2144     {
2145         nbv->grp[i].nbl_lists.nnbl = 0;
2146         nbv->grp[i].nbat           = NULL;
2147         nbv->grp[i].kernel_type    = nbnxnkNotSet;
2148
2149         if (i == 0) /* local */
2150         {
2151             pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
2152                               nbv->bUseGPU, bEmulateGPU, ir,
2153                               &nbv->grp[i].kernel_type,
2154                               &nbv->grp[i].ewald_excl,
2155                               fr->bNonbonded);
2156         }
2157         else /* non-local */
2158         {
2159             if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
2160             {
2161                 /* Use GPU for local, select a CPU kernel for non-local */
2162                 pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
2163                                   FALSE, FALSE, ir,
2164                                   &nbv->grp[i].kernel_type,
2165                                   &nbv->grp[i].ewald_excl,
2166                                   fr->bNonbonded);
2167
2168                 bHybridGPURun = TRUE;
2169             }
2170             else
2171             {
2172                 /* Use the same kernel for local and non-local interactions */
2173                 nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
2174                 nbv->grp[i].ewald_excl  = nbv->grp[0].ewald_excl;
2175             }
2176         }
2177     }
2178
2179     if (nbv->bUseGPU)
2180     {
2181         /* init the NxN GPU data; the last argument tells whether we'll have
2182          * both local and non-local NB calculation on GPU */
2183         nbnxn_cuda_init(fp, &nbv->cu_nbv,
2184                         &fr->hwinfo->gpu_info, fr->gpu_opt,
2185                         cr->rank_pp_intranode,
2186                         (nbv->ngrp > 1) && !bHybridGPURun);
2187
2188         if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
2189         {
2190             char *end;
2191
2192             nbv->min_ci_balanced = strtol(env, &end, 10);
2193             if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
2194             {
2195                 gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
2196             }
2197
2198             if (debug)
2199             {
2200                 fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
2201                         nbv->min_ci_balanced);
2202             }
2203         }
2204         else
2205         {
2206             nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
2207             if (debug)
2208             {
2209                 fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
2210                         nbv->min_ci_balanced);
2211             }
2212         }
2213     }
2214     else
2215     {
2216         nbv->min_ci_balanced = 0;
2217     }
2218
2219     *nb_verlet = nbv;
2220
2221     nbnxn_init_search(&nbv->nbs,
2222                       DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
2223                       DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
2224                       bFEP_NonBonded,
2225                       gmx_omp_nthreads_get(emntNonbonded));
2226
2227     for (i = 0; i < nbv->ngrp; i++)
2228     {
2229         if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
2230         {
2231             nb_alloc = &pmalloc;
2232             nb_free  = &pfree;
2233         }
2234         else
2235         {
2236             nb_alloc = NULL;
2237             nb_free  = NULL;
2238         }
2239
2240         nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
2241                                 nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
2242                                 /* 8x8x8 "non-simple" lists are ATM always combined */
2243                                 !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
2244                                 nb_alloc, nb_free);
2245
2246         if (i == 0 ||
2247             nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
2248         {
2249             gmx_bool bSimpleList;
2250             int      enbnxninitcombrule;
2251
2252             bSimpleList = nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type);
2253
2254             if (bSimpleList && (fr->vdwtype == evdwCUT && (fr->vdw_modifier == eintmodNONE || fr->vdw_modifier == eintmodPOTSHIFT)))
2255             {
2256                 /* Plain LJ cut-off: we can optimize with combination rules */
2257                 enbnxninitcombrule = enbnxninitcombruleDETECT;
2258             }
2259             else if (fr->vdwtype == evdwPME)
2260             {
2261                 /* LJ-PME: we need to use a combination rule for the grid */
2262                 if (fr->ljpme_combination_rule == eljpmeGEOM)
2263                 {
2264                     enbnxninitcombrule = enbnxninitcombruleGEOM;
2265                 }
2266                 else
2267                 {
2268                     enbnxninitcombrule = enbnxninitcombruleLB;
2269                 }
2270             }
2271             else
2272             {
2273                 /* We use a full combination matrix: no rule required */
2274                 enbnxninitcombrule = enbnxninitcombruleNONE;
2275             }
2276
2277
2278             snew(nbv->grp[i].nbat, 1);
2279             nbnxn_atomdata_init(fp,
2280                                 nbv->grp[i].nbat,
2281                                 nbv->grp[i].kernel_type,
2282                                 enbnxninitcombrule,
2283                                 fr->ntype, fr->nbfp,
2284                                 ir->opts.ngener,
2285                                 bSimpleList ? gmx_omp_nthreads_get(emntNonbonded) : 1,
2286                                 nb_alloc, nb_free);
2287         }
2288         else
2289         {
2290             nbv->grp[i].nbat = nbv->grp[0].nbat;
2291         }
2292     }
2293 }
2294
2295 void init_forcerec(FILE              *fp,
2296                    const output_env_t oenv,
2297                    t_forcerec        *fr,
2298                    t_fcdata          *fcd,
2299                    const t_inputrec  *ir,
2300                    const gmx_mtop_t  *mtop,
2301                    const t_commrec   *cr,
2302                    matrix             box,
2303                    const char        *tabfn,
2304                    const char        *tabafn,
2305                    const char        *tabpfn,
2306                    const char        *tabbfn,
2307                    const char        *nbpu_opt,
2308                    gmx_bool           bNoSolvOpt,
2309                    real               print_force)
2310 {
2311     int            i, j, m, natoms, ngrp, negp_pp, negptable, egi, egj;
2312     real           rtab;
2313     char          *env;
2314     double         dbl;
2315     const t_block *cgs;
2316     gmx_bool       bGenericKernelOnly;
2317     gmx_bool       bMakeTables, bMakeSeparate14Table, bSomeNormalNbListsAreInUse;
2318     gmx_bool       bFEP_NonBonded;
2319     t_nblists     *nbl;
2320     int           *nm_ind, egp_flags;
2321
2322     if (fr->hwinfo == NULL)
2323     {
2324         /* Detect hardware, gather information.
2325          * In mdrun, hwinfo has already been set before calling init_forcerec.
2326          * Here we ignore GPUs, as tools will not use them anyhow.
2327          */
2328         fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE);
2329     }
2330
2331     /* By default we turn SIMD kernels on, but it might be turned off further down... */
2332     fr->use_simd_kernels = TRUE;
2333
2334     fr->bDomDec = DOMAINDECOMP(cr);
2335
2336     natoms = mtop->natoms;
2337
2338     if (check_box(ir->ePBC, box))
2339     {
2340         gmx_fatal(FARGS, check_box(ir->ePBC, box));
2341     }
2342
2343     /* Test particle insertion ? */
2344     if (EI_TPI(ir->eI))
2345     {
2346         /* Set to the size of the molecule to be inserted (the last one) */
2347         /* Because of old style topologies, we have to use the last cg
2348          * instead of the last molecule type.
2349          */
2350         cgs       = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
2351         fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
2352         if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1])
2353         {
2354             gmx_fatal(FARGS, "The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
2355         }
2356     }
2357     else
2358     {
2359         fr->n_tpi = 0;
2360     }
2361
2362     /* Copy AdResS parameters */
2363     if (ir->bAdress)
2364     {
2365         fr->adress_type           = ir->adress->type;
2366         fr->adress_const_wf       = ir->adress->const_wf;
2367         fr->adress_ex_width       = ir->adress->ex_width;
2368         fr->adress_hy_width       = ir->adress->hy_width;
2369         fr->adress_icor           = ir->adress->icor;
2370         fr->adress_site           = ir->adress->site;
2371         fr->adress_ex_forcecap    = ir->adress->ex_forcecap;
2372         fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
2373
2374
2375         snew(fr->adress_group_explicit, ir->adress->n_energy_grps);
2376         for (i = 0; i < ir->adress->n_energy_grps; i++)
2377         {
2378             fr->adress_group_explicit[i] = ir->adress->group_explicit[i];
2379         }
2380
2381         fr->n_adress_tf_grps = ir->adress->n_tf_grps;
2382         snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
2383         for (i = 0; i < fr->n_adress_tf_grps; i++)
2384         {
2385             fr->adress_tf_table_index[i] = ir->adress->tf_table_index[i];
2386         }
2387         copy_rvec(ir->adress->refs, fr->adress_refs);
2388     }
2389     else
2390     {
2391         fr->adress_type           = eAdressOff;
2392         fr->adress_do_hybridpairs = FALSE;
2393     }
2394
2395     /* Copy the user determined parameters */
2396     fr->userint1  = ir->userint1;
2397     fr->userint2  = ir->userint2;
2398     fr->userint3  = ir->userint3;
2399     fr->userint4  = ir->userint4;
2400     fr->userreal1 = ir->userreal1;
2401     fr->userreal2 = ir->userreal2;
2402     fr->userreal3 = ir->userreal3;
2403     fr->userreal4 = ir->userreal4;
2404
2405     /* Shell stuff */
2406     fr->fc_stepsize = ir->fc_stepsize;
2407
2408     /* Free energy */
2409     fr->efep        = ir->efep;
2410     fr->sc_alphavdw = ir->fepvals->sc_alpha;
2411     if (ir->fepvals->bScCoul)
2412     {
2413         fr->sc_alphacoul  = ir->fepvals->sc_alpha;
2414         fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min, 6);
2415     }
2416     else
2417     {
2418         fr->sc_alphacoul  = 0;
2419         fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
2420     }
2421     fr->sc_power      = ir->fepvals->sc_power;
2422     fr->sc_r_power    = ir->fepvals->sc_r_power;
2423     fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma, 6);
2424
2425     env = getenv("GMX_SCSIGMA_MIN");
2426     if (env != NULL)
2427     {
2428         dbl = 0;
2429         sscanf(env, "%lf", &dbl);
2430         fr->sc_sigma6_min = pow(dbl, 6);
2431         if (fp)
2432         {
2433             fprintf(fp, "Setting the minimum soft core sigma to %g nm\n", dbl);
2434         }
2435     }
2436
2437     fr->bNonbonded = TRUE;
2438     if (getenv("GMX_NO_NONBONDED") != NULL)
2439     {
2440         /* turn off non-bonded calculations */
2441         fr->bNonbonded = FALSE;
2442         md_print_warn(cr, fp,
2443                       "Found environment variable GMX_NO_NONBONDED.\n"
2444                       "Disabling nonbonded calculations.\n");
2445     }
2446
2447     bGenericKernelOnly = FALSE;
2448
2449     /* We now check in the NS code whether a particular combination of interactions
2450      * can be used with water optimization, and disable it if that is not the case.
2451      */
2452
2453     if (getenv("GMX_NB_GENERIC") != NULL)
2454     {
2455         if (fp != NULL)
2456         {
2457             fprintf(fp,
2458                     "Found environment variable GMX_NB_GENERIC.\n"
2459                     "Disabling all interaction-specific nonbonded kernels, will only\n"
2460                     "use the slow generic ones in src/gmxlib/nonbonded/nb_generic.c\n\n");
2461         }
2462         bGenericKernelOnly = TRUE;
2463     }
2464
2465     if (bGenericKernelOnly == TRUE)
2466     {
2467         bNoSolvOpt         = TRUE;
2468     }
2469
2470     if ( (getenv("GMX_DISABLE_SIMD_KERNELS") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
2471     {
2472         fr->use_simd_kernels = FALSE;
2473         if (fp != NULL)
2474         {
2475             fprintf(fp,
2476                     "\nFound environment variable GMX_DISABLE_SIMD_KERNELS.\n"
2477                     "Disabling the usage of any SIMD-specific kernel routines (e.g. SSE2/SSE4.1/AVX).\n\n");
2478         }
2479     }
2480
2481     fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
2482
2483     /* Check if we can/should do all-vs-all kernels */
2484     fr->bAllvsAll       = can_use_allvsall(ir, FALSE, NULL, NULL);
2485     fr->AllvsAll_work   = NULL;
2486     fr->AllvsAll_workgb = NULL;
2487
2488     /* All-vs-all kernels have not been implemented in 4.6, and
2489      * the SIMD group kernels are also buggy in this case. Non-SIMD
2490      * group kernels are OK. See Redmine #1249. */
2491     if (fr->bAllvsAll)
2492     {
2493         fr->bAllvsAll            = FALSE;
2494         fr->use_simd_kernels     = FALSE;
2495         if (fp != NULL)
2496         {
2497             fprintf(fp,
2498                     "\nYour simulation settings would have triggered the efficient all-vs-all\n"
2499                     "kernels in GROMACS 4.5, but these have not been implemented in GROMACS\n"
2500                     "4.6. Also, we can't use the accelerated SIMD kernels here because\n"
2501                     "of an unfixed bug. The reference C kernels are correct, though, so\n"
2502                     "we are proceeding by disabling all CPU architecture-specific\n"
2503                     "(e.g. SSE2/SSE4/AVX) routines. If performance is important, please\n"
2504                     "use GROMACS 4.5.7 or try cutoff-scheme = Verlet.\n\n");
2505         }
2506     }
2507
2508     /* Neighbour searching stuff */
2509     fr->cutoff_scheme = ir->cutoff_scheme;
2510     fr->bGrid         = (ir->ns_type == ensGRID);
2511     fr->ePBC          = ir->ePBC;
2512
2513     if (fr->cutoff_scheme == ecutsGROUP)
2514     {
2515         const char *note = "NOTE: This file uses the deprecated 'group' cutoff_scheme. This will be\n"
2516             "removed in a future release when 'verlet' supports all interaction forms.\n";
2517
2518         if (MASTER(cr))
2519         {
2520             fprintf(stderr, "\n%s\n", note);
2521         }
2522         if (fp != NULL)
2523         {
2524             fprintf(fp, "\n%s\n", note);
2525         }
2526     }
2527
2528     /* Determine if we will do PBC for distances in bonded interactions */
2529     if (fr->ePBC == epbcNONE)
2530     {
2531         fr->bMolPBC = FALSE;
2532     }
2533     else
2534     {
2535         if (!DOMAINDECOMP(cr))
2536         {
2537             /* The group cut-off scheme and SHAKE assume charge groups
2538              * are whole, but not using molpbc is faster in most cases.
2539              */
2540             if (fr->cutoff_scheme == ecutsGROUP ||
2541                 (ir->eConstrAlg == econtSHAKE &&
2542                  (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
2543                   gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0)))
2544             {
2545                 fr->bMolPBC = ir->bPeriodicMols;
2546             }
2547             else
2548             {
2549                 fr->bMolPBC = TRUE;
2550                 if (getenv("GMX_USE_GRAPH") != NULL)
2551                 {
2552                     fr->bMolPBC = FALSE;
2553                     if (fp)
2554                     {
2555                         fprintf(fp, "\nGMX_MOLPBC is set, using the graph for bonded interactions\n\n");
2556                     }
2557                 }
2558             }
2559         }
2560         else
2561         {
2562             fr->bMolPBC = dd_bonded_molpbc(cr->dd, fr->ePBC);
2563         }
2564     }
2565     fr->bGB = (ir->implicit_solvent == eisGBSA);
2566
2567     fr->rc_scaling = ir->refcoord_scaling;
2568     copy_rvec(ir->posres_com, fr->posres_com);
2569     copy_rvec(ir->posres_comB, fr->posres_comB);
2570     fr->rlist                    = cutoff_inf(ir->rlist);
2571     fr->rlistlong                = cutoff_inf(ir->rlistlong);
2572     fr->eeltype                  = ir->coulombtype;
2573     fr->vdwtype                  = ir->vdwtype;
2574     fr->ljpme_combination_rule   = ir->ljpme_combination_rule;
2575
2576     fr->coulomb_modifier = ir->coulomb_modifier;
2577     fr->vdw_modifier     = ir->vdw_modifier;
2578
2579     /* Electrostatics: Translate from interaction-setting-in-mdp-file to kernel interaction format */
2580     switch (fr->eeltype)
2581     {
2582         case eelCUT:
2583             fr->nbkernel_elec_interaction = (fr->bGB) ? GMX_NBKERNEL_ELEC_GENERALIZEDBORN : GMX_NBKERNEL_ELEC_COULOMB;
2584             break;
2585
2586         case eelRF:
2587         case eelGRF:
2588         case eelRF_NEC:
2589             fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
2590             break;
2591
2592         case eelRF_ZERO:
2593             fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
2594             fr->coulomb_modifier          = eintmodEXACTCUTOFF;
2595             break;
2596
2597         case eelSWITCH:
2598         case eelSHIFT:
2599         case eelUSER:
2600         case eelENCADSHIFT:
2601         case eelPMESWITCH:
2602         case eelPMEUSER:
2603         case eelPMEUSERSWITCH:
2604             fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
2605             break;
2606
2607         case eelPME:
2608         case eelEWALD:
2609             fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
2610             break;
2611
2612         default:
2613             gmx_fatal(FARGS, "Unsupported electrostatic interaction: %s", eel_names[fr->eeltype]);
2614             break;
2615     }
2616
2617     /* Vdw: Translate from mdp settings to kernel format */
2618     switch (fr->vdwtype)
2619     {
2620         case evdwCUT:
2621             if (fr->bBHAM)
2622             {
2623                 fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
2624             }
2625             else
2626             {
2627                 fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
2628             }
2629             break;
2630         case evdwPME:
2631             fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LJEWALD;
2632             break;
2633
2634         case evdwSWITCH:
2635         case evdwSHIFT:
2636         case evdwUSER:
2637         case evdwENCADSHIFT:
2638             fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
2639             break;
2640
2641         default:
2642             gmx_fatal(FARGS, "Unsupported vdw interaction: %s", evdw_names[fr->vdwtype]);
2643             break;
2644     }
2645
2646     /* These start out identical to ir, but might be altered if we e.g. tabulate the interaction in the kernel */
2647     fr->nbkernel_elec_modifier    = fr->coulomb_modifier;
2648     fr->nbkernel_vdw_modifier     = fr->vdw_modifier;
2649
2650     fr->bTwinRange = fr->rlistlong > fr->rlist;
2651     fr->bEwald     = (EEL_PME(fr->eeltype) || fr->eeltype == eelEWALD);
2652
2653     fr->reppow     = mtop->ffparams.reppow;
2654
2655     if (ir->cutoff_scheme == ecutsGROUP)
2656     {
2657         fr->bvdwtab    = ((fr->vdwtype != evdwCUT || !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
2658                           && !EVDW_PME(fr->vdwtype));
2659         /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
2660         fr->bcoultab   = !(fr->eeltype == eelCUT ||
2661                            fr->eeltype == eelEWALD ||
2662                            fr->eeltype == eelPME ||
2663                            fr->eeltype == eelRF ||
2664                            fr->eeltype == eelRF_ZERO);
2665
2666         /* If the user absolutely wants different switch/shift settings for coul/vdw, it is likely
2667          * going to be faster to tabulate the interaction than calling the generic kernel.
2668          */
2669         if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH && fr->nbkernel_vdw_modifier == eintmodPOTSWITCH)
2670         {
2671             if ((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
2672             {
2673                 fr->bcoultab = TRUE;
2674             }
2675         }
2676         else if ((fr->nbkernel_elec_modifier == eintmodPOTSHIFT && fr->nbkernel_vdw_modifier == eintmodPOTSHIFT) ||
2677                  ((fr->nbkernel_elec_interaction == GMX_NBKERNEL_ELEC_REACTIONFIELD &&
2678                    fr->nbkernel_elec_modifier == eintmodEXACTCUTOFF &&
2679                    (fr->nbkernel_vdw_modifier == eintmodPOTSWITCH || fr->nbkernel_vdw_modifier == eintmodPOTSHIFT))))
2680         {
2681             if (fr->rcoulomb != fr->rvdw)
2682             {
2683                 fr->bcoultab = TRUE;
2684             }
2685         }
2686
2687         if (getenv("GMX_REQUIRE_TABLES"))
2688         {
2689             fr->bvdwtab  = TRUE;
2690             fr->bcoultab = TRUE;
2691         }
2692
2693         if (fp)
2694         {
2695             fprintf(fp, "Table routines are used for coulomb: %s\n", bool_names[fr->bcoultab]);
2696             fprintf(fp, "Table routines are used for vdw:     %s\n", bool_names[fr->bvdwtab ]);
2697         }
2698
2699         if (fr->bvdwtab == TRUE)
2700         {
2701             fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
2702             fr->nbkernel_vdw_modifier    = eintmodNONE;
2703         }
2704         if (fr->bcoultab == TRUE)
2705         {
2706             fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
2707             fr->nbkernel_elec_modifier    = eintmodNONE;
2708         }
2709     }
2710
2711     if (ir->cutoff_scheme == ecutsVERLET)
2712     {
2713         if (!gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
2714         {
2715             gmx_fatal(FARGS, "Cut-off scheme %S only supports LJ repulsion power 12", ecutscheme_names[ir->cutoff_scheme]);
2716         }
2717         fr->bvdwtab  = FALSE;
2718         fr->bcoultab = FALSE;
2719     }
2720
2721     /* Tables are used for direct ewald sum */
2722     if (fr->bEwald)
2723     {
2724         if (EEL_PME(ir->coulombtype))
2725         {
2726             if (fp)
2727             {
2728                 fprintf(fp, "Will do PME sum in reciprocal space for electrostatic interactions.\n");
2729             }
2730             if (ir->coulombtype == eelP3M_AD)
2731             {
2732                 please_cite(fp, "Hockney1988");
2733                 please_cite(fp, "Ballenegger2012");
2734             }
2735             else
2736             {
2737                 please_cite(fp, "Essmann95a");
2738             }
2739
2740             if (ir->ewald_geometry == eewg3DC)
2741             {
2742                 if (fp)
2743                 {
2744                     fprintf(fp, "Using the Ewald3DC correction for systems with a slab geometry.\n");
2745                 }
2746                 please_cite(fp, "In-Chul99a");
2747             }
2748         }
2749         fr->ewaldcoeff_q = calc_ewaldcoeff_q(ir->rcoulomb, ir->ewald_rtol);
2750         init_ewald_tab(&(fr->ewald_table), ir, fp);
2751         if (fp)
2752         {
2753             fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for Ewald\n",
2754                     1/fr->ewaldcoeff_q);
2755         }
2756     }
2757
2758     if (EVDW_PME(ir->vdwtype))
2759     {
2760         if (fp)
2761         {
2762             fprintf(fp, "Will do PME sum in reciprocal space for LJ dispersion interactions.\n");
2763         }
2764         please_cite(fp, "Essmann95a");
2765         fr->ewaldcoeff_lj = calc_ewaldcoeff_lj(ir->rvdw, ir->ewald_rtol_lj);
2766         if (fp)
2767         {
2768             fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for LJ Ewald\n",
2769                     1/fr->ewaldcoeff_lj);
2770         }
2771     }
2772
2773     /* Electrostatics */
2774     fr->epsilon_r       = ir->epsilon_r;
2775     fr->epsilon_rf      = ir->epsilon_rf;
2776     fr->fudgeQQ         = mtop->ffparams.fudgeQQ;
2777     fr->rcoulomb_switch = ir->rcoulomb_switch;
2778     fr->rcoulomb        = cutoff_inf(ir->rcoulomb);
2779
2780     /* Parameters for generalized RF */
2781     fr->zsquare = 0.0;
2782     fr->temp    = 0.0;
2783
2784     if (fr->eeltype == eelGRF)
2785     {
2786         init_generalized_rf(fp, mtop, ir, fr);
2787     }
2788
2789     fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype) ||
2790                        gmx_mtop_ftype_count(mtop, F_POSRES) > 0 ||
2791                        gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 ||
2792                        IR_ELEC_FIELD(*ir) ||
2793                        (fr->adress_icor != eAdressICOff)
2794                        );
2795
2796     if (fr->cutoff_scheme == ecutsGROUP &&
2797         ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
2798     {
2799         /* Count the total number of charge groups */
2800         fr->cg_nalloc = ncg_mtop(mtop);
2801         srenew(fr->cg_cm, fr->cg_nalloc);
2802     }
2803     if (fr->shift_vec == NULL)
2804     {
2805         snew(fr->shift_vec, SHIFTS);
2806     }
2807
2808     if (fr->fshift == NULL)
2809     {
2810         snew(fr->fshift, SHIFTS);
2811     }
2812
2813     if (fr->nbfp == NULL)
2814     {
2815         fr->ntype = mtop->ffparams.atnr;
2816         fr->nbfp  = mk_nbfp(&mtop->ffparams, fr->bBHAM);
2817         if (EVDW_PME(fr->vdwtype))
2818         {
2819             fr->ljpme_c6grid  = make_ljpme_c6grid(&mtop->ffparams, fr);
2820         }
2821     }
2822
2823     /* Copy the energy group exclusions */
2824     fr->egp_flags = ir->opts.egp_flags;
2825
2826     /* Van der Waals stuff */
2827     fr->rvdw        = cutoff_inf(ir->rvdw);
2828     fr->rvdw_switch = ir->rvdw_switch;
2829     if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM)
2830     {
2831         if (fr->rvdw_switch >= fr->rvdw)
2832         {
2833             gmx_fatal(FARGS, "rvdw_switch (%f) must be < rvdw (%f)",
2834                       fr->rvdw_switch, fr->rvdw);
2835         }
2836         if (fp)
2837         {
2838             fprintf(fp, "Using %s Lennard-Jones, switch between %g and %g nm\n",
2839                     (fr->eeltype == eelSWITCH) ? "switched" : "shifted",
2840                     fr->rvdw_switch, fr->rvdw);
2841         }
2842     }
2843
2844     if (fr->bBHAM && EVDW_PME(fr->vdwtype))
2845     {
2846         gmx_fatal(FARGS, "LJ PME not supported with Buckingham");
2847     }
2848
2849     if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
2850     {
2851         gmx_fatal(FARGS, "Switch/shift interaction not supported with Buckingham");
2852     }
2853
2854     if (fp)
2855     {
2856         fprintf(fp, "Cut-off's:   NS: %g   Coulomb: %g   %s: %g\n",
2857                 fr->rlist, fr->rcoulomb, fr->bBHAM ? "BHAM" : "LJ", fr->rvdw);
2858     }
2859
2860     fr->eDispCorr = ir->eDispCorr;
2861     if (ir->eDispCorr != edispcNO)
2862     {
2863         set_avcsixtwelve(fp, fr, mtop);
2864     }
2865
2866     if (fr->bBHAM)
2867     {
2868         set_bham_b_max(fp, fr, mtop);
2869     }
2870
2871     fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
2872
2873     /* Copy the GBSA data (radius, volume and surftens for each
2874      * atomtype) from the topology atomtype section to forcerec.
2875      */
2876     snew(fr->atype_radius, fr->ntype);
2877     snew(fr->atype_vol, fr->ntype);
2878     snew(fr->atype_surftens, fr->ntype);
2879     snew(fr->atype_gb_radius, fr->ntype);
2880     snew(fr->atype_S_hct, fr->ntype);
2881
2882     if (mtop->atomtypes.nr > 0)
2883     {
2884         for (i = 0; i < fr->ntype; i++)
2885         {
2886             fr->atype_radius[i] = mtop->atomtypes.radius[i];
2887         }
2888         for (i = 0; i < fr->ntype; i++)
2889         {
2890             fr->atype_vol[i] = mtop->atomtypes.vol[i];
2891         }
2892         for (i = 0; i < fr->ntype; i++)
2893         {
2894             fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
2895         }
2896         for (i = 0; i < fr->ntype; i++)
2897         {
2898             fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
2899         }
2900         for (i = 0; i < fr->ntype; i++)
2901         {
2902             fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
2903         }
2904     }
2905
2906     /* Generate the GB table if needed */
2907     if (fr->bGB)
2908     {
2909 #ifdef GMX_DOUBLE
2910         fr->gbtabscale = 2000;
2911 #else
2912         fr->gbtabscale = 500;
2913 #endif
2914
2915         fr->gbtabr = 100;
2916         fr->gbtab  = make_gb_table(oenv, fr);
2917
2918         init_gb(&fr->born, fr, ir, mtop, ir->gb_algorithm);
2919
2920         /* Copy local gb data (for dd, this is done in dd_partition_system) */
2921         if (!DOMAINDECOMP(cr))
2922         {
2923             make_local_gb(cr, fr->born, ir->gb_algorithm);
2924         }
2925     }
2926
2927     /* Set the charge scaling */
2928     if (fr->epsilon_r != 0)
2929     {
2930         fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
2931     }
2932     else
2933     {
2934         /* eps = 0 is infinite dieletric: no coulomb interactions */
2935         fr->epsfac = 0;
2936     }
2937
2938     /* Reaction field constants */
2939     if (EEL_RF(fr->eeltype))
2940     {
2941         calc_rffac(fp, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
2942                    fr->rcoulomb, fr->temp, fr->zsquare, box,
2943                    &fr->kappa, &fr->k_rf, &fr->c_rf);
2944     }
2945
2946     /*This now calculates sum for q and c6*/
2947     set_chargesum(fp, fr, mtop);
2948
2949     /* if we are using LR electrostatics, and they are tabulated,
2950      * the tables will contain modified coulomb interactions.
2951      * Since we want to use the non-shifted ones for 1-4
2952      * coulombic interactions, we must have an extra set of tables.
2953      */
2954
2955     /* Construct tables.
2956      * A little unnecessary to make both vdw and coul tables sometimes,
2957      * but what the heck... */
2958
2959     bMakeTables = fr->bcoultab || fr->bvdwtab || fr->bEwald ||
2960         (ir->eDispCorr != edispcNO && ir_vdw_switched(ir));
2961
2962     bMakeSeparate14Table = ((!bMakeTables || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
2963                              fr->bBHAM || fr->bEwald) &&
2964                             (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
2965                              gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
2966                              gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
2967
2968     negp_pp   = ir->opts.ngener - ir->nwall;
2969     negptable = 0;
2970     if (!bMakeTables)
2971     {
2972         bSomeNormalNbListsAreInUse = TRUE;
2973         fr->nnblists               = 1;
2974     }
2975     else
2976     {
2977         bSomeNormalNbListsAreInUse = (ir->eDispCorr != edispcNO);
2978         for (egi = 0; egi < negp_pp; egi++)
2979         {
2980             for (egj = egi; egj < negp_pp; egj++)
2981             {
2982                 egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
2983                 if (!(egp_flags & EGP_EXCL))
2984                 {
2985                     if (egp_flags & EGP_TABLE)
2986                     {
2987                         negptable++;
2988                     }
2989                     else
2990                     {
2991                         bSomeNormalNbListsAreInUse = TRUE;
2992                     }
2993                 }
2994             }
2995         }
2996         if (bSomeNormalNbListsAreInUse)
2997         {
2998             fr->nnblists = negptable + 1;
2999         }
3000         else
3001         {
3002             fr->nnblists = negptable;
3003         }
3004         if (fr->nnblists > 1)
3005         {
3006             snew(fr->gid2nblists, ir->opts.ngener*ir->opts.ngener);
3007         }
3008     }
3009
3010     if (ir->adress)
3011     {
3012         fr->nnblists *= 2;
3013     }
3014
3015     snew(fr->nblists, fr->nnblists);
3016
3017     /* This code automatically gives table length tabext without cut-off's,
3018      * in that case grompp should already have checked that we do not need
3019      * normal tables and we only generate tables for 1-4 interactions.
3020      */
3021     rtab = ir->rlistlong + ir->tabext;
3022
3023     if (bMakeTables)
3024     {
3025         /* make tables for ordinary interactions */
3026         if (bSomeNormalNbListsAreInUse)
3027         {
3028             make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
3029             if (ir->adress)
3030             {
3031                 make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[fr->nnblists/2]);
3032             }
3033             if (!bMakeSeparate14Table)
3034             {
3035                 fr->tab14 = fr->nblists[0].table_elec_vdw;
3036             }
3037             m = 1;
3038         }
3039         else
3040         {
3041             m = 0;
3042         }
3043         if (negptable > 0)
3044         {
3045             /* Read the special tables for certain energy group pairs */
3046             nm_ind = mtop->groups.grps[egcENER].nm_ind;
3047             for (egi = 0; egi < negp_pp; egi++)
3048             {
3049                 for (egj = egi; egj < negp_pp; egj++)
3050                 {
3051                     egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
3052                     if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL))
3053                     {
3054                         nbl = &(fr->nblists[m]);
3055                         if (fr->nnblists > 1)
3056                         {
3057                             fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = m;
3058                         }
3059                         /* Read the table file with the two energy groups names appended */
3060                         make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
3061                                         *mtop->groups.grpname[nm_ind[egi]],
3062                                         *mtop->groups.grpname[nm_ind[egj]],
3063                                         &fr->nblists[m]);
3064                         if (ir->adress)
3065                         {
3066                             make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
3067                                             *mtop->groups.grpname[nm_ind[egi]],
3068                                             *mtop->groups.grpname[nm_ind[egj]],
3069                                             &fr->nblists[fr->nnblists/2+m]);
3070                         }
3071                         m++;
3072                     }
3073                     else if (fr->nnblists > 1)
3074                     {
3075                         fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = 0;
3076                     }
3077                 }
3078             }
3079         }
3080     }
3081     if (bMakeSeparate14Table)
3082     {
3083         /* generate extra tables with plain Coulomb for 1-4 interactions only */
3084         fr->tab14 = make_tables(fp, oenv, fr, MASTER(cr), tabpfn, rtab,
3085                                 GMX_MAKETABLES_14ONLY);
3086     }
3087
3088     /* Read AdResS Thermo Force table if needed */
3089     if (fr->adress_icor == eAdressICThermoForce)
3090     {
3091         /* old todo replace */
3092
3093         if (ir->adress->n_tf_grps > 0)
3094         {
3095             make_adress_tf_tables(fp, oenv, fr, ir, tabfn, mtop, box);
3096
3097         }
3098         else
3099         {
3100             /* load the default table */
3101             snew(fr->atf_tabs, 1);
3102             fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp, oenv, fr, tabafn, box);
3103         }
3104     }
3105
3106     /* Wall stuff */
3107     fr->nwall = ir->nwall;
3108     if (ir->nwall && ir->wall_type == ewtTABLE)
3109     {
3110         make_wall_tables(fp, oenv, ir, tabfn, &mtop->groups, fr);
3111     }
3112
3113     if (fcd && tabbfn)
3114     {
3115         fcd->bondtab  = make_bonded_tables(fp,
3116                                            F_TABBONDS, F_TABBONDSNC,
3117                                            mtop, tabbfn, "b");
3118         fcd->angletab = make_bonded_tables(fp,
3119                                            F_TABANGLES, -1,
3120                                            mtop, tabbfn, "a");
3121         fcd->dihtab   = make_bonded_tables(fp,
3122                                            F_TABDIHS, -1,
3123                                            mtop, tabbfn, "d");
3124     }
3125     else
3126     {
3127         if (debug)
3128         {
3129             fprintf(debug, "No fcdata or table file name passed, can not read table, can not do bonded interactions\n");
3130         }
3131     }
3132
3133     /* QM/MM initialization if requested
3134      */
3135     if (ir->bQMMM)
3136     {
3137         fprintf(stderr, "QM/MM calculation requested.\n");
3138     }
3139
3140     fr->bQMMM      = ir->bQMMM;
3141     fr->qr         = mk_QMMMrec();
3142
3143     /* Set all the static charge group info */
3144     fr->cginfo_mb = init_cginfo_mb(fp, mtop, fr, bNoSolvOpt,
3145                                    &bFEP_NonBonded,
3146                                    &fr->bExcl_IntraCGAll_InterCGNone);
3147     if (DOMAINDECOMP(cr))
3148     {
3149         fr->cginfo = NULL;
3150     }
3151     else
3152     {
3153         fr->cginfo = cginfo_expand(mtop->nmolblock, fr->cginfo_mb);
3154     }
3155
3156     if (!DOMAINDECOMP(cr))
3157     {
3158         forcerec_set_ranges(fr, ncg_mtop(mtop), ncg_mtop(mtop),
3159                             mtop->natoms, mtop->natoms, mtop->natoms);
3160     }
3161
3162     fr->print_force = print_force;
3163
3164
3165     /* coarse load balancing vars */
3166     fr->t_fnbf    = 0.;
3167     fr->t_wait    = 0.;
3168     fr->timesteps = 0;
3169
3170     /* Initialize neighbor search */
3171     init_ns(fp, cr, &fr->ns, fr, mtop);
3172
3173     if (cr->duty & DUTY_PP)
3174     {
3175         gmx_nonbonded_setup(fr, bGenericKernelOnly);
3176         /*
3177            if (ir->bAdress)
3178             {
3179                 gmx_setup_adress_kernels(fp,bGenericKernelOnly);
3180             }
3181          */
3182     }
3183
3184     /* Initialize the thread working data for bonded interactions */
3185     init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr);
3186
3187     snew(fr->excl_load, fr->nthreads+1);
3188
3189     if (fr->cutoff_scheme == ecutsVERLET)
3190     {
3191         if (ir->rcoulomb != ir->rvdw)
3192         {
3193             gmx_fatal(FARGS, "With Verlet lists rcoulomb and rvdw should be identical");
3194         }
3195
3196         init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
3197     }
3198
3199     /* fr->ic is used both by verlet and group kernels (to some extent) now */
3200     init_interaction_const(fp, cr, &fr->ic, fr, rtab);
3201
3202     if (ir->eDispCorr != edispcNO)
3203     {
3204         calc_enervirdiff(fp, ir->eDispCorr, fr);
3205     }
3206 }
3207
3208 #define pr_real(fp, r) fprintf(fp, "%s: %e\n",#r, r)
3209 #define pr_int(fp, i)  fprintf((fp), "%s: %d\n",#i, i)
3210 #define pr_bool(fp, b) fprintf((fp), "%s: %s\n",#b, bool_names[b])
3211
3212 void pr_forcerec(FILE *fp, t_forcerec *fr)
3213 {
3214     int i;
3215
3216     pr_real(fp, fr->rlist);
3217     pr_real(fp, fr->rcoulomb);
3218     pr_real(fp, fr->fudgeQQ);
3219     pr_bool(fp, fr->bGrid);
3220     pr_bool(fp, fr->bTwinRange);
3221     /*pr_int(fp,fr->cg0);
3222        pr_int(fp,fr->hcg);*/
3223     for (i = 0; i < fr->nnblists; i++)
3224     {
3225         pr_int(fp, fr->nblists[i].table_elec_vdw.n);
3226     }
3227     pr_real(fp, fr->rcoulomb_switch);
3228     pr_real(fp, fr->rcoulomb);
3229
3230     fflush(fp);
3231 }
3232
3233 void forcerec_set_excl_load(t_forcerec           *fr,
3234                             const gmx_localtop_t *top)
3235 {
3236     const int *ind, *a;
3237     int        t, i, j, ntot, n, ntarget;
3238
3239     ind = top->excls.index;
3240     a   = top->excls.a;
3241
3242     ntot = 0;
3243     for (i = 0; i < top->excls.nr; i++)
3244     {
3245         for (j = ind[i]; j < ind[i+1]; j++)
3246         {
3247             if (a[j] > i)
3248             {
3249                 ntot++;
3250             }
3251         }
3252     }
3253
3254     fr->excl_load[0] = 0;
3255     n                = 0;
3256     i                = 0;
3257     for (t = 1; t <= fr->nthreads; t++)
3258     {
3259         ntarget = (ntot*t)/fr->nthreads;
3260         while (i < top->excls.nr && n < ntarget)
3261         {
3262             for (j = ind[i]; j < ind[i+1]; j++)
3263             {
3264                 if (a[j] > i)
3265                 {
3266                     n++;
3267                 }
3268             }
3269             i++;
3270         }
3271         fr->excl_load[t] = i;
3272     }
3273 }